object-introspection/tools/inline_stats.py
2022-12-19 06:37:51 -08:00

368 lines
11 KiB
Python
Executable File

#
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import atexit
import gc
import json
import re
import signal
import sqlite3
import subprocess
import sys
from collections import defaultdict
from pathlib import Path
import click
from elftools.common.py3compat import bytes2str
from elftools.elf.elffile import ELFFile
GC_COUNTER_INIT = 0
EXCLUDE_PATTERN = None
DB = None
# README:
# This script depends on the python library pyelftools: https://github.com/eliben/pyelftools
# Please make sure you have it installed before trying it out.
# For internal use, please see: https://fburl.com/code/1f6a4vjj
@atexit.register
def close_db():
"""
Ensure that the database is properly closed before exiting
"""
if DB:
DB.commit()
DB.close()
@click.command(context_settings={"help_option_names": ["-h", "--help"]})
@click.option(
"--gc",
type=int,
default=10_000,
help="After how many CU memory is manually freed. Low values reduce performance. High values can cause OOM.",
)
@click.option(
"-o",
"--output",
type=click.Path(),
default="instats.db",
help="Name of the output SQLite3 database",
)
@click.option(
"-c/ ",
"--clear-db/--no-clear-db",
default=False,
help="Delete the database content on start",
)
@click.option(
"-e",
"--exclude",
default=r".^", # This pattern matches nothing, so no exclude by default
help="Exclude CUs whose path match the given pattern",
)
@click.option(
"-E",
"--exclude-file",
default=r".^", # This pattern matches nothing, so no exclude by default
help="Exclude files whose path match the given pattern",
)
@click.option(
"-S/ ",
"--follow-shared/--no-follow-shared",
default=False,
help="Traverse the shared library linked against the INPUTS",
)
@click.argument(
"inputs", nargs=-1, type=click.Path(exists=True)
) # help="Executable file to analyse"
def cli(gc, output, clear_db, exclude, exclude_file, follow_shared, inputs):
"""
Traverses the DWARF data of the INPUTS and count how many arguments have location info.
It counts for both function calls and inlined functions.
The results are stored in a SQLite3 database for convenient data analysis.
"""
global GC_COUNTER_INIT
GC_COUNTER_INIT = gc
global EXCLUDE_PATTERN
EXCLUDE_PATTERN = re.compile(exclude)
exclude_file_pattern = re.compile(exclude_file)
ldd_pattern = re.compile(r".* => (.*) \(.*\)")
create_db(output, clear_db)
inputs = list(inputs)
processed_files = set()
for filename in inputs:
if exclude_file_pattern.search(filename) or filename in processed_files:
continue
if follow_shared:
ldd = subprocess.run(["ldd", filename], capture_output=True, text=True)
shared_libraries = ldd_pattern.findall(ldd.stdout)
inputs.extend(shared_libraries)
processed_files.add(filename)
process_file(filename)
def create_db(dbfilename, clear_db=False):
global DB
DB = sqlite3.connect(dbfilename)
DB.executescript(
"""
CREATE TABLE IF NOT EXISTS call_sites (
function_name TEXT PRIMARY KEY,
count INT DEFAULT 1
);
CREATE TABLE IF NOT EXISTS call_site_arguments (
function_name TEXT NOT NULL REFERENCES call_sites(function_name),
argument_name TEXT NOT NULL,
count INT DEFAULT 1,
loc_count INT DEFAULT 0,
PRIMARY KEY (function_name, argument_name)
);
CREATE TABLE IF NOT EXISTS inlined_subprograms (
function_name TEXT PRIMARY KEY,
count INT DEFAULT 1
);
CREATE TABLE IF NOT EXISTS inlined_subprogram_arguments (
function_name TEXT NOT NULL REFERENCES inlined_subprograms(function_name),
argument_name TEXT NOT NULL,
count INT DEFAULT 1,
loc_count INT DEFAULT 0,
PRIMARY KEY (function_name, argument_name)
);
"""
)
if clear_db:
DB.executescript(
"""
DELETE FROM inlined_subprogram_arguments;
DELETE FROM inlined_subprograms;
DELETE FROM call_site_arguments;
DELETE FROM call_sites;
"""
)
return DB
def eprint(*args, **kwargs):
print(*args, file=sys.stderr, **kwargs)
def process_file(filename):
eprint("Processing file: ", filename)
with open(filename, "rb") as f:
elffile = ELFFile(f)
if not elffile.has_dwarf_info():
eprint(" file has no DWARF info")
return
dwarfinfo = elffile.get_dwarf_info()
gc_counter = GC_COUNTER_INIT
for CU in dwarfinfo._parse_CUs_iter():
top_DIE = CU.get_top_DIE()
top_DIE_path = Path(top_DIE.get_full_path()).as_posix()
if EXCLUDE_PATTERN.search(top_DIE_path):
eprint("Skipping", top_DIE_path, top_DIE.tag, top_DIE.offset)
continue
else:
eprint(top_DIE_path, top_DIE.tag, top_DIE.offset, CU.cu_offset)
process_die_rec(CU.get_top_DIE())
# Clear cache when counter reaches 0, in an attempt to limit memory usage
gc_counter -= 1
if gc_counter <= 0:
gc_counter = GC_COUNTER_INIT
sys.stderr.flush()
DB.commit()
dwarfinfo._cu_offsets_map.clear()
dwarfinfo._cu_cache.clear()
gc.collect()
def process_die_rec(die):
# Start by listing all subroutines we find
if die.tag == "DW_TAG_subprogram":
process_subprogram(die)
else:
for child in die.iter_children():
process_die_rec(child)
def process_subprogram(die):
# We can have subprograms that contains only a typedef
# This might occur when the typedef is within a subprogram
# What's weird is I haven't seen a reference to the original subprogram
if not "DW_AT_name" in die.attributes:
return
# eprint("Subprogram: ", get_name(die))
for child in die.iter_children():
process_subprogram_rec(child)
def process_subprogram_rec(die):
# Look for DW_TAG_inlined_subroutine and DW_TAG_GNU_call_site
if die.tag == "DW_TAG_inlined_subroutine":
process_inlined_subroutine(die)
elif die.tag == "DW_TAG_GNU_call_site":
process_call_site(die)
else:
for child in die.iter_children():
process_subprogram_rec(child)
def collect_arguments(function_name, function_die, table):
"""
Insert in the database all the arguments of the given function.
We might not know that some argument are missing if they never appear in
an inlined subprogram or call site. By collecting all the arguments, we
make sure we are aware of this blind spot.
"""
for child in function_die.iter_children():
aname = None
if child.tag == "DW_TAG_formal_parameter":
aname = get_name(child)
elif child.tag == "DW_TAG_GNU_call_site_parameter":
aname = get_name(child)
else:
continue
DB.execute(
f"INSERT INTO {table} VALUES (?, ?, 0, 0) ON CONFLICT(function_name, argument_name) DO NOTHING",
(function_name, aname),
)
def process_inlined_subroutine(die):
fname = get_name(die)
DB.execute(
"""
INSERT INTO inlined_subprograms VALUES (?, 1)
ON CONFLICT(function_name) DO
UPDATE SET count = count + 1
""",
(fname,),
)
if "DW_AT_abstract_origin" in die.attributes:
collect_arguments(
fname,
die.get_DIE_from_attribute("DW_AT_abstract_origin"),
"inlined_subprogram_arguments",
)
for child in die.iter_children():
if child.tag == "DW_TAG_inlined_subroutine":
process_inlined_subroutine(child)
elif child.tag == "DW_TAG_formal_parameter":
aname = get_name(child)
DB.execute(
"""
INSERT INTO inlined_subprogram_arguments VALUES (?, ?, 1, ?)
ON CONFLICT(function_name, argument_name) DO UPDATE SET
count = count + 1,
loc_count = loc_count + excluded.loc_count
""",
(fname, aname, int("DW_AT_location" in child.attributes)),
)
def process_call_site(die):
fname = get_name(die)
DB.execute(
"""
INSERT INTO call_sites VALUES (?, 1)
ON CONFLICT(function_name) DO
UPDATE SET count = count + 1
""",
(fname,),
)
if "DW_AT_abstract_origin" in die.attributes:
collect_arguments(
fname,
die.get_DIE_from_attribute("DW_AT_abstract_origin"),
"call_site_arguments",
)
for child in die.iter_children():
if child.tag == "DW_TAG_inlined_subroutine":
process_inlined_subroutine(child)
elif child.tag == "DW_TAG_formal_parameter":
aname = get_name(child)
DB.execute(
"""
INSERT INTO call_site_arguments VALUES (?, ?, 1, ?)
ON CONFLICT(function_name, argument_name) DO UPDATE SET
count = count + 1,
loc_count = loc_count + excluded.loc_count
""",
(fname, aname, int("DW_AT_location" in child.attributes)),
)
elif child.tag == "DW_TAG_GNU_call_site_parameter":
aname = get_name(child)
DB.execute(
"""
INSERT INTO call_site_arguments VALUES (?, ?, 1, ?)
ON CONFLICT(function_name, argument_name) DO UPDATE SET
count = count + 1,
loc_count = loc_count + excluded.loc_count
""",
(fname, aname, int("DW_AT_location" in child.attributes)),
)
def print_die_attributes(die):
for name, attr in die.attributes.items():
eprint(" %s = %s" % (name, attr.value))
def get_name(die):
# We work mainly with mangled C++ name, so look for the linkage name first
if "DW_AT_linkage_name" in die.attributes:
return bytes2str(die.attributes["DW_AT_linkage_name"].value)
elif "DW_AT_name" in die.attributes:
return bytes2str(die.attributes["DW_AT_name"].value)
elif "DW_AT_abstract_origin" in die.attributes:
return get_name(die.get_DIE_from_attribute("DW_AT_abstract_origin"))
elif "DW_AT_specification" in die.attributes:
return get_name(die.get_DIE_from_attribute("DW_AT_specification"))
elif "DW_AT_type" in die.attributes:
return get_name(die.get_DIE_from_attribute("DW_AT_type"))
else:
# print_die_attributes(die)
return "<unknown>"
if __name__ == "__main__":
cli()