SOAdvancedDissector/SOAdvancedDissector.py
2021-05-08 18:57:54 +02:00

729 lines
24 KiB
Python
Executable File

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# Copyright Grégory Soutadé
# This file is part of SOAdvancedDissector
# SOAdvancedDissector is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# SOAdvancedDissector is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with SOAdvancedDissector. If not, see <http://www.gnu.org/licenses/>.
#
import re
import os
import shutil
import struct
import sys
import subprocess
import argparse
from objects import *
from cppprototypeparser import CPPPrototypeParser
from display import ProgressionDisplay
#
# Regexp for readelf -sW|c++filt
#
# 6: 006f25d0 20 OBJECT WEAK DEFAULT 20 vtable for dpdoc::Annot
# One named group per whitespace-separated column of a symbol line.
# Raw strings are used so that regex escapes such as "\:" are not first
# interpreted (and rejected) as Python string escapes.
num_re = r'(?P<num>[0-9]+\:)'
address_re = r'(?P<address>[0-9a-f]+)'
size_re = r'(?P<size>[0-9]+)'
# Template for an upper-case word column; the group name is filled in below
ustr_re = r'(?P<{}>[A-Z]+)'
type_re = ustr_re.format('type')
link_re = ustr_re.format('link')
visibility_re = ustr_re.format('visibility')
ndx_re = r'(?P<ndx>[0-9]+)'
name_re = r'(?P<name>.*)'
readelf = r'[ ]+{}[ ]+{}[ ]+{}[ ]+{}[ ]+{}[ ]+{}[ ]+{}[ ]+{}'
readelf = readelf.format(num_re, address_re, size_re, type_re, link_re, visibility_re,ndx_re,name_re)
readelf_re = re.compile(readelf)
#
# Regexp for readelf --sections|c++filt
#
#   [ 0]                   NULL            00000000 000000 000000 00      0   0  0
#   [ 1] .interp           PROGBITS        00000154 000154 000019 00   A  0   0  1
# The column names below are reused from the symbol regexp on purpose: only
# the compiled readelf_re / sections_re objects are used afterwards.
num_re = r'(?P<num>\[[ ]*[0-9]+\])'
name_re = r'(?P<name>.+)'
type_re = ustr_re.format('type')
address_re = r'(?P<address>[0-9a-f]+)'
offset_re = r'(?P<offset>[0-9a-f]+)'
size_re = r'(?P<size>[0-9a-f]+)'
sections = r'[ ]+{}[ ]+{}[ ]+{}[ ]+{}[ ]+{}[ ]+{}[ ]+.*'
sections = sections.format(num_re, name_re, type_re, address_re, offset_re, size_re)
sections_re = re.compile(sections)
#
# Regexp for vtable-dumper --demangle|c++filt
#
# 0 0000000000000000
# 4 006e9fd0 (& typeinfo for zip::EditableStream)
# Inherit from dputils::GuardedStream
# Inherit from dpio::StreamClient
# 8 002a793d zip::EditableStream::~EditableStream()
# "<index> <value>" line: <value> is an optional '-', a literal '0', an
# optional 'x' and then hexadecimal digits (no symbol name follows)
dynvtable_idx_re = re.compile(r'(?P<index>[0-9]+)[ ]+(?P<vtable_index>[-]?0[x]?[0-9a-f]+)')
# "<index> <address> <name>" line: hexadecimal address followed by one space
# and the rest of the line as name
dynvtable_entry_re = re.compile(r'(?P<index>[0-9]+)[ ]+(?P<address>[0-9a-f]+) (?P<name>.*)')
# "Inherit from <name>" line: captures everything after the prefix
dynvtable_inherit_re = re.compile(r'Inherit from (?P<inherit>.*)')
#
# Global variables
#
# Namespace receiving every object that has no namespace of its own
global_namespace = Namespace('global')
namespaces = {'global':global_namespace} # Root namespaces indexed by name, 'global' is always present
matched_lines = [] # Matched lines (regexp match objects) from readelf symbols output
sections_lines = [] # Matched lines (regexp match objects) from readelf sections output
sections_addr = [] # Contains tuple (section_addr_start, section_addr_end, section_offset)
address_size = 4 # Target address size in bytes (32bits by default)
# Classes for which parseDynamicVtable() recorded a base class,
# consumed (then released) by fixupInheritance()
classes_with_inheritance = []
# NOTE(review): not referenced anywhere in the visible code
namespace_dependencies = {}
# Progress reporter shared by all processing steps
display = ProgressionDisplay()
def line_count(filename):
    """Count newline characters of a file (same value as 'wc -l <filename>')

    The count is computed in Python, so no external 'wc' binary is needed.

    Parameters
    ----------
    filename : str
        Path of the file to inspect

    Returns
    -------
    int
        Line count of filename, 0 if the file cannot be read
    """
    try:
        with open(filename, 'rb') as fd:
            # Read by chunks to keep memory usage flat on big symbol files
            return sum(chunk.count(b'\n')
                       for chunk in iter(lambda: fd.read(1 << 16), b''))
    except (OSError, TypeError, ValueError):
        # Best effort : the result is only used to size progress display
        return 0
def findBinOffset(addr):
    """Find offset into binary file from target address

    Sections must have been extracted (see sections_addr)

    Parameters
    ----------
    addr : int
        Target address

    Returns
    -------
    int
        Offset or None if not found
    """
    for (start, end, offset) in sections_addr:
        # end is start + size, so it is the first address *after* the section :
        # it must not match, else it would hide the section starting there
        if start <= addr < end:
            return (addr - start) + offset
    return None
def funcnameFromProtype(fullname):
    """Split a full prototype into its function part and its namespaces part

    Names starting with 'typeinfo' are not parsed and always
    give ('typeinfo()', '')

    Parameters
    ----------
    fullname : str
        Full function name (package::NCXStreamReceiver::totalLengthReady(unsigned int))

    Returns
    -------
    tuple
        (function name + parameters, namespaces joined with '::')
    """
    if not fullname.startswith('typeinfo'):
        prototype = CPPPrototypeParser(fullname)
        signature = prototype.funcname + prototype.fullparameters
        return (signature, '::'.join(prototype.namespaces))
    return ('typeinfo()', '')
def findObjectInCache(name):
    """Look for an already known object from its full name

    Parameters
    ----------
    name : str
        Full class name (package::NCXStreamReceiver)

    Returns
    -------
    obj
        Found object or None
    """
    prototype = CPPPrototypeParser(name)

    # Functions are only searched in the global namespace
    if prototype.is_function:
        return global_namespace.child(prototype.funcname)

    # No namespace at all : the object can only be a global one
    if not prototype.namespaces:
        return global_namespace.child(prototype.classname)

    rootName = prototype.namespaces[0]
    if rootName not in namespaces:
        return None

    # Walk down one level at a time instead of a single find() on the root,
    # to avoid duplicate names in sub namespaces
    # eg : ns0::ns1::ns2::ns0::ns3::func
    current = namespaces[rootName]
    for nestedName in prototype.namespaces[1:]:
        current = current.find(nestedName)
        if not current:
            return None

    return current.find(prototype.classname)
def findObjectFromAddr(addr):
    """Return the first readelf symbol line located at a given address

    Parameters
    ----------
    addr : int
        Object address

    Returns
    -------
    obj
        First entry of matched_lines whose 'address' group (hexadecimal)
        is equal to addr, None if there is no such entry
    """
    candidates = (symbol for symbol in matched_lines
                  if int(symbol.group('address'), 16) == addr)
    return next(candidates, None)
def createClass(fullname):
    """Create a class object and attach it to its namespace

    Missing namespaces (root one and nested ones) are created on the way.
    A new class object is always created : no lookup of an existing class
    with the same name is done here.

    Parameters
    ----------
    fullname : str
        Full class name (package::NCXStreamReceiver)

    Returns
    -------
    obj
        Created class
    """
    prototype = CPPPrototypeParser(fullname)
    newClass = Class(prototype.classname, '::'.join(prototype.namespaces))

    # Class without namespace goes into the global one
    if not prototype.namespaces:
        global_namespace.addChild(newClass)
        return newClass

    # Root namespace is stored in the namespaces dictionary
    rootName = prototype.namespaces[0]
    if rootName in namespaces:
        owner = namespaces[rootName]
    else:
        owner = Namespace(rootName)
        namespaces[rootName] = owner

    # Nested namespaces are children of their parent namespace
    for nestedName in prototype.namespaces[1:]:
        nested = owner.child(nestedName)
        if not nested:
            nested = Namespace(nestedName)
            owner.addChild(nested)
        owner = nested

    owner.addChild(newClass)
    return newClass
def parseDynamicVtable(filename):
    """Read a vtable-dumper output file and update the virtual
    functions and base classes of already known classes

    A vtable description starts with a 'Vtable for <class>' line and
    ends with an empty line. Descriptions of unknown classes are skipped.

    Parameters
    ----------
    filename : str
        Filename to parse
    """
    display.setTarget(' * Parse dynamic vtable', line_count(filename), 10)
    currentClass = None
    with open(filename, 'r') as stream:
        for rawLine in stream:
            display.progress(1)
            text = rawLine.strip()

            # Empty line -> end of current class
            if not text:
                currentClass = None
                continue

            # Outside of a description : only a new vtable header is useful
            if currentClass is None:
                if text.startswith('Vtable for '):
                    currentClass = findObjectInCache(text[len('Vtable for '):])
                continue

            # First, try object vtable entry
            entry = dynvtable_entry_re.match(text)
            if entry:
                slot = int(entry.group('index'))
                if entry.group('name') == '__cxa_pure_virtual':
                    func = Function('virtfunc{}()'.format(slot), 0, True, True)
                else:
                    funcaddr = int(entry.group('address'), 16)
                    symbol = findObjectFromAddr(funcaddr)
                    # Default name, kept when the address is not a known symbol
                    funcname = 'unknown_virtfunc{}()'.format(slot)
                    funcnamespaces = ''
                    if symbol:
                        funcname, funcnamespaces = funcnameFromProtype(symbol.group('name'))
                    else:
                        sys.stderr.write('Error dynvtable0, no match for {}\n'.format(hex(funcaddr)))
                    func = Function(funcname, funcaddr, virtual=True, namespace=funcnamespaces)
                # The index read from the file is divided by the address size
                # to get the position in the virtual table
                currentClass.updateVirtualFunction(slot // address_size, func)
                continue

            # Index vtable entry
            indexEntry = dynvtable_idx_re.match(text)
            if indexEntry:
                funcaddr = int(indexEntry.group('vtable_index'), 16)
                func = Function('vtable_index{}'.format(-funcaddr), funcaddr, True)
                position = int(indexEntry.group('index')) // address_size
                currentClass.updateVirtualFunction(position, func)
                continue

            # Inherit entry
            inheritEntry = dynvtable_inherit_re.match(text)
            if inheritEntry:
                basename = inheritEntry.group('inherit')
                base = findObjectInCache(basename)
                if not base:
                    base = createClass(basename)
                currentClass.addBaseClass(base)
                classes_with_inheritance.append(currentClass)
                continue

            sys.stderr.write('Error dynvtable, no match for {}\n'.format(text))
            sys.stderr.flush()
    display.finish()
def fixupInheritance():
    """Call fixupInheritance() on every class recorded with a base class

    The recorded list is dropped afterwards, so this step can only be
    run once.
    """
    global classes_with_inheritance
    pending = classes_with_inheritance
    display.setTarget(' * Fixup inheritance', len(pending))
    for derived in pending:
        display.progress(1)
        derived.fixupInheritance()
    display.finish()
    # Release memory
    classes_with_inheritance = None
def parseStaticVtable(binfile):
    """Read 'vtable for ...' objects from the binary file
    and create static vtable entries of their class

    Symbols (matched_lines) and sections (sections_addr) must have been
    extracted before. A vtable whose address is not inside a known section
    is reported on stderr and skipped.

    Parameters
    ----------
    binfile : str
        Filename of binary file to inspect objects
    """
    display.setTarget(' * Parse static vtable', len(matched_lines), 10)
    # Little endian unsigned format of one vtable slot. With another
    # address size nothing is read and every slot is seen as 0.
    slot_format = {4: '<I', 8: '<Q'}.get(address_size)
    with open(binfile, 'rb') as fd:
        for match in matched_lines:
            display.progress(1)
            name = match.group('name')
            if not name.startswith('vtable for'): continue
            address = int(match.group('address'), 16)
            # vtable for mtext::cts::ListOfGlyphRunsCTS
            classname = name[len('vtable for '):]
            class_ = findObjectInCache(classname)
            if class_ is None:
                class_ = createClass(classname)
            binaddr = findBinOffset(address)
            if binaddr is None:
                # Nothing to read : seek(None) would abort the whole analysis
                sys.stderr.write('No section found for {} at {}\n'.format(name, hex(address)))
                continue
            fd.seek(binaddr)
            nb_funcs = int(match.group('size')) // address_size
            #print('vtable {} {} funcs'.format(classname, nb_funcs))
            for i in range(nb_funcs):
                funcaddr = 0
                if slot_format is not None:
                    raw = fd.read(address_size)
                    if len(raw) < address_size:
                        sys.stderr.write('Truncated vtable for {}\n'.format(classname))
                        break
                    funcaddr, = struct.unpack(slot_format, raw)
                if funcaddr == 0:
                    # Address == 0 --> pure virtual
                    func = Function('virtfunc{}()'.format(i), 0, True, True)
                else:
                    funcname = ''
                    funcnamespaces = ''
                    if hex(funcaddr).startswith('0xf'): # Negative address
                        # NOTE(review): funcaddr is read unsigned, so the name
                        # embeds the negated unsigned value - confirm this is
                        # the expected naming
                        funcname = 'vtable_index{}'.format(-funcaddr)
                    else:
                        # Dedicated name : 'match' is still needed by the outer loop
                        symbol = findObjectFromAddr(funcaddr)
                        if symbol is None:
                            sys.stderr.write('No func found at : {}\n'.format(hex(funcaddr)))
                            funcname = 'unknown_virtfunc{}()'.format(i)
                        else:
                            try:
                                (funcname, funcnamespaces) = funcnameFromProtype(symbol.group('name'))
                            except Exception:
                                # Best effort : unparsable prototypes get a generic name
                                sys.stderr.write('FFP except : {}'.format(symbol.group('name')))
                                funcname = 'unknown_virtfunc{}()'.format(i)
                    func = Function(funcname, funcaddr, virtual=True, namespace=funcnamespaces)
                try:
                    # print('Add virt {}'.format(func))
                    class_.addVirtualFunction(func)
                except Exception:
                    # Give some context before propagating the error
                    print(name)
                    print(class_)
                    raise
    display.finish()
def parseTypeinfo():
    """Create one class for each 'typeinfo for <class>' symbol
    found in matched_lines
    """
    # "typeinfo for css::MediaParser"
    prefixLength = len('typeinfo for ')
    symbolNames = (symbol.group('name') for symbol in matched_lines)
    typeinfos = [symbolName[prefixLength:] for symbolName in symbolNames
                 if symbolName.startswith('typeinfo for')]

    # Sort array before creating class in order to have the right
    # class hierarchy
    typeinfos.sort()
    for classname in typeinfos:
        createClass(classname)
def parseSymbolFile(filename):
    """Parse readelf symbols output file
    and fill matched_lines

    Only FUNC and OBJECT symbols with GLOBAL or WEAK binding and DEFAULT
    visibility are kept. The global address_size is then computed from the
    width of the first kept address (two hexadecimal digits per byte).

    Parameters
    ----------
    filename : str
        Filename to parse
    """
    # Without this declaration the computed size would only be a local variable
    global address_size
    display.setTarget(' * Parse symbols file', line_count(filename), 10)
    with open(filename, 'r') as fd:
        for line in fd:
            display.progress(1)
            line = line.rstrip()
            if not line: continue
            match = readelf_re.match(line)
            if not match: continue
            if match.group('type') not in ('FUNC', 'OBJECT'):
                continue
            if match.group('link') not in ('GLOBAL', 'WEAK'):
                continue
            # Strict comparison : "in ('DEFAULT')" was a substring test
            if match.group('visibility') != 'DEFAULT':
                continue
            matched_lines.append(match)
    if matched_lines:
        address_size = len(matched_lines[0].group('address')) // 2
    display.finish()
def parseSectionFile(filename):
    """Parse readelf sections output file
    and fill sections_lines and sections_addr

    Parameters
    ----------
    filename : str
        Filename to parse
    """
    # We assume there is about 20 sections
    display.setTarget(' * Parse sections file (1/2)', line_count(filename))
    with open(filename, 'r') as fd:
        for line in fd:
            display.progress(1)
            line = line.rstrip()
            if not line: continue
            match = sections_re.match(line)
            if not match: continue
            sections_lines.append(match)
    display.finish()
    # Second step works on matched sections, not on file lines
    display.setTarget(' * Parse sections file (2/2)', len(sections_lines))
    for match in sections_lines:
        display.progress(1)
        start = int(match.group('address'), 16)
        size = int(match.group('size'), 16)
        offset = int(match.group('offset'), 16)
        # (first address, first address after the section, offset in file)
        sections_addr.append((start, start+size, offset))
    display.finish()
def addAllMembers():
    """Add all other members that have not been computed

    Every symbol of matched_lines except 'typeinfo...' and 'vtable...' ones
    is turned into a Function (FUNC symbols) or an Attribute (other symbols)
    and attached to the global namespace, to an existing class/namespace or
    to a class created on the fly.
    """
    display.setTarget(' * Add all members', len(matched_lines), 10)
    for match in matched_lines:
        display.progress(1)
        virtual = False
        funcaddr = int(match.group('address'),16)
        name = match.group('name')
        # Already handled by typeinfo and vtable steps
        if name.startswith('typeinfo') or\
           name.startswith('vtable'):
            continue
        # Thunk : keep the target prototype and flag the function as virtual
        if name.startswith('non-virtual thunk to '):
            name = name[len('non-virtual thunk to '):]
            virtual = True
        parser = CPPPrototypeParser(name)
        class_ = None
        obj = None
        funcname = ''
        classname = ''
        if match.group('type') == 'FUNC':
            classname = parser.fullclassname
            # C functions doesn't have () in their name
            if not '(' in name:
                obj = Function(name + '()', funcaddr)
                global_namespace.addChild(obj)
                continue
            else:
                (funcname, funcnamespaces) = funcnameFromProtype(name)
                obj = Function(funcname, funcaddr, virtual=virtual, namespace=funcnamespaces)
                # No classname : add into global namespace
                if not classname:
                    global_namespace.addChild(obj)
                    continue
            class_ = findObjectInCache(classname)
        else: # Object
            # NOTE(review): obj stays None when the parser gives neither
            # funcname nor classname - confirm this cannot happen
            if parser.funcname:
                obj = Attribute(parser.funcname, funcaddr)
                classname = parser.classname
            elif parser.classname:
                obj = Attribute(parser.classname, funcaddr)
                # No namespaces : add into global namespace
                if not parser.namespaces:
                    global_namespace.addChild(obj)
                    continue
                classname = '::'.join(parser.namespaces)
            class_ = findObjectInCache(parser.fullclassname)
        # Try to search in namespaces from C++ method
        if not class_ and parser.namespaces:
            class_ = findObjectInCache('::'.join(parser.namespaces))
        # Try to search in namespaces from C function
        if not class_ and classname in namespaces.keys():
            class_ = namespaces[classname]
        # If still not, it's a new class/function
        if not class_:
            if not classname:
                sys.stderr.write('AAM Err3 "{}" "{}"\n'.format(name, funcname))
                continue
            else:
                class_ = createClass(classname)
        try:
            # Could be class or namespace
            class_.addChild(obj)
        except:
            # Failure is only reported, the symbol is dropped
            sys.stderr.write('Not class {} {}\n'.format(name, class_.name))
            sys.stderr.flush()
            # raise
    display.finish()
def fixBadClassAssertion():
    """Replace classes of the global namespace that look like
    a namespace by a real namespace

    Such objects may have been created as classes while they were
    only seen as a prefix of other names.
    """
    replaced = []
    promoted = []
    display.setTarget(' * Fix bad class assertion (1/2)', len(global_namespace.childs))
    for candidate in global_namespace.childs:
        display.progress(1)
        if type(candidate) != Class or not candidate.looksLikeNamespace():
            continue
        # Reuse root namespace with the same name if there is one
        if candidate.name in namespaces:
            target = namespaces[candidate.name]
        else:
            target = Namespace(candidate.name)
        target.fillFrom(candidate)
        promoted.append(target)
        replaced.append(candidate)
    display.finish()

    # Children list is only modified once its iteration is over
    display.setTarget(' * Fix bad class assertion (2/2)', len(replaced)+len(promoted))
    for oldClass in replaced:
        display.progress(1)
        global_namespace.removeChild(oldClass)
    for newNamespace in promoted:
        display.progress(1)
        global_namespace.addChild(newNamespace)
    display.finish()
def analyseDependencies():
    """Create a class for each dependency reported by root namespaces
    that is still unknown
    """
    display.setTarget(' * Analyse dependencies (1/2)', len(namespaces))
    collected = []
    for rootNamespace in namespaces.values():
        display.progress(1)
        collected.extend(rootNamespace.getDependencies())
    display.finish()

    # Same dependency can be reported many times
    if collected:
        collected = list(set(collected))

    display.setTarget(' * Analyse dependencies (2/2)', len(collected))
    for dependency in collected:
        display.progress(1)
        if not findObjectInCache(dependency):
            createClass(dependency)
    display.finish()
# Runs of characters allowed in a generated header filename
header_re = re.compile('[A-Za-z0-9_-]+')
def headerNameToFilename(name):
    """Compute header base filename (without extension) from a name

    For a C++ name (with '::') only its first namespace is used.
    Result is in lower case, without directory nor '.so*' suffix when
    '.so' is present, and only contains characters allowed by header_re.

    Parameters
    ----------
    name : str
        Name to transform

    Returns
    -------
    str
        Computed name, None for a C++ name without namespace
    """
    if '::' not in name:
        base = name.lower()
    else:
        prototype = CPPPrototypeParser(name)
        if not prototype.namespaces:
            return None
        base = prototype.namespaces[0].lower()

    if '.so' in base:
        # Remove .so* extension
        libname = os.path.basename(base)
        base = libname.split('.so')[0]

    # Keep allowed characters only
    return ''.join(header_re.findall(base))
def outputHeader(namespace, filename):
    """Create header file from namespace description

    The file contains an include guard, one #include per dependency
    header (except standard ones) and the namespace description itself.

    Parameters
    ----------
    namespace : obj
        Namespace to describe (must provide getDependencies() and
        a string representation)
    filename : str
        Output filename
    """
    # Remove standard dependencies. A new list is built, so that both names
    # are filtered and the list returned by the namespace is left untouched
    dependencies = [dep for dep in namespace.getDependencies()
                    if dep not in ('std', '__gnu_cxx')]
    headers = []
    # Filename characters that are not valid in a macro name (like '.' or '-')
    # are replaced by '_'
    define = '_' + re.sub(r'[^A-Z0-9_]', '_', os.path.basename(filename).upper())
    with open(filename, 'w') as fd:
        fd.write('/*\n')
        fd.write(' File automatically generated by SOAdvancedDissector.py\n')
        fd.write(' More information at http://indefero.soutade.fr/p/soadvanceddissector\n')
        fd.write('*/\n\n')
        fd.write('#ifndef {}\n'.format(define))
        fd.write('#define {}\n\n'.format(define))
        for dep in dependencies:
            headername = headerNameToFilename(dep)
            if headername and not headername in headers:
                fd.write('#include <{}.h>\n'.format(headername))
                # Filter multiple headers
                headers.append(headername)
        if dependencies:
            fd.write('\n\n')
        fd.write('{}'.format(namespace))
        # End file with a newline to avoid compiler warnings
        fd.write('#endif // {}\n'.format(define))
def writeResult(target, outputDir, cleanOutputDir):
    """Write result into header files (one per namespace)

    The 'std' namespace is never written. The 'global' namespace is
    written into a file named from target.

    Parameters
    ----------
    target : str
        Analysed file, used to name the header of the global namespace
    outputDir : str
        Directory where headers are written (created if needed)
    cleanOutputDir : bool
        Clean output directory before processing
    """
    if cleanOutputDir:
        print('Clean {}'.format(outputDir))
        shutil.rmtree(outputDir, ignore_errors=True)
    # Also creates missing parent directories
    os.makedirs(outputDir, exist_ok=True)
    setPrintIndent(True)
    try:
        display.setTarget(' * Write output files', len(namespaces))
        for name, namespace in namespaces.items():
            # Skipped namespaces are counted too, to reach the display target
            display.progress(1)
            if name == 'std': continue # Don't try to write std classes
            source = target if name == 'global' else name
            filename = '{}.h'.format(headerNameToFilename(source))
            outputHeader(namespace, os.path.join(outputDir, filename))
        display.finish()
    finally:
        # Restore default print mode even if a file cannot be written
        setPrintIndent(False)
# Command line entry point : parse arguments then run every analysis step
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Extract interfaces (classes, functions, variables) from a GNU/Linux shared library')
    parser.add_argument('-f', '--file', help="Target file",
                        dest="target", required=True)
    parser.add_argument('-s', '--section-file', help="Section file (result from 'readelf --sections|c++filt')",
                        dest="section_file", required=True)
    parser.add_argument('-S', '--symbol-file', help="Symbol file (result from 'readelf -sW|c++filt')",
                        dest="symbol_file", required=True)
    parser.add_argument('-V', '--vtable-file', help="Dynamic vtable file (result from 'vtable-dumper --demangle|c++filt')",
                        dest="vtable_file")
    parser.add_argument('-o', '--output-dir', help="output directory (default ./output)",
                        default="./output", dest="output_dir")
    parser.add_argument('-c', '--clean-output-dir', help="Clean output directory before computing (instead update it)",
                        default=False, action="store_true", dest="clean_output")
    parser.add_argument('-r', '--print-raw-virtual-table', help="Print raw virtual table (debug purpose)",
                        default=False, action="store_true", dest="raw_virtual_table")
    args = parser.parse_args()
    setPrintRawVirtualTable(args.raw_virtual_table)
    print('Analyse {}'.format(args.target))
    # Sections and symbols first : the static vtable step reads the binary
    # file at offsets computed from them
    parseSectionFile(args.section_file)
    parseSymbolFile(args.symbol_file)
    # Classes are created from typeinfo symbols before their vtable is read
    parseTypeinfo()
    parseStaticVtable(args.target)
    # Dynamic vtable file is optional
    if args.vtable_file:
        parseDynamicVtable(args.vtable_file)
        fixupInheritance()
    addAllMembers()
    if args.vtable_file:
        fixBadClassAssertion()
    analyseDependencies()
    writeResult(args.target, args.output_dir, args.clean_output)
    print('Result wrote in {}'.format(args.output_dir))