#!/usr/bin/env python
"""
ftguess.py

ftguess is a Python module to determine the type of a file based on its contents.
It can be used as a Python library or a command-line tool.

Usage: ftguess <filename> [filename2 ...]

ftguess is part of the python-oletools package:
http://www.decalage.info/python/oletools
"""

#=== LICENSE =================================================================

# ftguess is copyright (c) 2018-2022, Philippe Lagadec (http://www.decalage.info)
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without modification,
# are permitted provided that the following conditions are met:
#
#  * Redistributions of source code must retain the above copyright notice, this
#    list of conditions and the following disclaimer.
#  * Redistributions in binary form must reproduce the above copyright notice,
#    this list of conditions and the following disclaimer in the documentation
#    and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

from __future__ import print_function #------------------------------------------------------------------------------ # CHANGELOG: # 2018-07-04 v0.54 PL: - first version # 2021-05-09 v0.60 PL: - __version__ = '0.60.1' # ------------------------------------------------------------------------------ # TODO: # === IMPORTS ================================================================= import sys import io import zipfile import os import olefile import logging import optparse # import lxml or ElementTree for XML parsing: try: # lxml: best performance for XML processing import lxml.etree as ET except ImportError: try: # Python 2.5+: batteries included import xml.etree.ElementTree as ET except ImportError: try: # Python <2.5: standalone ElementTree install import elementtree.cElementTree as ET except ImportError: raise ImportError("lxml or ElementTree are not installed, " \ + "see http://codespeak.net/lxml " \ + "or http://effbot.org/zone/element-index.htm") # IMPORTANT: it should be possible to run oletools directly as scripts # in any directory without installing them with pip or setup.py. # In that case, relative imports are NOT usable. # And to enable Python 2+3 compatibility, we need to use absolute imports, # so we add the oletools parent folder to sys.path (absolute+normalized path): _thismodule_dir = os.path.normpath(os.path.abspath(os.path.dirname(__file__))) # print('_thismodule_dir = %r' % _thismodule_dir) _parent_dir = os.path.normpath(os.path.join(_thismodule_dir, '..')) # print('_parent_dir = %r' % _thirdparty_dir) if _parent_dir not in sys.path: sys.path.insert(0, _parent_dir) from oletools.common import clsid from oletools.thirdparty.xglob import xglob # === LOGGING ================================================================= class NullHandler(logging.Handler): """ Log Handler without output, to avoid printing messages if logging is not configured by the main application. 
Python 2.7 has logging.NullHandler, but this is necessary for 2.6: see https://docs.python.org/2.6/library/logging.html#configuring-logging-for-a-library """ def emit(self, record): pass def get_logger(name, level=logging.CRITICAL+1): """ Create a suitable logger object for this module. The goal is not to change settings of the root logger, to avoid getting other modules' logs on the screen. If a logger exists with same name, reuse it. (Else it would have duplicate handlers and messages would be doubled.) The level is set to CRITICAL+1 by default, to avoid any logging. """ # First, test if there is already a logger with the same name, else it # will generate duplicate messages (due to duplicate handlers): if name in logging.Logger.manager.loggerDict: #NOTE: another less intrusive but more "hackish" solution would be to # use getLogger then test if its effective level is not default. logger = logging.getLogger(name) # make sure level is OK: logger.setLevel(level) return logger # get a new logger: logger = logging.getLogger(name) # only add a NullHandler for this logger, it is up to the application # to configure its own logging: logger.addHandler(NullHandler()) logger.setLevel(level) return logger # a global logger object used for debugging: log = get_logger('ftguess') def enable_logging(): """ Enable logging for this module (disabled by default). This will set the module-specific logger level to NOTSET, which means the main application controls the actual logging level. 
""" log.setLevel(logging.NOTSET) # === CONSTANTS =============================================================== # file types for FileTypeGuesser: class FTYPE(object): """ Constants for file types """ ZIP = 'Zip' WORD = 'Word' WORD6 = 'Word6' WORD97 = 'Word97' WORD2007 = 'Word2007' WORD2007_DOCX = 'Word2007_DOCX' WORD2007_DOTX = 'Word2007_DOTX' WORD2007_DOCM = 'Word2007_DOCM' WORD2007_DOTM = 'Word2007_DOTM' EXCEL = 'Excel' EXCEL5 = 'Excel5' EXCEL97 = 'Excel97' EXCEL2007 = 'Excel2007' EXCEL2007_XLSX = 'Excel2007_XLSX' EXCEL2007_XLSM = 'Excel2007_XLSM' EXCEL2007_XLTX = 'Excel2007_XLTX' EXCEL2007_XLTM = 'Excel2007_XLTM' EXCEL2007_XLSB = 'Excel2007_XLSB' EXCEL2007_XLAM = 'Excel2007_XLAM' POWERPOINT97 = 'Powerpoint97' POWERPOINT2007 = 'Powerpoint2007' POWERPOINT2007_PPTX = 'Powerpoint2007_PPTX' POWERPOINT2007_PPSX = 'Powerpoint2007_PPSX' POWERPOINT2007_PPTM = 'Powerpoint2007_PPTM' POWERPOINT2007_PPSM = 'Powerpoint2007_PPSM' # TODO: DOCM, PPTM, PPSX, PPSM, ... XPS = 'XPS' RTF = 'RTF' HTML = 'HTML' PDF = 'PDF' MHTML = 'MHTML' TEXT = 'TEXT' EXE_PE = 'EXE_PE' GENERIC_OLE = 'OLE' # Generic OLE file GENERIC_XML = 'XML' # Generic XML file GENERIC_OPENXML = 'OpenXML' # Generic OpenXML file UNKNOWN = 'Unknown File Type' class CONTAINER(object): """ Constants for file container types """ RTF = 'RTF' ZIP = 'Zip' OLE = 'OLE' OpenXML = 'OpenXML' FlatOPC = 'FlatOPC' OpenDocument = 'OpenDocument' MIME = 'MIME' BINARY = 'Binary' # Generic binary file without container UNKNOWN = 'Unknown Container' class APP(object): """ Constants for file types """ MSWORD = 'MS Word' MSEXCEL = 'MS Excel' MSPOWERPOINT = 'MS PowerPoint' MSACCESS = 'MS Access' MSVISIO = 'MS Visio' MSPROJECT = 'MS Project' MSOFFICE = 'MS Office' # when the exact app is unknown ZIP_ARCHIVER = 'Any Zip Archiver' WINDOWS = 'Windows' # for Windows executables and XPS UNKNOWN = 'Unknown Application' # FTYPE_NAME = { # FTYPE_ZIP: 'Zip archive', # FTYPE_WORD97: 'MS Word 97-2000 Document', # } # Namespaces and tags for OpenXML 
# parsing - RELS files:
# root element of a RELS part (compared with the parsed root tag below):
NS_RELS = '{http://schemas.openxmlformats.org/package/2006/relationships}'
TAG_RELS = NS_RELS + 'Relationships'
# element describing one relationship, with its Type and Target attributes.
# NOTE: this constant must stay defined, FType_Generic_OpenXML.recognize
# iterates over it.
TAG_REL = NS_RELS + 'Relationship'
ATTR_REL_TYPE = 'Type'
ATTR_REL_TARGET = 'Target'
URL_REL_OFFICEDOC = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument"
# For "strict" OpenXML formats, the URL is different:
URL_REL_OFFICEDOC_STRICT = 'http://purl.oclc.org/ooxml/officeDocument/relationships/officeDocument'
# Url for xps files
URL_REL_XPS = 'http://schemas.microsoft.com/xps/2005/06/fixedrepresentation'

# Namespaces and tags for OpenXML parsing - Content-types file:
NS_CONTENT_TYPES = '{http://schemas.openxmlformats.org/package/2006/content-types}'
TAG_CTYPES_DEFAULT = NS_CONTENT_TYPES + 'Default'
TAG_CTYPES_OVERRIDE = NS_CONTENT_TYPES + 'Override'

# Namespaces and tags for Word/PowerPoint 2007+ XML parsing:
# root element:
NS_XMLPACKAGE = '{http://schemas.microsoft.com/office/2006/xmlPackage}'
TAG_PACKAGE = NS_XMLPACKAGE + 'package'
# the part element includes the binaryData element that contains the VBA
# macro code in Base64:
TAG_PKGPART = NS_XMLPACKAGE + 'part'
ATTR_PKG_NAME = NS_XMLPACKAGE + 'name'
ATTR_PKG_CONTENTTYPE = NS_XMLPACKAGE + 'contentType'
CTYPE_VBAPROJECT = "application/vnd.ms-office.vbaProject"
TAG_PKGBINDATA = NS_XMLPACKAGE + 'binaryData'

# Exception raised by ET.fromstring on malformed XML. lxml.etree and
# xml.etree.ElementTree both expose it as ParseError; fall back to Exception
# for older ElementTree flavours that do not provide that name.
_XML_PARSE_ERROR = getattr(ET, 'ParseError', Exception)


def _read_zip_member(zip_archive, member_name):
    """
    Read one member of an open zip archive.

    :param zip_archive: zipfile.ZipFile object
    :param member_name: str, name of the member within the archive
    :return: the member data, or None if the member is missing (KeyError),
        cannot be extracted such as an encrypted or unsupported entry
        (RuntimeError), or is corrupt (BadZipfile)
    """
    try:
        return zip_archive.read(member_name)
    except (KeyError, RuntimeError, zipfile.BadZipfile) as exc:
        log.debug('Cannot read %r from zip archive: %s', member_name, exc)
        return None


def _parse_xml(xml_data, description):
    """
    Parse XML data, without propagating parsing errors.

    :param xml_data: XML document to parse
    :param description: str, name of the document, only used for logging
    :return: the root element, or None if the data is not well-formed XML
    """
    try:
        return ET.fromstring(xml_data)
    except _XML_PARSE_ERROR as exc:
        log.debug('Cannot parse %s as XML: %s', description, exc)
        return None


# === CLASSES ================================================================

class FType_Base (object):
    """
    Base class of all file type descriptions.

    File types are described by class attributes and are used as classes,
    never instantiated: FileTypeGuesser stores the matching class itself in
    its ftype attribute.
    """
    container = CONTAINER.UNKNOWN
    application = APP.UNKNOWN
    filetype = FTYPE.UNKNOWN
    name = "Unknown file type"
    longname = "Unknown file type"
    extensions = []  # list of common file extensions used for the format
    content_types = []  # list of MIME content-types (can be several)
    PUID = None  # PRONOM Unique ID - see https://www.nationalarchives.gov.uk/PRONOM/Default.aspx
    may_contain_vba = False
    may_contain_xlm = False
    may_contain_ole = False

    @classmethod
    def recognize(cls, ftg):
        """
        return True if the provided file matches the type of this class

        :param ftg: FileTypeGuesser object
        :return: bool
        """
        return False


class FType_Unknown(FType_Base):
    """File type used when nothing else matches."""
    pass


class FType_RTF(FType_Base):
    """Rich Text Format document."""
    container = CONTAINER.RTF
    application = APP.MSWORD
    filetype = FTYPE.RTF
    name = 'RTF'
    longname = 'Rich Text Format'
    extensions = ['rtf', 'doc']
    content_types = ('application/rtf', 'text/rtf')
    PUID = 'fmt/355'  # RTF 1.9 (from Word 2007)

    @classmethod
    def recognize(cls, ftg):
        """
        return True if the file data starts with the RTF magic

        :param ftg: FileTypeGuesser object
        :return: bool
        """
        return ftg.data.startswith(b'{\\rt')


class FType_Generic_OLE(FType_Base):
    """OLE / Compound File Binary container of unknown format."""
    container = CONTAINER.OLE
    application = APP.UNKNOWN
    filetype = FTYPE.GENERIC_OLE
    name = 'Generic OLE/CFB file'
    longname = 'Generic OLE file / Compound File (unknown format)'

    @classmethod
    def recognize(cls, ftg):
        """
        return True if the file data can be opened as an OLE file.

        On success the parsed OLE file and the CLSID of its root storage are
        stored in ftg.olefile, ftg.root_clsid and ftg.root_clsid_name.

        :param ftg: FileTypeGuesser object
        :return: bool
        """
        # Here there's an issue with non-OLE files smaller than 1536 bytes
        # see https://github.com/decalage2/olefile/issues/142
        # Workaround: pad data when it's smaller than 1536 bytes
        # TODO: use the new data parameter of isOleFile when it's implemented
        if len(ftg.data) < 1536:
            data = ftg.data + (b'\x00' * 1536)
        else:
            data = ftg.data
        if not olefile.isOleFile(data):
            return False
        try:
            # Open and parse the OLE file:
            ftg.olefile = olefile.OleFileIO(ftg.data)
            # Extract the CLSID of the root storage
            ftg.root_clsid = ftg.olefile.root.clsid
            ftg.root_clsid_name = clsid.KNOWN_CLSIDS.get(ftg.root_clsid, None)
        except Exception as exc:
            # Any parsing failure means the data is not a usable OLE file.
            # Exception (rather than a bare except) lets KeyboardInterrupt
            # and SystemExit propagate.
            log.debug('Failed to open data as an OLE file: %s', exc)
            return False
        return True


class FType_OLE_CLSID_Base(FType_Generic_OLE):
    """
    Base class to recognize OLE files based on CLSID or stream names
    """
    CLSIDS = []
    STREAMS = []

    @classmethod
    def recognize(cls, ftg):
        """
        return True if the root storage CLSID stored in ftg is one of the
        CLSIDS of this class. The OLE file is not parsed here: ftg.root_clsid
        must have been set beforehand.

        :param ftg: FileTypeGuesser object
        :return: bool
        """
        # TODO: refactor, this is not used anymore
        # TODO: when there is no CLSID, check the presence of well-known
        #       stream names (check if a Word doc is OK without a clsid)
        return ftg.root_clsid is not None and ftg.root_clsid in cls.CLSIDS


class FType_Generic_Zip(FType_Base):
    """Zip archive of unknown content."""
    container = CONTAINER.ZIP
    application = APP.ZIP_ARCHIVER
    filetype = FTYPE.ZIP
    name = 'Zip Archive'
    longname = 'Generic Zip Archive'
    extensions = ['zip']

    @classmethod
    def recognize(cls, ftg):
        """
        return True if the file data can be opened as a zip archive.
        On success the open archive is stored in ftg.zipfile.

        :param ftg: FileTypeGuesser object
        :return: bool
        """
        # First, call is_zipfile to discard non zip archives:
        if not zipfile.is_zipfile(ftg.data_bytesio):
            return False
        # Second, attempt to open the zip file for further processing:
        try:
            ftg.zipfile = zipfile.ZipFile(ftg.data_bytesio)
        except zipfile.BadZipfile:
            # this exception happens only when the zip file could not be opened properly
            # it should not catch other potential errors
            return False
        return True


class FType_Generic_OpenXML(FType_Base):
    """OpenXML package of unknown type."""
    container = CONTAINER.OpenXML
    application = APP.MSOFFICE
    filetype = FTYPE.GENERIC_OPENXML
    name = 'OpenXML file'
    longname = 'Generic OpenXML file'
    extensions = []

    @classmethod
    def recognize(cls, ftg):
        """
        return True if the zip archive stored in ftg.zipfile is an OpenXML
        package with a known main part relationship.

        On success the content-type of the main part (None if it could not
        be determined) is stored in ftg.main_part_content_type.
        A missing or malformed '_rels/.rels' or '[Content_Types].xml' makes
        this method return False instead of raising an exception.

        :param ftg: FileTypeGuesser object, with its zipfile attribute set
        :return: bool
        """
        log.debug('Open XML - recognize')
        # TODO: move most of this code to ooxml.py
        # TODO: here it can be either forward or backward slash...
        root_rels = _read_zip_member(ftg.zipfile, '_rels/.rels')
        if root_rels is None:
            return False
        elem_rels = _parse_xml(root_rels, '_rels/.rels')
        # NOTE: compare with None explicitly, an element without children
        # is falsy.
        if elem_rels is None:
            return False
        # check root:
        if elem_rels.tag != TAG_RELS:
            return False
        main_part = None
        for elem_rel in elem_rels.iter(tag=TAG_REL):
            rel_type = elem_rel.get(ATTR_REL_TYPE)
            rel_target = elem_rel.get(ATTR_REL_TARGET)
            log.debug('Relationship: type=%s target=%s', rel_type, rel_target)
            if rel_type in (URL_REL_OFFICEDOC, URL_REL_OFFICEDOC_STRICT, URL_REL_XPS):
                # TODO: is it useful to distinguish normal and strict OpenXML?
                main_part = rel_target
                # TODO: raise anomaly if there are more than one rel with type office doc
                break
        log.debug('Main part: %s', main_part)
        if main_part is None:
            # just warn but do not raise an exception. This might be just
            # another strange data type out there that we do not understand
            # yet. Returning False means the caller keeps the file type
            # found so far (generic zip archive in FileTypeGuesser).
            log.warning('Failed to find any known relationship in OpenXML-file')
            # TODO: here we should recognize a generic OpenXML type instead of returning False
            return False
        # Part names are stored below without leading slash, so the target
        # must be normalized the same way to be found by part name.
        main_part = main_part.lstrip('/')

        # parse content types, find content type of main part
        content_types = _read_zip_member(ftg.zipfile, '[Content_Types].xml')
        if content_types is None:
            return False
        elem_ctypes = _parse_xml(content_types, '[Content_Types].xml')
        if elem_ctypes is None:
            return False
        ctypes_ext = {}
        ctypes_part = {}
        for elem_ext in elem_ctypes.iter(tag=TAG_CTYPES_DEFAULT):
            extension = elem_ext.get('Extension')
            content_type = elem_ext.get('ContentType')
            if extension is not None and content_type is not None:
                ctypes_ext[extension] = content_type
        for elem_part in elem_ctypes.iter(tag=TAG_CTYPES_OVERRIDE):
            partname = elem_part.get('PartName')
            content_type = elem_part.get('ContentType')
            # entries with a missing attribute are skipped
            if partname is not None and content_type is not None:
                # remove leading slash if present
                ctypes_part[partname.lstrip('/')] = content_type

        # find content-type of the main part, first by part name, second by extension:
        main_part_content_type = None
        if main_part in ctypes_part:
            main_part_content_type = ctypes_part[main_part]
        else:
            # extract extension from part name, without leading dot
            main_part_ext = os.path.splitext(main_part)[1][1:]
            if main_part_ext in ctypes_ext:
                main_part_content_type = ctypes_ext[main_part_ext]
        ftg.main_part_content_type = main_part_content_type
        log.debug('Main part content-type: %s', main_part_content_type)
        return True


# --- WORD Formats ---

class FType_Word(FType_Base):
    '''Base class for all MS Word file types'''
    application = APP.MSWORD
    name = 'MS Word (generic)'
    longname = 'MS Word Document or Template (generic)'


class FType_Word97(FType_OLE_CLSID_Base, FType_Word):
    application = APP.MSWORD
    filetype = FTYPE.WORD97
    name = 'MS Word 97 Document'
    longname = 'MS Word 97-2003 Document or Template'
    CLSIDS = ('00020906-0000-0000-C000-000000000046',)
    extensions = ['doc', 'dot']
    content_types = ['application/msword']
    PUID = 'fmt/40'
    may_contain_vba = True
    may_contain_ole = True
    # TODO: if no CLSID, check stream 'WordDocument'


class FType_Word6(FType_OLE_CLSID_Base, FType_Word):
    application = APP.MSWORD
    filetype = FTYPE.WORD6
    name = 'MS Word 6 Document'
    longname = 'MS Word 6-7 Document or Template'
    CLSIDS = ('00020900-0000-0000-C000-000000000046',)
    extensions = ['doc', 'dot']
    content_types = ['application/msword']
    PUID = 'fmt/39'
    may_contain_ole = True


class FType_Word2007_Base(FType_Generic_OpenXML, FType_Word):
    application = APP.MSWORD
    name = 'MS Word 2007+ File'
    longname = 'MS Word 2007+ File (.doc?)'


class FType_Word2007(FType_Word2007_Base):
    application = APP.MSWORD
    filetype = FTYPE.WORD2007_DOCX
    name = 'MS Word 2007+ Document'
    longname = 'MS Word 2007+ Document (.docx)'
    extensions = ['docx']


class FType_Word2007_Macro(FType_Word2007_Base):
    application = APP.MSWORD
    filetype = FTYPE.WORD2007_DOCM
    name = 'MS Word 2007+ Macro-Enabled Document'
    longname = 'MS Word 2007+ Macro-Enabled Document (.docm)'
    extensions = ['docm']


class FType_Word2007_Template(FType_Word2007_Base):
    application = APP.MSWORD
    filetype = FTYPE.WORD2007_DOTX
    name = 'MS Word 2007+ Template'
    longname = 'MS Word 2007+ Template (.dotx)'
    extensions = ['dotx']


class FType_Word2007_Template_Macro(FType_Word2007_Base):
    application = APP.MSWORD
    filetype = FTYPE.WORD2007_DOTM
    name = 'MS Word 2007+ Macro-Enabled Template'
    longname = 'MS Word 2007+ Macro-Enabled Template (.dotm)'
    extensions = ['dotm']


# --- EXCEL Formats ---

class FType_Excel(FType_Base):
    '''Base class for all MS Excel file types'''
    application = APP.MSEXCEL
    name = 'MS Excel (generic)'
    longname = 'MS Excel Workbook/Template/Add-in (generic)'


class FType_Excel97(FType_Excel, FType_Generic_OLE):
    filetype = FTYPE.EXCEL97
    name = 'MS Excel 97 Workbook'
    longname = 'MS Excel 97-2003 Workbook or Template'
    CLSIDS = ('00020820-0000-0000-C000-000000000046',)
    extensions = ['xls', 'xlt', 'xla']
    # TODO: if no CLSID, check stream 'Workbook' or 'Book' (maybe Excel 5)


class FType_Excel5(FType_Excel, FType_Generic_OLE):
    filetype = FTYPE.EXCEL5
    name = 'MS Excel 5.0/95 Workbook'
    longname = 'MS Excel 5.0/95 Workbook, Template or Add-in'
    CLSIDS = ('00020810-0000-0000-C000-000000000046',)
    extensions = ['xls', 'xlt', 'xla']
    # TODO: this CLSID is also used in Excel addins (.xla) saved by MS Excel 365


class FType_Excel2007(FType_Excel, FType_Generic_OpenXML):
    '''Base class for all MS Excel 2007 file types'''
    name = 'MS Excel 2007+ (generic)'
    longname = 'MS Excel 2007+ Workbook or Template (generic)'
    content_types = ('application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',)
    # note: content type differs only for xlsm


class FType_Excel2007_XLSX (FType_Excel2007):
    filetype = FTYPE.EXCEL2007_XLSX
    name = 'MS Excel 2007+ Workbook'
    longname = 'MS Excel 2007+ Workbook (.xlsx)'
    extensions = ['xlsx']
    PUID = 'fmt/214'


class FType_Excel2007_XLSM (FType_Excel2007):
    filetype = FTYPE.EXCEL2007_XLSM
    name = 'MS Excel 2007+ Macro-Enabled Workbook'
    longname = 'MS Excel 2007+ Macro-Enabled Workbook (.xlsm)'
    extensions = ['xlsm']
    content_types = ('application/vnd.ms-excel.sheet.macroEnabled.12',)
    PUID = 'fmt/445'


class FType_Excel2007_XLSB (FType_Excel2007):
    filetype = FTYPE.EXCEL2007_XLSB
    name = 'MS Excel 2007+ Binary Workbook'
    longname = 'MS Excel 2007+ Binary Workbook (.xlsb)'
    extensions = ['xlsb']
    content_types = ('application/vnd.ms-excel.sheet.binary.macroEnabled.12',)
    PUID = 'fmt/595'


class FType_Excel2007_Template(FType_Excel2007):
    filetype = FTYPE.EXCEL2007_XLTX
    name = 'MS Excel 2007+ Template'
    longname = 'MS Excel 2007+ Template (.xltx)'
    extensions = ['xltx']


class FType_Excel2007_Template_Macro(FType_Excel2007):
    filetype = FTYPE.EXCEL2007_XLTM
    name = 'MS Excel 2007+ Macro-Enabled Template'
    longname = 'MS Excel 2007+ Macro-Enabled Template (.xltm)'
    extensions = ['xltm']


class FType_Excel2007_Addin_Macro(FType_Excel2007):
    filetype = FTYPE.EXCEL2007_XLAM
    name = 'MS Excel 2007+ Macro-Enabled Add-in'
    longname = 'MS Excel 2007+ Macro-Enabled Add-in (.xlam)'
    extensions = ['xlam']


# --- POWERPOINT Formats ---

class FType_Powerpoint(FType_Base):
    '''Base class for all MS Powerpoint file types'''
    application = APP.MSPOWERPOINT
    name = 'MS Powerpoint (generic)'
    longname = 'MS Powerpoint Presentation/Slideshow/Template/Addin/... (generic)'


class FType_Powerpoint97(FType_Powerpoint, FType_Generic_OLE):
    # see also: ppt_record_parser.is_ppt
    filetype = FTYPE.POWERPOINT97
    name = 'MS Powerpoint 97 Presentation'
    longname = 'MS Powerpoint 97-2003 Presentation/Slideshow/Template'
    CLSIDS = ('64818D10-4F9B-11CF-86EA-00AA00B929E8',)
    extensions = ['ppt', 'pps', 'pot']


class FType_Powerpoint2007(FType_Powerpoint, FType_Generic_OpenXML):
    '''Base class for all MS Powerpoint 2007 file types'''
    filetype = FTYPE.POWERPOINT2007
    name = 'MS Powerpoint 2007+ (generic)'
    longname = 'MS Powerpoint 2007+ Presentation/Slideshow/Template (generic)'
    content_types = ('application/vnd.openxmlformats-officedocument.presentationml.presentation',)


class FType_Powerpoint2007_Presentation(FType_Powerpoint2007):
    filetype = FTYPE.POWERPOINT2007_PPTX
    name = 'MSPointpoint 2007+ Presentation'
    longname = 'MSPointpoint 2007+ Presentation (.pptx)'
    extensions = ['pptx']


class FType_Powerpoint2007_Slideshow(FType_Powerpoint2007):
    filetype = FTYPE.POWERPOINT2007_PPSX
    name = 'MSPointpoint 2007+ Slideshow'
    longname = 'MSPointpoint 2007+ Slideshow (.ppsx)'
    extensions = ['ppsx']


class FType_Powerpoint2007_Macro(FType_Powerpoint2007):
    filetype = FTYPE.POWERPOINT2007_PPTM
    name = 'MSPointpoint 2007+ Macro-Enabled Presentation'
    longname = 'MSPointpoint 2007+ Macro-Enabled Presentation (.pptm)'
    extensions = ['pptm']


class FType_Powerpoint2007_Slideshow_Macro(FType_Powerpoint2007):
    filetype = FTYPE.POWERPOINT2007_PPSM
    name = 'MSPointpoint 2007+ Macro-Enabled Slideshow'
    longname = 'MSPointpoint 2007+ Macro-Enabled Slideshow (.ppsm)'
    extensions = ['ppsm']


class FType_XPS(FType_Generic_OpenXML):
    application = APP.WINDOWS
    filetype = FTYPE.XPS
    name = 'XPS'
    # a plain string like every other longname (a trailing comma here would
    # turn it into a 1-tuple)
    longname = 'Fixed-Page Document (.xps)'
    extensions = ['xps']


# TODO: for PPT, check for stream 'PowerPoint Document'
# TODO: for Visio, check for stream 'VisioDocument'

clsid_ftypes = {
    # mapping from CLSID of root storage to FType classes:
    # TODO: do not repeat magic numbers, import from oletools.common.clsid
    # WORD
    '00020906-0000-0000-C000-000000000046': FType_Word97,
    '00020900-0000-0000-C000-000000000046': FType_Word6,
    # EXCEL
    '00020820-0000-0000-C000-000000000046': FType_Excel97,
    '00020810-0000-0000-C000-000000000046': FType_Excel5,
    # POWERPOINT
    '64818D10-4F9B-11CF-86EA-00AA00B929E8': FType_Powerpoint97,
}

openxml_ftypes = {
    # mapping from content-type of main part to FType classes:
    # WORD
    'application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml': FType_Word2007,
    'application/vnd.ms-word.document.macroEnabled.main+xml': FType_Word2007_Macro,
    'application/vnd.openxmlformats-officedocument.wordprocessingml.template.main+xml': FType_Word2007_Template,
    'application/vnd.ms-word.template.macroEnabledTemplate.main+xml': FType_Word2007_Template_Macro,
    # EXCEL
    'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet.main+xml': FType_Excel2007_XLSX,
    'application/vnd.ms-excel.sheet.macroEnabled.main+xml': FType_Excel2007_XLSM,
    'application/vnd.ms-excel.sheet.binary.macroEnabled.main': FType_Excel2007_XLSB,
    'application/vnd.openxmlformats-officedocument.spreadsheetml.template.main+xml': FType_Excel2007_Template,
    'application/vnd.ms-excel.template.macroEnabled.main+xml': FType_Excel2007_Template_Macro,
    'application/vnd.ms-excel.addin.macroEnabled.main+xml': FType_Excel2007_Addin_Macro,
    # POWERPOINT
    'application/vnd.openxmlformats-officedocument.presentationml.presentation.main+xml': FType_Powerpoint2007_Presentation,
    'application/vnd.openxmlformats-officedocument.presentationml.slideshow.main+xml': FType_Powerpoint2007_Slideshow,
    'application/vnd.ms-powerpoint.presentation.macroEnabled.main+xml': FType_Powerpoint2007_Macro,
    'application/vnd.ms-powerpoint.slideshow.macroEnabled.main+xml': FType_Powerpoint2007_Slideshow_Macro,
    # XPS
    'application/vnd.ms-package.xps-fixeddocumentsequence+xml': FType_XPS,
}


class FType_EXE_PE (FType_Base):
    """Windows executable, recognized by its 'MZ' magic only."""
    filetype = FTYPE.EXE_PE
    container = CONTAINER.BINARY
    application = APP.WINDOWS
    name = "Windows PE Executable or DLL"
    longname = "Windows Portable Executable or DLL (EXE,DLL)"
    extensions = ('exe', 'dll', 'sys', 'scr')  # TODO: add more from https://en.wikipedia.org/wiki/Portable_Executable
    content_types = ('application/vnd.microsoft.portable-executable',)
    PUID = 'fmt/899'

    @classmethod
    def recognize(cls, ftg):
        """
        return True if the file data starts with the 'MZ' magic

        :param ftg: FileTypeGuesser object
        :return: bool
        """
        # TODO: make this more accurate by checking the PE header, e.g. using pefile or directly
        return ftg.data.startswith(b'MZ')


class FileTypeGuesser(object):
    """
    A class to guess the type of a file, focused on MS Office, RTF and ZIP.
    """

    def __init__(self, filepath=None, data=None):
        """
        Guess the type of a file, from its path or from its content.

        The result is available in the attributes ftype (FType class),
        container, filetype and application.

        :param filepath: str, path of the file; it is only read from disk
            when data is None
        :param data: bytes, content of the file
        :raises ValueError: if both filepath and data are None
        """
        self.filepath = filepath
        self.data = data
        self.container = None
        self.application = None
        self.filetype = None
        self.ftype = FType_Unknown  # FType class
        self.data_bytesio = None
        # For OLE:
        self.olefile = None
        self.root_clsid = None
        self.root_clsid_name = None
        # For ZIP:
        self.zipfile = None
        # For OpenXML:
        self.root_rels = None
        self.main_part_content_type = None
        # For XML:
        self.root_xmltag = None
        self.xmlroot = None

        if filepath is None and data is None:
            raise ValueError('FileTypeGuesser requires either a file path or file data, or both')
        if data is None:
            with open(filepath, 'rb') as f:
                self.data = f.read()
        self.data_bytesio = io.BytesIO(self.data)

        # Identify the main container type:
        for ftype in (FType_RTF, FType_Generic_OLE, FType_Generic_Zip):
            if ftype.recognize(self):
                self.ftype = ftype
                break
        self.container = self.ftype.container
        self.filetype = self.ftype.filetype
        self.application = self.ftype.application

        # OLE file types:
        if self.container == CONTAINER.OLE:
            ft = clsid_ftypes.get(self.root_clsid, None)
            if ft is not None:
                self.ftype = ft

        # OpenXML file types:
        if self.container == CONTAINER.ZIP:
            if FType_Generic_OpenXML.recognize(self):
                self.ftype = FType_Generic_OpenXML
                ft = openxml_ftypes.get(self.main_part_content_type, None)
                if ft is not None:
                    self.ftype = ft

        # TODO: use a mapping from magic to file types
        if self.container == CONTAINER.UNKNOWN:
            if FType_EXE_PE.recognize(self):
                self.ftype = FType_EXE_PE

        # refresh the shortcuts, the file type may have been refined above:
        self.container = self.ftype.container
        self.filetype = self.ftype.filetype
        self.application = self.ftype.application

    def __str__(self):
        """Give a short string representation of this object."""
        return '[FileTypeGuesser for {0}: {1} from {2} in {3}]'.format(
            "data" if self.filepath is None else os.path.basename(self.filepath),
            self.filetype, self.application, self.container)

    def close(self):
        """
        This method must be called at the end of processing.
        It closes the zip archive if one was opened.
        """
        # TODO: only close self.olefile if it was opened by ftguess
        if self.zipfile is not None:
            self.zipfile.close()

    def is_ole(self):
        """
        Shortcut to check if the container is OLE
        :return: bool
        """
        return issubclass(self.ftype, FType_Generic_OLE) or self.container == CONTAINER.OLE

    def is_openxml(self):
        """
        Shortcut to check if the container is OpenXML
        :return: bool
        """
        return issubclass(self.ftype, FType_Generic_OpenXML) or self.container == CONTAINER.OpenXML

    def is_word(self):
        """
        Shortcut to check if a file is a Word file of any kind
        (any file type derived from FType_Word)
        :return: bool
        """
        return issubclass(self.ftype, FType_Word)

    def is_excel(self):
        """
        Shortcut to check if a file is an Excel workbook, template or add-in
        :return: bool
        """
        return issubclass(self.ftype, FType_Excel)

    def is_powerpoint(self):
        """
        Shortcut to check if a file is Powerpoint file of any kind
        :return: bool
        """
        return issubclass(self.ftype, FType_Powerpoint)


# === FUNCTIONS ==============================================================

def ftype_guess(filepath=None, data=None):
    """
    Guess the type of a file.

    :param filepath: str, path of the file (read from disk if data is None)
    :param data: bytes, content of the file
    :return: FileTypeGuesser object
    """
    return FileTypeGuesser(filepath, data)


def process_file(container, filename, data):
    """
    Guess the type of one file and print the results on stdout.

    :param container: container yielded by xglob.iter_files for the file
        (not used here)
    :param filename: str, name of the file
    :param data: bytes, content of the file, or None to read it from filename
    """
    print('File : %s' % filename)
    ftg = ftype_guess(filepath=filename, data=data)
    try:
        print('File Type : %s' % ftg.ftype.name)
        print('Description: %s' % ftg.ftype.longname)
        print('Application: %s' % ftg.ftype.application)
        print('Container : %s' % ftg.container)
        if ftg.root_clsid is not None:
            print('Root CLSID : %s - %s' % (ftg.root_clsid, ftg.root_clsid_name))
        print('Content-type(s) : %s' % ','.join(ftg.ftype.content_types))
        print('PUID : %s' % ftg.ftype.PUID)
        print()
    finally:
        # release the zip archive opened by the guesser, if any
        ftg.close()


#=== MAIN =================================================================

def main():
    """
    Command-line entry point: parse the options, then print the guessed
    type of each file given as argument.
    """
    # print banner with version
    python_version = '%d.%d.%d' % sys.version_info[0:3]
    print ('ftguess %s on Python %s - http://decalage.info/python/oletools' %
           (__version__, python_version))
    print ('THIS IS WORK IN PROGRESS - Check updates regularly!')
    print ('Please report any issue at https://github.com/decalage2/oletools/issues')
    print ('')

    DEFAULT_LOG_LEVEL = "warning"  # Default log level
    LOG_LEVELS = {
        'debug': logging.DEBUG,
        'info': logging.INFO,
        'warning': logging.WARNING,
        'error': logging.ERROR,
        'critical': logging.CRITICAL
    }

    usage = 'usage: %prog [options] <filename> [filename2 ...]'
    parser = optparse.OptionParser(usage=usage)
    # parser.add_option('-c', '--csv', dest='csv',
    #     help='export results to a CSV file')
    parser.add_option("-r", action="store_true", dest="recursive",
                      help='find files recursively in subdirectories.')
    parser.add_option("-z", "--zip", dest='zip_password', type='str', default=None,
                      help='if the file is a zip archive, open first file from it, using the provided password')
    parser.add_option("-f", "--zipfname", dest='zip_fname', type='str', default='*',
                      help='if the file is a zip archive, file(s) to be opened within the zip. Wildcards * and ? are supported. (default:*)')
    parser.add_option('-l', '--loglevel', dest="loglevel", action="store", default=DEFAULT_LOG_LEVEL,
                      help="logging level debug/info/warning/error/critical (default=%default)")

    (options, args) = parser.parse_args()

    # Print help if no arguments are passed
    if len(args) == 0:
        print (__doc__)
        parser.print_help()
        sys.exit()

    # Reject unknown log levels with a usage error instead of a KeyError:
    log_level = LOG_LEVELS.get(options.loglevel)
    if log_level is None:
        parser.error('invalid logging level %r, expected debug/info/warning/error/critical'
                     % options.loglevel)

    # Setup logging to the console:
    # here we use stdout instead of stderr by default, so that the output
    # can be redirected properly.
    logging.basicConfig(level=log_level, stream=sys.stdout,
                        format='%(levelname)-8s %(message)s')
    # enable logging in the modules:
    enable_logging()

    for container, filename, data in xglob.iter_files(args, recursive=options.recursive,
                                                      zip_password=options.zip_password,
                                                      zip_fname=options.zip_fname):
        # ignore directory names stored in zip files:
        if container and filename.endswith('/'):
            continue
        process_file(container, filename, data)


if __name__ == '__main__':
    main()