#!/usr/bin/env python
"""
ftguess.py

ftguess is a Python module to determine the type of a file based on its contents.
It can be used as a Python library or a command-line tool.

Usage: ftguess <file>

ftguess is part of the python-oletools package:
http://www.decalage.info/python/oletools
"""

#=== LICENSE =================================================================

# ftguess is copyright (c) 2018-2022, Philippe Lagadec (http://www.decalage.info)
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without modification,
# are permitted provided that the following conditions are met:
#
#  * Redistributions of source code must retain the above copyright notice, this
#    list of conditions and the following disclaimer.
#  * Redistributions in binary form must reproduce the above copyright notice,
#    this list of conditions and the following disclaimer in the documentation
#    and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

from __future__ import print_function

#------------------------------------------------------------------------------
# CHANGELOG:
# 2018-07-04 v0.54 PL: - first version
# 2021-05-09 v0.60 PL: -

__version__ = '0.60.1'

# ------------------------------------------------------------------------------
# TODO:


# === IMPORTS =================================================================

import sys
import io
import zipfile
import os
import olefile
import logging
import optparse

# import lxml or ElementTree for XML parsing:
try:
    # lxml: best performance for XML processing
    import lxml.etree as ET
except ImportError:
    try:
        # Python 2.5+: batteries included
        import xml.etree.ElementTree as ET
    except ImportError:
        try:
            # Python <2.5: standalone ElementTree install
            import elementtree.cElementTree as ET
        except ImportError:
            raise ImportError("lxml or ElementTree are not installed, " \
                               + "see http://codespeak.net/lxml " \
                               + "or http://effbot.org/zone/element-index.htm")

# IMPORTANT: it should be possible to run oletools directly as scripts
# in any directory without installing them with pip or setup.py.
# In that case, relative imports are NOT usable.
# And to enable Python 2+3 compatibility, we need to use absolute imports,
# so we add the oletools parent folder to sys.path (absolute+normalized path):
_thismodule_dir = os.path.normpath(os.path.abspath(os.path.dirname(__file__)))
# print('_thismodule_dir = %r' % _thismodule_dir)
_parent_dir = os.path.normpath(os.path.join(_thismodule_dir, '..'))
# print('_parent_dir = %r' % _thirdparty_dir)
if _parent_dir not in sys.path:
    sys.path.insert(0, _parent_dir)

from oletools.common import clsid
from oletools.thirdparty.xglob import xglob

# === LOGGING =================================================================

class NullHandler(logging.Handler):
    """
    Log Handler without output, to avoid printing messages if logging is not
    configured by the main application.
    Python 2.7 has logging.NullHandler, but this is necessary for 2.6:
    see https://docs.python.org/2.6/library/logging.html#configuring-logging-for-a-library
    """
    def emit(self, record):
        pass

def get_logger(name, level=logging.CRITICAL+1):
    """
    Create a suitable logger object for this module.
    The goal is not to change settings of the root logger, to avoid getting
    other modules' logs on the screen.
    If a logger exists with same name, reuse it. (Else it would have duplicate
    handlers and messages would be doubled.)
    The level is set to CRITICAL+1 by default, to avoid any logging.
    """
    # First, test if there is already a logger with the same name, else it
    # will generate duplicate messages (due to duplicate handlers):
    if name in logging.Logger.manager.loggerDict:
        #NOTE: another less intrusive but more "hackish" solution would be to
        # use getLogger then test if its effective level is not default.
        logger = logging.getLogger(name)
        # make sure level is OK:
        logger.setLevel(level)
        return logger
    # get a new logger:
    logger = logging.getLogger(name)
    # only add a NullHandler for this logger, it is up to the application
    # to configure its own logging:
    logger.addHandler(NullHandler())
    logger.setLevel(level)
    return logger

# a global logger object used for debugging:
log = get_logger('ftguess')

def enable_logging():
    """
    Enable logging for this module (disabled by default).
    This will set the module-specific logger level to NOTSET, which
    means the main application controls the actual logging level.
    """
    log.setLevel(logging.NOTSET)

# === CONSTANTS ===============================================================

# file types for FileTypeGuesser:
class FTYPE(object):
    """
    Constants for file types
    """
    ZIP = 'Zip'
    WORD = 'Word'
    WORD6 = 'Word6'
    WORD97 = 'Word97'
    WORD2007 = 'Word2007'
    WORD2007_DOCX = 'Word2007_DOCX'
    WORD2007_DOTX = 'Word2007_DOTX'
    WORD2007_DOCM = 'Word2007_DOCM'
    WORD2007_DOTM = 'Word2007_DOTM'
    EXCEL = 'Excel'
    EXCEL5 = 'Excel5'
    EXCEL97 = 'Excel97'
    EXCEL2007 = 'Excel2007'
    EXCEL2007_XLSX = 'Excel2007_XLSX'
    EXCEL2007_XLSM = 'Excel2007_XLSM'
    EXCEL2007_XLTX = 'Excel2007_XLTX'
    EXCEL2007_XLTM = 'Excel2007_XLTM'
    EXCEL2007_XLSB = 'Excel2007_XLSB'
    EXCEL2007_XLAM = 'Excel2007_XLAM'
    POWERPOINT97 = 'Powerpoint97'
    POWERPOINT2007 = 'Powerpoint2007'
    POWERPOINT2007_PPTX = 'Powerpoint2007_PPTX'
    POWERPOINT2007_PPSX = 'Powerpoint2007_PPSX'
    POWERPOINT2007_PPTM = 'Powerpoint2007_PPTM'
    POWERPOINT2007_PPSM = 'Powerpoint2007_PPSM'
    # TODO: DOCM, PPTM, PPSX, PPSM, ...
    XPS = 'XPS'
    RTF = 'RTF'
    HTML = 'HTML'
    PDF = 'PDF'
    MHTML = 'MHTML'
    TEXT = 'TEXT'
    EXE_PE = 'EXE_PE'
    GENERIC_OLE = 'OLE' # Generic OLE file
    GENERIC_XML = 'XML' # Generic XML file
    GENERIC_OPENXML = 'OpenXML' # Generic OpenXML file
    UNKNOWN = 'Unknown File Type'

class CONTAINER(object):
    """
    Constants for file container types
    """
    RTF = 'RTF'
    ZIP = 'Zip'
    OLE = 'OLE'
    OpenXML = 'OpenXML'
    FlatOPC = 'FlatOPC'
    OpenDocument = 'OpenDocument'
    MIME = 'MIME'
    BINARY = 'Binary'  # Generic binary file without container
    UNKNOWN = 'Unknown Container'

class APP(object):
    """
    Constants for file types
    """
    MSWORD = 'MS Word'
    MSEXCEL = 'MS Excel'
    MSPOWERPOINT = 'MS PowerPoint'
    MSACCESS = 'MS Access'
    MSVISIO = 'MS Visio'
    MSPROJECT = 'MS Project'
    MSOFFICE = 'MS Office'  # when the exact app is unknown
    ZIP_ARCHIVER = 'Any Zip Archiver'
    WINDOWS = 'Windows'  # for Windows executables and XPS
    UNKNOWN = 'Unknown Application'

# FTYPE_NAME = {
#     FTYPE_ZIP: 'Zip archive',
#     FTYPE_WORD97: 'MS Word 97-2000 Document',
# }

# Namespaces and tags for OpenXML parsing`- RELS files:
# root: <Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
NS_RELS = '{http://schemas.openxmlformats.org/package/2006/relationships}'
TAG_RELS = NS_RELS + 'Relationships'
# <Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="xl/workbook.bin"/>
TAG_REL = NS_RELS + 'Relationship'
ATTR_REL_TYPE = 'Type'
ATTR_REL_TARGET = 'Target'
URL_REL_OFFICEDOC = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument"
# For "strict" OpenXML formats, the URL is different:
URL_REL_OFFICEDOC_STRICT = 'http://purl.oclc.org/ooxml/officeDocument/relationships/officeDocument'
# Url for xps files
URL_REL_XPS = 'http://schemas.microsoft.com/xps/2005/06/fixedrepresentation'
# Namespaces and tags for OpenXML parsing`- Content-types file:
NS_CONTENT_TYPES = '{http://schemas.openxmlformats.org/package/2006/content-types}'
TAG_CTYPES_DEFAULT = NS_CONTENT_TYPES + 'Default'
TAG_CTYPES_OVERRIDE = NS_CONTENT_TYPES + 'Override'


# Namespaces and tags for Word/PowerPoint 2007+ XML parsing:
# root: <pkg:package xmlns:pkg="http://schemas.microsoft.com/office/2006/xmlPackage">
NS_XMLPACKAGE = '{http://schemas.microsoft.com/office/2006/xmlPackage}'
TAG_PACKAGE = NS_XMLPACKAGE + 'package'
# the tag <pkg:part> includes <pkg:binaryData> that contains the VBA macro code in Base64:
# <pkg:part pkg:name="/word/vbaProject.bin" pkg:contentType="application/vnd.ms-office.vbaProject"><pkg:binaryData>
TAG_PKGPART = NS_XMLPACKAGE + 'part'
ATTR_PKG_NAME = NS_XMLPACKAGE + 'name'
ATTR_PKG_CONTENTTYPE = NS_XMLPACKAGE + 'contentType'
CTYPE_VBAPROJECT = "application/vnd.ms-office.vbaProject"
TAG_PKGBINDATA = NS_XMLPACKAGE + 'binaryData'




# === CLASSES ================================================================

class FType_Base (object):
    container = CONTAINER.UNKNOWN
    application = APP.UNKNOWN
    filetype = FTYPE.UNKNOWN
    name = "Unknown file type"
    longname = "Unknown file type"
    extensions = []  # list of common file extensions used for the format
    content_types = []  # list of MIME content-types (can be several)
    PUID = None  # PRONOM Unique ID - see https://www.nationalarchives.gov.uk/PRONOM/Default.aspx
    may_contain_vba = False
    may_contain_xlm = False
    may_contain_ole = False

    @classmethod
    def recognize(cls, ftg):
        """
        return True if the provided file matches the type of this class
        :param ftg: FileTypeGuesser object
        :return: bool
        """
        return False

class FType_Unknown(FType_Base):
    pass

class FType_RTF(FType_Base):
    container = CONTAINER.RTF
    application = APP.MSWORD
    filetype = FTYPE.RTF
    name = 'RTF'
    longname = 'Rich Text Format'
    extensions = ['rtf', 'doc']
    content_types = ('application/rtf', 'text/rtf')
    PUID = 'fmt/355'  # RTF 1.9 (from Word 2007)

    @classmethod
    def recognize(cls, ftg):
        # print('checking RTF')
        # print(repr(data[0:4]))
        return True if ftg.data.startswith(b'{\\rt') else False


class FType_Generic_OLE(FType_Base):
    container = CONTAINER.OLE
    application = APP.UNKNOWN
    filetype = FTYPE.GENERIC_OLE
    name = 'Generic OLE/CFB file'
    longname = 'Generic OLE file / Compound File (unknown format)'

    @classmethod
    def recognize(cls, ftg):
        # Here there's an issue with non-OLE files smaller than 1536 bytes
        # see https://github.com/decalage2/olefile/issues/142
        # Workaround: pad data when it's smaller than 1536 bytes
        # TODO: use the new data parameter of isOleFile when it's implemented
        if len(ftg.data)<1536:
            data = ftg.data + (b'\x00'*1536)
        else:
            data = ftg.data
        if olefile.isOleFile(data):
            # open the OLE file
            try:
                # Open and parse the OLE file:
                ftg.olefile = olefile.OleFileIO(ftg.data)
                # Extract the CLSID of the root storage
                ftg.root_clsid = ftg.olefile.root.clsid
                ftg.root_clsid_name = clsid.KNOWN_CLSIDS.get(ftg.root_clsid, None)
            except:
                # TODO: log the error
                return False
            return True
        else:
            return False


class FType_OLE_CLSID_Base(FType_Generic_OLE):
    """
    Base class to recognize OLE files based on CLSID or stream names
    """
    CLSIDS = []
    STREAMS = []

    @classmethod
    def recognize(cls, ftg):
        # TODO: refactor, this is not used anymore
        if ftg.root_clsid is not None:
            # First, attempt to identify the root storage CLSID:
            if ftg.root_clsid in cls.CLSIDS:
                return True
            else:
                return False
        else:
            # Second, check the presence of well-known stream names
            # TODO: check if a Word doc is OK without a clsid
            return False

class FType_Generic_Zip(FType_Base):
    container = CONTAINER.ZIP
    application = APP.ZIP_ARCHIVER
    filetype = FTYPE.ZIP
    name = 'Zip Archive'
    longname = 'Generic Zip Archive'
    extensions = ['zip']

    @classmethod
    def recognize(cls, ftg):
        # First, call is_zipfile to discard non zip archives:
        if not zipfile.is_zipfile(ftg.data_bytesio):
            return False
        # Second, attempt to open the zip file for further processing:
        try:
            ftg.zipfile = zipfile.ZipFile(ftg.data_bytesio)
        except zipfile.BadZipfile:
            # this exception happens only when the zip file could not be opened properly
            # it should not catch other potential errors
            return False
        return True


class FType_Generic_OpenXML(FType_Base):
    container = CONTAINER.OpenXML
    application = APP.MSOFFICE
    filetype = FTYPE.GENERIC_OPENXML
    name = 'OpenXML file'
    longname = 'Generic OpenXML file'
    extensions = []

    @classmethod
    def recognize(cls, ftg):
        log.debug('Open XML - recognize')
        # TODO: move most of this code to ooxml.py
        # TODO: here it can be either forward or backward slash...
        try:
            ftg.zipfile.getinfo('_rels/.rels')
        except KeyError:
            return False
        try:
            root_rels = ftg.zipfile.read('_rels/.rels')
        except RuntimeError:
            return False
        # parse the XML content
        # TODO: handle XML parsing exceptions
        elem_rels = ET.fromstring(root_rels)
        # check root:
        if elem_rels.tag != TAG_RELS:
            return False
        main_part = None
        for elem_rel in elem_rels.iter(tag=TAG_REL):
            rel_type = elem_rel.get(ATTR_REL_TYPE)
            log.debug('Relationship: type=%s target=%s' % (rel_type, elem_rel.get(ATTR_REL_TARGET)))
            if rel_type in (URL_REL_OFFICEDOC, URL_REL_OFFICEDOC_STRICT, URL_REL_XPS):
                # TODO: is it useful to distinguish normal and strict OpenXML?
                main_part = elem_rel.get(ATTR_REL_TARGET)
                # TODO: raise anomaly if there are more than one rel with type office doc
                break
        log.debug('Main part: %s' % main_part)
        # if main_part is not None:
        #     try:
        #         main_part_xml = ftg.zipfile.read(main_part)
        #     except RuntimeError:
        #         return False
        #     # TODO: handle XML parsing exceptions
        #     elem_main_part = ET.fromstring(main_part_xml)
        #     #print(elem_main_part.tag)
        #     # Save XML tag of main part to determine actual format
        #     ftg.main_part_xmltag = elem_main_part.tag
        # else:
        #     # TODO: log error, raise anomaly (or maybe it's the case for XPS?)
        #     return False
        if main_part is None:
            # just warn but do not raise an exception. This might be just
            # another strange data type out there that we do not understand
            # yet. Return False so file type will stay FType_Generic_OpenXML
            log.warning('Failed to find any known relationship in OpenXML-file')
            # TODO: here we should recognize a generic OpenXML type instead of returning False
            return False

        # parse content types, find content type of main part
        try:
            content_types = ftg.zipfile.read('[Content_Types].xml')
        except RuntimeError:
            return False
        # parse the XML content
        # TODO: handle XML parsing exceptions
        elem_ctypes = ET.fromstring(content_types)
        ctypes_ext = {}
        ctypes_part = {}
        for elem_ext in elem_ctypes.iter(tag = TAG_CTYPES_DEFAULT):
            extension = elem_ext.get('Extension')
            content_type = elem_ext.get('ContentType')
            # print('Ext: %s => Content-type: %s' % (extension, content_type))
            if extension is not None and content_type is not None:
                ctypes_ext[extension] = content_type
        for elem_part in elem_ctypes.iter(tag = TAG_CTYPES_OVERRIDE):
            partname = elem_part.get('PartName')
            # remove leading slash if present
            partname = partname.lstrip('/')
            content_type = elem_part.get('ContentType')
            # print('Part: %s => Content-type: %s' % (partname, content_type))
            if partname is not None and content_type is not None:
                ctypes_part[partname] = content_type
        # find content-type of the main part, first by part name, second by extension:
        main_part_content_type = None
        if main_part in ctypes_part:
            main_part_content_type = ctypes_part[main_part]
        else:
            # extract extension from part name, without leading dot
            main_part_ext = os.path.splitext(main_part)[1][1:]
            if main_part_ext in ctypes_ext:
                main_part_content_type = ctypes_ext[main_part_ext]
        ftg.main_part_content_type = main_part_content_type
        log.debug('Main part content-type: %s' % main_part_content_type)
        return True


# --- WORD Formats ---

class FType_Word(FType_Base):
    '''Base class for all MS Word file types'''
    application = APP.MSWORD
    name = 'MS Word (generic)'
    longname = 'MS Word Document or Template (generic)'

class FType_Word97(FType_OLE_CLSID_Base, FType_Word):
    application = APP.MSWORD
    filetype = FTYPE.WORD97
    name = 'MS Word 97 Document'
    longname = 'MS Word 97-2003 Document or Template'
    CLSIDS = ('00020906-0000-0000-C000-000000000046',)
    extensions = ['doc', 'dot']
    content_types = ['application/msword']
    PUID = 'fmt/40'
    may_contain_vba = True
    may_contain_ole = True
    # TODO: if no CLSID, check stream 'WordDocument'

class FType_Word6(FType_OLE_CLSID_Base, FType_Word):
    application = APP.MSWORD
    filetype = FTYPE.WORD6
    name = 'MS Word 6 Document'
    longname = 'MS Word 6-7 Document or Template'
    CLSIDS = ('00020900-0000-0000-C000-000000000046',)
    extensions = ['doc', 'dot']
    content_types = ['application/msword']
    PUID = 'fmt/39'
    may_contain_ole = True

class FType_Word2007_Base(FType_Generic_OpenXML, FType_Word):
    application = APP.MSWORD
    name = 'MS Word 2007+ File'
    longname = 'MS Word 2007+ File (.doc?)'


class FType_Word2007(FType_Word2007_Base):
    application = APP.MSWORD
    filetype = FTYPE.WORD2007_DOCX
    name = 'MS Word 2007+ Document'
    longname = 'MS Word 2007+ Document (.docx)'
    extensions = ['docx']

class FType_Word2007_Macro(FType_Word2007_Base):
    application = APP.MSWORD
    filetype = FTYPE.WORD2007_DOCM
    name = 'MS Word 2007+ Macro-Enabled Document'
    longname = 'MS Word 2007+ Macro-Enabled Document (.docm)'
    extensions = ['docm']

class FType_Word2007_Template(FType_Word2007_Base):
    application = APP.MSWORD
    filetype = FTYPE.WORD2007_DOTX
    name = 'MS Word 2007+ Template'
    longname = 'MS Word 2007+ Template (.dotx)'
    extensions = ['dotx']

class FType_Word2007_Template_Macro(FType_Word2007_Base):
    application = APP.MSWORD
    filetype = FTYPE.WORD2007_DOTM
    name = 'MS Word 2007+ Macro-Enabled Template'
    longname = 'MS Word 2007+ Macro-Enabled Template (.dotm)'
    extensions = ['dotm']

# --- EXCEL Formats ---

class FType_Excel(FType_Base):
    '''Base class for all MS Excel file types'''
    application = APP.MSEXCEL
    name = 'MS Excel (generic)'
    longname = 'MS Excel Workbook/Template/Add-in (generic)'

class FType_Excel97(FType_Excel, FType_Generic_OLE):
    filetype = FTYPE.EXCEL97
    name = 'MS Excel 97 Workbook'
    longname = 'MS Excel 97-2003 Workbook or Template'
    CLSIDS = ('00020820-0000-0000-C000-000000000046',)
    extensions = ['xls', 'xlt', 'xla']
    # TODO: if no CLSID, check stream 'Workbook' or 'Book' (maybe Excel 5)

class FType_Excel5(FType_Excel, FType_Generic_OLE):
    filetype = FTYPE.EXCEL5
    name = 'MS Excel 5.0/95 Workbook'
    longname = 'MS Excel 5.0/95 Workbook, Template or Add-in'
    CLSIDS = ('00020810-0000-0000-C000-000000000046',)
    extensions = ['xls', 'xlt', 'xla']
    # TODO: this CLSID is also used in Excel addins (.xla) saved by MS Excel 365

class FType_Excel2007(FType_Excel, FType_Generic_OpenXML):
    '''Base class for all MS Excel 2007 file types'''
    name = 'MS Excel 2007+ (generic)'
    longname = 'MS Excel 2007+ Workbook or Template (generic)'
    content_types = ('application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',)
      # note: content type differs only for xlsm

class FType_Excel2007_XLSX (FType_Excel2007):
    filetype = FTYPE.EXCEL2007_XLSX
    name = 'MS Excel 2007+ Workbook'
    longname = 'MS Excel 2007+ Workbook (.xlsx)'
    extensions = ['xlsx']
    PUID = 'fmt/214'

class FType_Excel2007_XLSM (FType_Excel2007):
    filetype = FTYPE.EXCEL2007_XLSM
    name = 'MS Excel 2007+ Macro-Enabled Workbook'
    longname = 'MS Excel 2007+ Macro-Enabled Workbook (.xlsm)'
    extensions = ['xlsm']
    content_types = ('application/vnd.ms-excel.sheet.macroEnabled.12',)
    PUID = 'fmt/445'

class FType_Excel2007_XLSB (FType_Excel2007):
    filetype = FTYPE.EXCEL2007_XLSB
    name = 'MS Excel 2007+ Binary Workbook'
    longname = 'MS Excel 2007+ Binary Workbook (.xlsb)'
    extensions = ['xlsb']
    content_types = ('application/vnd.ms-excel.sheet.binary.macroEnabled.12',)
    PUID = 'fmt/595'

class FType_Excel2007_Template(FType_Excel2007):
    filetype = FTYPE.EXCEL2007_XLTX
    name = 'MS Excel 2007+ Template'
    longname = 'MS Excel 2007+ Template (.xltx)'
    extensions = ['xltx']

class FType_Excel2007_Template_Macro(FType_Excel2007):
    filetype = FTYPE.EXCEL2007_XLTM
    name = 'MS Excel 2007+ Macro-Enabled Template'
    longname = 'MS Excel 2007+ Macro-Enabled Template (.xltm)'
    extensions = ['xltm']

class FType_Excel2007_Addin_Macro(FType_Excel2007):
    filetype = FTYPE.EXCEL2007_XLAM
    name = 'MS Excel 2007+ Macro-Enabled Add-in'
    longname = 'MS Excel 2007+ Macro-Enabled Add-in (.xlam)'
    extensions = ['xlam']

# --- POWERPOINT Formats ---

class FType_Powerpoint(FType_Base):
    '''Base class for all MS Powerpoint file types'''
    application = APP.MSPOWERPOINT
    name = 'MS Powerpoint (generic)'
    longname = 'MS Powerpoint Presentation/Slideshow/Template/Addin/... (generic)'

class FType_Powerpoint97(FType_Powerpoint, FType_Generic_OLE):
    # see also: ppt_record_parser.is_ppt
    filetype = FTYPE.POWERPOINT97
    name = 'MS Powerpoint 97 Presentation'
    longname = 'MS Powerpoint 97-2003 Presentation/Slideshow/Template'
    CLSIDS = ('64818D10-4F9B-11CF-86EA-00AA00B929E8',)
    extensions = ['ppt', 'pps', 'pot']

class FType_Powerpoint2007(FType_Powerpoint, FType_Generic_OpenXML):
    '''Base class for all MS Powerpoint 2007 file types'''
    filetype = FTYPE.POWERPOINT2007
    name = 'MS Powerpoint 2007+ (generic)'
    longname = 'MS Powerpoint 2007+ Presentation/Slideshow/Template (generic)'
    content_types = ('application/vnd.openxmlformats-officedocument.presentationml.presentation',)

class FType_Powerpoint2007_Presentation(FType_Powerpoint2007):
    filetype = FTYPE.POWERPOINT2007_PPTX
    name = 'MSPointpoint 2007+ Presentation'
    longname = 'MSPointpoint 2007+ Presentation (.pptx)'
    extensions = ['pptx']

class FType_Powerpoint2007_Slideshow(FType_Powerpoint2007):
    filetype = FTYPE.POWERPOINT2007_PPSX
    name = 'MSPointpoint 2007+ Slideshow'
    longname = 'MSPointpoint 2007+ Slideshow (.ppsx)'
    extensions = ['ppsx']

class FType_Powerpoint2007_Macro(FType_Powerpoint2007):
    filetype = FTYPE.POWERPOINT2007_PPTM
    name = 'MSPointpoint 2007+ Macro-Enabled Presentation'
    longname = 'MSPointpoint 2007+ Macro-Enabled Presentation (.pptm)'
    extensions = ['pptm']

class FType_Powerpoint2007_Slideshow_Macro(FType_Powerpoint2007):
    filetype = FTYPE.POWERPOINT2007_PPSM
    name = 'MSPointpoint 2007+ Macro-Enabled Slideshow'
    longname = 'MSPointpoint 2007+ Macro-Enabled Slideshow (.ppsm)'
    extensions = ['ppsm']


class FType_XPS(FType_Generic_OpenXML):
    application = APP.WINDOWS
    filetype = FTYPE.XPS
    name = 'XPS'
    longname = 'Fixed-Page Document (.xps)',
    extensions = ['xps']


# TODO: for PPT, check for stream 'PowerPoint Document'
# TODO: for Visio, check for stream 'VisioDocument'

clsid_ftypes = {
    # mapping from CLSID of root storage to FType classes:
    # TODO: do not repeat magic numbers, import from oletools.common.clsid
    # WORD
    '00020906-0000-0000-C000-000000000046': FType_Word97,
    '00020900-0000-0000-C000-000000000046': FType_Word6,
    # EXCEL
    '00020820-0000-0000-C000-000000000046': FType_Excel97,
    '00020810-0000-0000-C000-000000000046': FType_Excel5,
    # POWERPOINT
    '64818D10-4F9B-11CF-86EA-00AA00B929E8': FType_Powerpoint97,
}

openxml_ftypes = {
    # mapping from content-type of main part to FType classes:
    # WORD
    'application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml': FType_Word2007,
    'application/vnd.ms-word.document.macroEnabled.main+xml': FType_Word2007_Macro,
    'application/vnd.openxmlformats-officedocument.wordprocessingml.template.main+xml': FType_Word2007_Template,
    'application/vnd.ms-word.template.macroEnabledTemplate.main+xml': FType_Word2007_Template_Macro,
    # EXCEL
    'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet.main+xml': FType_Excel2007_XLSX,
    'application/vnd.ms-excel.sheet.macroEnabled.main+xml': FType_Excel2007_XLSM,
    'application/vnd.ms-excel.sheet.binary.macroEnabled.main': FType_Excel2007_XLSB,
    'application/vnd.openxmlformats-officedocument.spreadsheetml.template.main+xml': FType_Excel2007_Template,
    'application/vnd.ms-excel.template.macroEnabled.main+xml': FType_Excel2007_Template_Macro,
    'application/vnd.ms-excel.addin.macroEnabled.main+xml': FType_Excel2007_Addin_Macro,
    # POWERPOINT
    'application/vnd.openxmlformats-officedocument.presentationml.presentation.main+xml': FType_Powerpoint2007_Presentation,
    'application/vnd.openxmlformats-officedocument.presentationml.slideshow.main+xml': FType_Powerpoint2007_Slideshow,
    'application/vnd.ms-powerpoint.presentation.macroEnabled.main+xml': FType_Powerpoint2007_Macro,
    'application/vnd.ms-powerpoint.slideshow.macroEnabled.main+xml': FType_Powerpoint2007_Slideshow_Macro,
    # XPS
    'application/vnd.ms-package.xps-fixeddocumentsequence+xml': FType_XPS,
}


class FType_EXE_PE (FType_Base):
    filetype = FTYPE.EXE_PE
    container = CONTAINER.BINARY
    application = APP.WINDOWS
    name = "Windows PE Executable or DLL"
    longname = "Windows Portable Executable or DLL (EXE,DLL)"
    extensions = ('exe', 'dll', 'sys', 'scr')  # TODO: add more from https://en.wikipedia.org/wiki/Portable_Executable
    content_types = ('application/vnd.microsoft.portable-executable',)
    PUID = 'fmt/899'

    @classmethod
    def recognize(cls, ftg):
        return True if ftg.data.startswith(b'MZ') else False
        # TODO: make this more accurate by checking the PE header, e.g. using pefile or directly

class FileTypeGuesser(object):
    """
    A class to guess the type of a file, focused on MS Office, RTF and ZIP.
    """

    def __init__(self, filepath=None, data=None):
        self.filepath = filepath
        self.data = data
        self.container = None
        self.application = None
        self.filetype = None
        self.ftype = FType_Unknown  # FType class
        self.data_bytesio = None
        # For OLE:
        self.olefile = None
        self.root_clsid = None
        self.root_clsid_name = None
        # For ZIP:
        self.zipfile = None
        # For OpenXML:
        self.root_rels = None
        # self.main_part_xmltag = None
        self.main_part_content_type = None
        # For XML:
        self.root_xmltag = None
        self.xmlroot = None

        if filepath is None and data is None:
            raise ValueError('FileTypeGuesser requires either a file path or file data, or both')
        if data is None:
            with open(filepath, 'rb') as f:
                self.data = f.read()
        self.data_bytesio = io.BytesIO(self.data)

        # Identify the main container type:
        for ftype in (FType_RTF, FType_Generic_OLE, FType_Generic_Zip):
            if ftype.recognize(self):
                self.ftype = ftype
                break
        self.container = self.ftype.container
        self.filetype = self.ftype.filetype
        self.application = self.ftype.application

        # OLE file types:
        if self.container == CONTAINER.OLE:
            # for ftype in (FType_Word97, FType_Word6, FType_Excel97, FType_Excel5):
            #     if ftype.recognize(self):
            #         self.ftype = ftype
            #         break
            ft = clsid_ftypes.get(self.root_clsid, None)
            if ft is not None:
                self.ftype = ft

        # OpenXML file types:
        if self.container == CONTAINER.ZIP:
            if FType_Generic_OpenXML.recognize(self):
                self.ftype = FType_Generic_OpenXML
                ft = openxml_ftypes.get(self.main_part_content_type, None)
                if ft is not None:
                    self.ftype = ft

        # TODO: use a mapping from magic to file types
        if self.container == CONTAINER.UNKNOWN:
            if FType_EXE_PE.recognize(self):
                self.ftype = FType_EXE_PE

        self.container = self.ftype.container
        self.filetype = self.ftype.filetype
        self.application = self.ftype.application

    def __str__(self):
        """Give a short string representation of this object."""
        return '[FileTypeGuesser for {0}: {1} from {2} in {3}]'.format(
            "data" if self.filepath is None
            else os.path.basename(self.filepath),
            self.filetype, self.application, self.container)

    def close(self):
        """
        This method must be called at the end of processing
        """
        # TODO: only close self.olefile if it was opened by ftguess
        if self.zipfile is not None:
            self.zipfile.close()

    def is_ole(self):
        """
        Shortcut to check if the container is OLE
        :return: bool
        """
        return issubclass(self.ftype, FType_Generic_OLE) or self.container == CONTAINER.OLE

    def is_openxml(self):
        """
        Shortcut to check if the container is OpenXML
        :return: bool
        """
        return issubclass(self.ftype, FType_Generic_OpenXML) or self.container == CONTAINER.OpenXML

    def is_word(self):
        """
        Shortcut to check if a file is an Excel workbook, template or add-in
        :return: bool
        """
        return issubclass(self.ftype, FType_Word)

    def is_excel(self):
        """
        Shortcut to check if a file is an Excel workbook, template or add-in
        :return: bool
        """
        return issubclass(self.ftype, FType_Excel)

    def is_powerpoint(self):
        """
        Shortcut to check if a file is Powerpoint file of any kind
        :return: bool
        """
        return issubclass(self.ftype, FType_Powerpoint)


# === FUNCTIONS ==============================================================

def ftype_guess(filepath=None, data=None):
    return FileTypeGuesser(filepath, data)

def process_file(container, filename, data):
    print('File       : %s' % filename)
    ftg = ftype_guess(filepath=filename, data=data)
    print('File Type  : %s' % ftg.ftype.name)
    print('Description: %s' % ftg.ftype.longname)
    print('Application: %s' % ftg.ftype.application)
    print('Container  : %s' % ftg.container)
    if ftg.root_clsid is not None:
        print('Root CLSID : %s - %s' % (ftg.root_clsid, ftg.root_clsid_name))
    print('Content-type(s) : %s' % ','.join(ftg.ftype.content_types))
    print('PUID       : %s' % ftg.ftype.PUID)
    print()


#=== MAIN =================================================================

def main():
    # print banner with version
    python_version = '%d.%d.%d' % sys.version_info[0:3]
    print ('ftguess %s on Python %s - http://decalage.info/python/oletools' %
           (__version__, python_version))
    print ('THIS IS WORK IN PROGRESS - Check updates regularly!')
    print ('Please report any issue at https://github.com/decalage2/oletools/issues')
    print ('')

    DEFAULT_LOG_LEVEL = "warning" # Default log level
    LOG_LEVELS = {
        'debug':    logging.DEBUG,
        'info':     logging.INFO,
        'warning':  logging.WARNING,
        'error':    logging.ERROR,
        'critical': logging.CRITICAL
        }

    usage = 'usage: %prog [options] <filename> [filename2 ...]'
    parser = optparse.OptionParser(usage=usage)
    # parser.add_option('-c', '--csv', dest='csv',
    #     help='export results to a CSV file')
    parser.add_option("-r", action="store_true", dest="recursive",
        help='find files recursively in subdirectories.')
    parser.add_option("-z", "--zip", dest='zip_password', type='str', default=None,
        help='if the file is a zip archive, open first file from it, using the provided password')
    parser.add_option("-f", "--zipfname", dest='zip_fname', type='str', default='*',
        help='if the file is a zip archive, file(s) to be opened within the zip. Wildcards * and ? are supported. (default:*)')
    parser.add_option('-l', '--loglevel', dest="loglevel", action="store", default=DEFAULT_LOG_LEVEL,
                            help="logging level debug/info/warning/error/critical (default=%default)")

    (options, args) = parser.parse_args()

    # Print help if no arguments are passed
    if len(args) == 0:
        print (__doc__)
        parser.print_help()
        sys.exit()

    # Setup logging to the console:
    # here we use stdout instead of stderr by default, so that the output
    # can be redirected properly.
    logging.basicConfig(level=LOG_LEVELS[options.loglevel], stream=sys.stdout,
                        format='%(levelname)-8s %(message)s')
    # enable logging in the modules:
    enable_logging()

    for container, filename, data in xglob.iter_files(args, recursive=options.recursive,
        zip_password=options.zip_password, zip_fname=options.zip_fname):
        # ignore directory names stored in zip files:
        if container and filename.endswith('/'):
            continue
        process_file(container, filename, data)


if __name__ == '__main__':
    main()
