#!/usr/bin/env python
"""
rtfobj.py

rtfobj is a Python module to extract embedded objects from RTF files, such as
OLE objects. It can be used as a Python library or a command-line tool.

Usage: rtfobj.py [options] <file.rtf>

rtfobj project website: http://www.decalage.info/python/rtfobj

rtfobj is part of the python-oletools package:
http://www.decalage.info/python/oletools
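
A minimal sketch of library usage ('document.rtf' is a placeholder file name;
see process_file() below for the full logic):

    from oletools.rtfobj import RtfObjParser
    with open('document.rtf', 'rb') as f:
        data = f.read()
    rtfp = RtfObjParser(data)
    rtfp.parse()
    for rtfobj in rtfp.objects:
        if rtfobj.is_ole:
            print(rtfobj.format_id, rtfobj.class_name, rtfobj.oledata_size)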
"""

#=== LICENSE =================================================================

# rtfobj is copyright (c) 2012-2022, Philippe Lagadec (http://www.decalage.info)
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without modification,
# are permitted provided that the following conditions are met:
#
#  * Redistributions of source code must retain the above copyright notice, this
#    list of conditions and the following disclaimer.
#  * Redistributions in binary form must reproduce the above copyright notice,
#    this list of conditions and the following disclaimer in the documentation
#    and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


#------------------------------------------------------------------------------
# CHANGELOG:
# 2012-11-09 v0.01 PL: - first version
# 2013-04-02 v0.02 PL: - fixed bug in main
# 2015-12-09 v0.03 PL: - configurable logging, CLI options
#                      - extract OLE 1.0 objects
#                      - extract files from OLE Package objects
# 2016-04-01 v0.04 PL: - fixed logging output to use stdout instead of stderr
# 2016-04-07 v0.45 PL: - improved parsing to handle some malware tricks
# 2016-05-06 v0.47 TJ: - added option -d to set the output directory
#                        (contribution by Thomas Jarosch)
#                  TJ: - sanitize filenames to avoid special characters
# 2016-05-29       PL: - improved parsing, fixed issue #42
# 2016-07-13 v0.50 PL: - new RtfParser and RtfObjParser classes
# 2016-07-18       SL: - added Python 3.5 support
# 2016-07-19       PL: - fixed Python 2.6-2.7 support
# 2016-07-30       PL: - new API with class RtfObject
#                      - backward-compatible API rtf_iter_objects (fixed issue #70)
# 2016-07-31       PL: - table output with tablestream
# 2016-08-01       PL: - detect executable filenames in OLE Package
# 2016-08-08       PL: - added option -s to save objects to files
# 2016-08-09       PL: - fixed issue #78, improved regex
# 2016-09-06       PL: - fixed issue #83, backward compatible API
# 2016-11-17 v0.51 PL: - updated call to oleobj.OleNativeStream
# 2017-03-12       PL: - fixed imports for Python 2+3
#                      - fixed hex decoding bug in RtfObjParser (issue #103)
# 2017-03-29       PL: - fixed RtfParser to handle issue #152 (control word with
#                        long parameter)
# 2017-04-11       PL: - added detection of the OLE2Link vulnerability CVE-2017-0199
# 2017-05-04       PL: - fixed issue #164 to handle linked OLE objects
# 2017-06-08       PL: - fixed issue/PR #143: bin object with negative length
# 2017-06-29       PL: - temporary fix for issue #178
# 2017-07-14 v0.52 PL: - disabled logging of each control word (issue #184)
# 2017-07-24       PL: - fixed call to RtfParser._end_of_file (issue #185)
#                      - ignore optional space after \bin (issue #185)
# 2017-09-06       PL: - fixed issue #196: \pxe is not a destination
# 2018-01-11       CH: - speedup RTF parsing (PR #244)
# 2018-02-01      JRM: - fixed issue #251: \bin without argument
# 2018-04-09       PL: - fixed issue #280: OLE Package were not detected on Python 3
# 2018-03-24 v0.53 PL: - fixed issue #292: \margSz is a destination
# 2018-04-27       PL: - extract and display the CLSID of OLE objects
# 2018-04-30       PL: - handle "\'" obfuscation trick - issue #281
# 2018-05-10       PL: - fixed issues #303 #307: several destination cwords were incorrect
# 2018-05-17       PL: - fixed issue #273: bytes constants instead of str
# 2018-05-31 v0.53.1 PP: - fixed issue #316: whitespace after \bin on Python 3
# 2018-06-22 v0.53.2 PL: - fixed issue #327: added "\pnaiu" & "\pnaiud"
# 2018-09-11 v0.54 PL: - olefile is now a dependency
# 2019-07-08 v0.55 MM: - added URL carver for CVE-2017-0199 (Equation Editor) PR #460
#                      - added SCT to the list of executable file extensions PR #461
# 2019-12-16 v0.55.2 PL: - \rtf is not a destination control word (issue #522)
# 2019-12-17         PL: - fixed process_file to detect Equation class (issue #525)
# 2021-05-06 v0.56.2 DD: - fixed bug when OLE package class name ends with null
#                          characters (issue #507, PR #648)
# 2021-05-23 v0.60   PL: - use ftguess to identify file type of OLE Package
#                        - fixed bug in re_executable_extensions
# 2021-06-03 v0.60.1 PL: - fixed code to find URLs in OLE2Link objects for Py3 (issue #692)

from __future__ import print_function

__version__ = '0.60.1'

# ------------------------------------------------------------------------------
# TODO:
# - allow semicolon within hex, as found in this sample:
#   http://contagiodump.blogspot.nl/2011/10/sep-28-cve-2010-3333-manuscript-with.html
# TODO: use OleObject and OleNativeStream in RtfObject instead of copying each attribute
# TODO: option -e <id> to extract an object, -e all for all objects
# TODO: option to choose which destinations to include (objdata by default)
# TODO: option to display SHA256 or MD5 hashes of objects in table


# === IMPORTS =================================================================

import re, os, sys, binascii, logging, optparse, hashlib
import os.path
from time import time

# IMPORTANT: it should be possible to run oletools directly as scripts
# in any directory without installing them with pip or setup.py.
# In that case, relative imports are NOT usable.
# And to enable Python 2+3 compatibility, we need to use absolute imports,
# so we add the oletools parent folder to sys.path (absolute+normalized path):
_thismodule_dir = os.path.normpath(os.path.abspath(os.path.dirname(__file__)))
# print('_thismodule_dir = %r' % _thismodule_dir)
_parent_dir = os.path.normpath(os.path.join(_thismodule_dir, '..'))
# print('_parent_dir = %r' % _parent_dir)
if _parent_dir not in sys.path:
    sys.path.insert(0, _parent_dir)

from oletools.thirdparty.xglob import xglob
from oletools.thirdparty.tablestream import tablestream
from oletools import oleobj, ftguess
import olefile
from oletools.common import clsid

# === LOGGING =================================================================

class NullHandler(logging.Handler):
    """
    Log Handler without output, to avoid printing messages if logging is not
    configured by the main application.
    Python 2.7 has logging.NullHandler, but this is necessary for 2.6:
    see https://docs.python.org/2.6/library/logging.html#configuring-logging-for-a-library
    """
    def emit(self, record):
        pass

def get_logger(name, level=logging.CRITICAL+1):
    """
    Create a suitable logger object for this module.
    The goal is not to change settings of the root logger, to avoid getting
    other modules' logs on the screen.
    If a logger already exists with the same name, reuse it (otherwise it would
    get duplicate handlers and messages would be doubled).
    The level is set to CRITICAL+1 by default, to avoid any logging.
    """
    # First, test if there is already a logger with the same name, else it
    # will generate duplicate messages (due to duplicate handlers):
    if name in logging.Logger.manager.loggerDict:
        #NOTE: another less intrusive but more "hackish" solution would be to
        # use getLogger then test if its effective level is not default.
        logger = logging.getLogger(name)
        # make sure level is OK:
        logger.setLevel(level)
        return logger
    # get a new logger:
    logger = logging.getLogger(name)
    # only add a NullHandler for this logger, it is up to the application
    # to configure its own logging:
    logger.addHandler(NullHandler())
    logger.setLevel(level)
    return logger

# a global logger object used for debugging:
log = get_logger('rtfobj')
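# NOTE: an application importing rtfobj as a module can enable its logging by
# configuring the root logger and resetting this logger's level, as main() does
# below, for example:
#     logging.basicConfig(level=logging.DEBUG, format='%(levelname)-8s %(message)s')
#     rtfobj.log.setLevel(logging.NOTSET)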


#=== CONSTANTS ================================================================

# REGEX pattern to extract embedded OLE objects in hexadecimal format:

# hex digit: [0-9A-Fa-f]
HEX_DIGIT = b'[0-9A-Fa-f]'

# hex char = two hex digits: [0-9A-Fa-f]{2}
# HEX_CHAR = r'[0-9A-Fa-f]{2}'
# in fact MS Word allows whitespaces in between the hex digits!
# HEX_CHAR = r'[0-9A-Fa-f]\s*[0-9A-Fa-f]'
# Even worse, MS Word also allows ANY RTF-style tag {*} in between!!
# AND the tags can be nested...
#SINGLE_RTF_TAG = r'[{][^{}]*[}]'
# Actually RTF tags may contain braces escaped with backslash (\{ \}):
SINGLE_RTF_TAG = b'[{](?:\\\\.|[^{}\\\\])*[}]'

# Nested tags, two levels (because Python's re does not support nested matching):
# NESTED_RTF_TAG = r'[{](?:[^{}]|'+SINGLE_RTF_TAG+r')*[}]'
NESTED_RTF_TAG = b'[{](?:\\\\.|[^{}\\\\]|'+SINGLE_RTF_TAG+b')*[}]'

# AND it is also allowed to insert ANY control word or control symbol (ignored)
# According to Rich Text Format (RTF) Specification Version 1.9.1,
# section "Control Word":
# control word = \<ASCII Letter [a-zA-Z] Sequence max 32><Delimiter>
# delimiter = space, OR signed integer followed by any non-digit,
#             OR any character except letter and digit
# examples of valid control words:
# "\AnyThing " "\AnyThing123z" ""\AnyThing-456{" "\AnyThing{"
# control symbol = \<any char except letter or digit> (followed by anything)

ASCII_NAME = b'([a-zA-Z]{1,250})'

# using Python's re lookahead assertion:
# (?=...) Matches if ... matches next, but doesn't consume any of the string.
# This is called a lookahead assertion. For example, Isaac (?=Asimov) will
# match 'Isaac ' only if it's followed by 'Asimov'.

# TODO: Find the actual limit on the number of digits for Word
# SIGNED_INTEGER = r'(-?\d{1,250})'
SIGNED_INTEGER = b'(-?\\d+)'

# Note for issue #78: need to match "\A-" not followed by digits (or the end of string)
CONTROL_WORD = b'(?:\\\\' + ASCII_NAME + b'(?:' + SIGNED_INTEGER + b'(?=[^0-9])|(?=[^a-zA-Z0-9])|$))'

re_control_word = re.compile(CONTROL_WORD)
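# For illustration (hypothetical input, not taken from a real sample):
# re_control_word.match(b'\\objdata123 ') captures group(1) == b'objdata' and
# group(2) == b'123'; the trailing delimiter itself is not consumed.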

# Note for issue #78: need to match "\" followed by digit (any non-alpha)
CONTROL_SYMBOL = b'(?:\\\\[^a-zA-Z])'
re_control_symbol = re.compile(CONTROL_SYMBOL)

# Text that is not a control word/symbol or a group:
TEXT = b'[^{}\\\\]+'
re_text = re.compile(TEXT)

# ignored whitespaces and tags within a hex block:
IGNORED = b'(?:\\s|'+NESTED_RTF_TAG+b'|'+CONTROL_SYMBOL+b'|'+CONTROL_WORD+b')*'
#IGNORED = r'\s*'

# HEX_CHAR = HEX_DIGIT + IGNORED + HEX_DIGIT

# several hex chars, at least 4: (?:[0-9A-Fa-f]{2}){4,}
# + word boundaries
# HEX_CHARS_4orMORE = r'\b(?:' + HEX_CHAR + r'){4,}\b'
# at least 1 hex char:
# HEX_CHARS_1orMORE = r'(?:' + HEX_CHAR + r')+'
# at least 1 hex char, followed by whitespace or CR/LF:
# HEX_CHARS_1orMORE_WHITESPACES = r'(?:' + HEX_CHAR + r')+\s+'
# + word boundaries around hex block
# HEX_CHARS_1orMORE_WHITESPACES = r'\b(?:' + HEX_CHAR + r')+\b\s*'
# at least one block of hex and whitespace chars, followed by closing curly bracket:
# HEX_BLOCK_CURLY_BRACKET = r'(?:' + HEX_CHARS_1orMORE_WHITESPACES + r')+\}'
# PATTERN = r'(?:' + HEX_CHARS_1orMORE_WHITESPACES + r')*' + HEX_CHARS_1orMORE

#TODO PATTERN = r'\b(?:' + HEX_CHAR + IGNORED + r'){4,}\b'
# PATTERN = r'\b(?:' + HEX_CHAR + IGNORED + r'){4,}' #+ HEX_CHAR + r'\b'
PATTERN = b'\\b(?:' + HEX_DIGIT + IGNORED + b'){7,}' + HEX_DIGIT + b'\\b'
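# For illustration (hypothetical samples, not taken from a real document): this
# pattern matches a plain hex run such as "d0cf11e0a1b11ae1", and also obfuscated
# runs where whitespace, nested {...} groups and control words are interleaved
# between the hex digits, e.g. the RTF text "d0 cf{\abc}11\xyz e0a1b11ae1".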

# at least 4 hex chars, followed by whitespace or CR/LF: (?:[0-9A-Fa-f]{2}){4,}\s*
# PATTERN = r'(?:(?:[0-9A-Fa-f]{2})+\s*)*(?:[0-9A-Fa-f]{2}){4,}'
# improved pattern, allowing semicolons within hex:
#PATTERN = r'(?:(?:[0-9A-Fa-f]{2})+\s*)*(?:[0-9A-Fa-f]{2}){4,}'

re_hexblock = re.compile(PATTERN)
re_embedded_tags = re.compile(IGNORED)
re_decimal = re.compile(b'\\d+')

re_delimiter = re.compile(b'[ \\t\\r\\n\\f\\v]')

DELIMITER = b'[ \\t\\r\\n\\f\\v]'
DELIMITERS_ZeroOrMore = b'[ \\t\\r\\n\\f\\v]*'
BACKSLASH_BIN = b'\\\\bin'
# According to my tests, Word accepts up to 250 digits (leading zeroes)
DECIMAL_GROUP = b'(\\d{1,250})'

re_delims_bin_decimal = re.compile(DELIMITERS_ZeroOrMore + BACKSLASH_BIN
                                   + DECIMAL_GROUP + DELIMITER)
re_delim_hexblock = re.compile(DELIMITER + PATTERN)

# TODO: use a frozenset instead of a regex?
re_executable_extensions = re.compile(
    r"(?i)\.(BAT|CLASS|CMD|CPL|DLL|EXE|COM|GADGET|HTA|INF|JAR|JS|JSE|LNK|MSC|MSI|MSP|PIF|PS1|PS1XML|PS2|PS2XML|PSC1|PSC2|REG|SCF|SCR|SCT|VB|VBE|VBS|WS|WSC|WSF|WSH)\b")

# Destination Control Words, according to MS RTF Specifications v1.9.1:
DESTINATION_CONTROL_WORDS = frozenset((
    b"aftncn", b"aftnsep", b"aftnsepc", b"annotation", b"atnauthor", b"atndate", b"atnid", b"atnparent", b"atnref",
    b"atrfend", b"atrfstart", b"author", b"background", b"bkmkend", b"bkmkstart", b"blipuid", b"buptim", b"category",
    b"colorschememapping", b"colortbl", b"comment", b"company", b"creatim", b"datafield", b"datastore", b"defchp", b"defpap",
    b"do", b"doccomm", b"docvar", b"dptxbxtext", b"ebcend", b"ebcstart", b"factoidname", b"falt", b"fchars", b"ffdeftext",
    b"ffentrymcr", b"ffexitmcr", b"ffformat", b"ffhelptext", b"ffl", b"ffname",b"ffstattext", b"field", b"file", b"filetbl",
    b"fldinst", b"fldrslt", b"fldtype", b"fontemb", b"fonttbl", b"footer", b"footerf", b"footerl",
    b"footerr", b"footnote", b"formfield", b"ftncn", b"ftnsep", b"ftnsepc", b"g", b"generator", b"gridtbl", b"header", b"headerf",
    b"headerl", b"headerr", b"hl", b"hlfr", b"hlinkbase", b"hlloc", b"hlsrc", b"hsv", b"info", b"keywords",
    b"latentstyles", b"lchars", b"levelnumbers", b"leveltext", b"lfolevel", b"linkval", b"list", b"listlevel", b"listname",
    b"listoverride", b"listoverridetable", b"listpicture", b"liststylename", b"listtable", b"listtext", b"lsdlockedexcept",
    b"macc", b"maccPr", b"mailmerge", b"malnScr", b"manager", b"margPr", b"mbar", b"mbarPr", b"mbaseJc", b"mbegChr",
    b"mborderBox", b"mborderBoxPr", b"mbox", b"mboxPr", b"mchr", b"mcount", b"mctrlPr", b"md", b"mdeg", b"mdegHide", b"mden",
    b"mdiff", b"mdPr", b"me", b"mendChr", b"meqArr", b"meqArrPr", b"mf", b"mfName", b"mfPr", b"mfunc", b"mfuncPr",b"mgroupChr",
    b"mgroupChrPr",b"mgrow", b"mhideBot", b"mhideLeft", b"mhideRight", b"mhideTop", b"mlim", b"mlimLoc", b"mlimLow",
    b"mlimLowPr", b"mlimUpp", b"mlimUppPr", b"mm", b"mmaddfieldname", b"mmathPict", b"mmaxDist", b"mmc",
    b"mmcJc", b"mmconnectstr", b"mmconnectstrdata", b"mmcPr", b"mmcs", b"mmdatasource", b"mmheadersource", b"mmmailsubject",
    b"mmodso", b"mmodsofilter", b"mmodsofldmpdata", b"mmodsomappedname", b"mmodsoname", b"mmodsorecipdata", b"mmodsosort",
    b"mmodsosrc", b"mmodsotable", b"mmodsoudl", b"mmodsoudldata", b"mmodsouniquetag", b"mmPr", b"mmquery", b"mmr", b"mnary",
    b"mnaryPr", b"mnoBreak", b"mnum", b"mobjDist", b"moMath", b"moMathPara", b"moMathParaPr", b"mopEmu", b"mphant", b"mphantPr",
    b"mplcHide", b"mpos", b"mr", b"mrad", b"mradPr", b"mrPr", b"msepChr", b"mshow", b"mshp", b"msPre", b"msPrePr", b"msSub",
    b"msSubPr", b"msSubSup", b"msSubSupPr",  b"msSup", b"msSupPr", b"mstrikeBLTR", b"mstrikeH", b"mstrikeTLBR", b"mstrikeV",
    b"msub", b"msubHide", b"msup", b"msupHide", b"mtransp", b"mtype", b"mvertJc", b"mvfmf", b"mvfml", b"mvtof", b"mvtol",
    b"mzeroAsc", b"mzeroDesc", b"mzeroWid", b"nesttableprops", b"nonesttables", b"objalias", b"objclass",
    b"objdata", b"object", b"objname", b"objsect", b"oldcprops", b"oldpprops", b"oldsprops", b"oldtprops",
    b"oleclsid", b"operator", b"panose", b"password", b"passwordhash", b"pgp", b"pgptbl", b"picprop", b"pict", b"pn", b"pnseclvl",
    b"pntext", b"pntxta", b"pntxtb", b"printim",
    b"propname", b"protend", b"protstart", b"protusertbl",
    b"result", b"revtbl", b"revtim",
    # \rtf should not be treated as a destination (issue #522)
    #b"rtf",
    b"rxe", b"shp", b"shpgrp", b"shpinst", b"shppict", b"shprslt", b"shptxt",
    b"sn", b"sp", b"staticval", b"stylesheet", b"subject", b"sv", b"svb", b"tc", b"template", b"themedata", b"title", b"txe", b"ud",
    b"upr", b"userprops", b"wgrffmtfilter", b"windowcaption", b"writereservation", b"writereservhash", b"xe", b"xform",
    b"xmlattrname", b"xmlattrvalue", b"xmlclose", b"xmlname", b"xmlnstbl", b"xmlopen",
    # added for issue #292: https://github.com/decalage2/oletools/issues/292
    b"margSz",
    # added for issue #327:
    b"pnaiu", b"pnaiud",

    # It seems \private should not be treated as a destination (issue #178)
    # Same for \pxe (issue #196)
    # b"private", b"pxe",
    # from issue #303: These destination control words can be treated as a "value" type.
    # They don't consume data so they won't change the state of the parser.
    # b"atnicn", b"atntime", b"fname", b"fontfile", b"htmltag", b"keycode", b"maln",
    # b"mhtmltag", b"mmath", b"mmathPr", b"nextfile", b"objtime", b"rsidtbl",
    ))


# indexing into the RTF data (bytes) returns 1-character strings on Python 2.x,
# but integers on Python 3.x, so the comparison constants differ:
if sys.version_info[0] <= 2:
    # Python 2.x - Characters (str)
    BACKSLASH = '\\'
    BRACE_OPEN = '{'
    BRACE_CLOSE = '}'
    UNICODE_TYPE = unicode
else:
    # Python 3.x - Integers
    BACKSLASH = ord('\\')
    BRACE_OPEN = ord('{')
    BRACE_CLOSE = ord('}')
    UNICODE_TYPE = str

RTF_MAGIC = b'\x7b\\rt'   # \x7b == b'{' but does not mess up auto-indent


def duration_str(duration):
    """ create a human-readable string representation of duration [s] """
    value = duration
    unit = 's'
    if value > 90:
        value /= 60.
        unit = 'min'
        if value > 90:
            value /= 60.
            unit = 'h'
            if value > 72:
                value /= 24.
                unit = 'days'
    return '{0:.1f}{1}'.format(value, unit)


#=== CLASSES =================================================================

class Destination(object):
    """
    Stores the data associated with a destination control word
    """
    def __init__(self, cword=None):
        self.cword = cword
        self.data = b''
        self.start = None
        self.end = None
        self.group_level = 0


# class Group(object):
#     """
#     Stores the data associated with a group between braces {...}
#     """
#     def __init__(self, cword=None):
#         self.start = None
#         self.end = None
#         self.level = None



class RtfParser(object):
    """
    Very simple but robust generic RTF parser, designed to handle
    malformed malicious RTF as MS Word does
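
    Subclasses may override the callback methods defined below (open_group,
    close_group, open_destination, close_destination, control_word,
    control_symbol, text, bin, end_of_file) to process the parsed elements.
    A minimal sketch (MyParser is a hypothetical subclass):

        class MyParser(RtfParser):
            def control_word(self, matchobject, cword, param):
                print(cword, param)

        MyParser(rtf_data).parse()   # rtf_data: bytes of an RTF file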
    """

    def __init__(self, data):
        """
        RtfParser constructor.
        
        :param data: bytes object containing the RTF data to be parsed 
        """
        self.data = data
        self.index = 0
        self.size = len(data)
        self.group_level = 0
        # default destination for the document text:
        document_destination = Destination()
        self.destinations = [document_destination]
        self.current_destination = document_destination

    def _report_progress(self, start_time):
        """ report progress on parsing at regular intervals """
        now = float(time())
        if now == start_time or self.size == 0:
            return   # avoid zero-division
        percent_done = 100. * self.index / self.size
        time_per_index = (now - start_time) / float(self.index)
        finish_estim = float(self.size - self.index) * time_per_index

        log.debug('After {0} finished {1:4.1f}% of current file ({2} bytes); '
                  'will finish in approx {3}'
                  .format(duration_str(now-start_time), percent_done,
                          self.size, duration_str(finish_estim)))

    def parse(self):
        """
        Parse the RTF data
        
        :return: nothing
        """
        # Start at beginning of data
        self.index = 0
        start_time = time()
        last_report = start_time
        # Loop until the end
        while self.index < self.size:
            if time() - last_report > 15:     # report every 15s
                self._report_progress(start_time)
                last_report = time()
            if self.data[self.index] == BRACE_OPEN:
                # Found an opening brace "{": Start of a group
                self._open_group()
                self.index += 1
                continue
            if self.data[self.index] == BRACE_CLOSE:
                # Found a closing brace "}": End of a group
                self._close_group()
                self.index += 1
                continue
            if self.data[self.index] == BACKSLASH:
                # Found a backslash "\": Start of a control word or control symbol
                # Use a regex to extract the control word name if present:
                # NOTE: the full length of the control word + its optional integer parameter
                # is limited by MS Word at 253 characters, so we have to run the regex
                # on a cropped string:
                data_cropped = self.data[self.index:self.index+254]
                # append a space so that the regex can check the following character:
                data_cropped += b' '
                # m = re_control_word.match(self.data, self.index, self.index+253)
                m = re_control_word.match(data_cropped)
                if m:
                    cword = m.group(1)
                    param = None
                    if len(m.groups()) > 1:
                        param = m.group(2)
                    # log.debug('control word at index %Xh - cword=%r param=%r  %r' % (self.index, cword, param, m.group()))
                    self._control_word(m, cword, param)
                    self.index += len(m.group())
                    # if it's \bin, call _bin after updating index
                    if cword == b'bin':
                        self._bin(m, param)
                    continue
                # Otherwise, it may be a control symbol:
                m = re_control_symbol.match(self.data, self.index)
                if m:
                    self.control_symbol(m)
                    self.index += len(m.group())
                    continue
            # Otherwise, this is plain text:
            # Use a regex to match all characters until the next brace or backslash:
            m = re_text.match(self.data, self.index)
            if m:
                self._text(m)
                self.index += len(m.group())
                continue
            raise RuntimeError('Should not have reached this point - index=%Xh' % self.index)
        # call _end_of_file to make sure all groups are closed properly
        self._end_of_file()


    def _open_group(self):
        self.group_level += 1
        #log.debug('{ Open Group at index %Xh - level=%d' % (self.index, self.group_level))
        # call user method AFTER increasing the level:
        self.open_group()

    def open_group(self):
        #log.debug('open group at index %Xh' % self.index)
        pass

    def _close_group(self):
        #log.debug('} Close Group at index %Xh - level=%d' % (self.index, self.group_level))
        # call user method BEFORE decreasing the level:
        self.close_group()
        # if the destination level is the same as the group level, close the destination:
        if self.group_level == self.current_destination.group_level:
            # log.debug('Current Destination %r level = %d => Close Destination' % (
            #     self.current_destination.cword, self.current_destination.group_level))
            self._close_destination()
        else:
            # log.debug('Current Destination %r level = %d => Continue with same Destination' % (
            #     self.current_destination.cword, self.current_destination.group_level))
            pass
        self.group_level -= 1
        # log.debug('Decreased group level to %d' % self.group_level)

    def close_group(self):
        #log.debug('close group at index %Xh' % self.index)
        pass

    def _open_destination(self, matchobject, cword):
        # if the current destination is at the same group level, close it first:
        if self.current_destination.group_level == self.group_level:
            self._close_destination()
        new_dest = Destination(cword)
        new_dest.group_level = self.group_level
        self.destinations.append(new_dest)
        self.current_destination = new_dest
        # start of the destination is right after the control word:
        new_dest.start = self.index + len(matchobject.group())
        # log.debug("Open Destination %r start=%Xh - level=%d" % (cword, new_dest.start, new_dest.group_level))
        # call the corresponding user method for additional processing:
        self.open_destination(self.current_destination)

    def open_destination(self, destination):
        pass

    def _close_destination(self):
        # log.debug("Close Destination %r end=%Xh - level=%d" % (self.current_destination.cword,
        #     self.index, self.current_destination.group_level))
        self.current_destination.end = self.index
        # call the corresponding user method for additional processing:
        self.close_destination(self.current_destination)
        if len(self.destinations) > 0:
            # remove the current destination from the stack, and go back to the previous one:
            self.destinations.pop()
        if len(self.destinations) > 0:
            self.current_destination = self.destinations[-1]
        else:
            # log.debug('All destinations are closed, keeping the document destination open')
            pass

    def close_destination(self, destination):
        pass

    def _control_word(self, matchobject, cword, param):
        #log.debug('control word %r at index %Xh' % (matchobject.group(), self.index))
        # TODO: according to RTF specs v1.9.1, "Destination changes are legal only immediately after an opening brace ({)"
        # (not counting the special control symbol \*, of course)
        if cword in DESTINATION_CONTROL_WORDS:
            log.debug('%r is a destination control word: starting a new destination at index %Xh' % (cword, self.index))
            self._open_destination(matchobject, cword)
        # call the corresponding user method for additional processing:
        self.control_word(matchobject, cword, param)

    def control_word(self, matchobject, cword, param):
        pass

    def control_symbol(self, matchobject):
        #log.debug('control symbol %r at index %Xh' % (matchobject.group(), self.index))
        pass

    def _text(self, matchobject):
        text = matchobject.group()
        self.current_destination.data += text
        self.text(matchobject, text)

    def text(self, matchobject, text):
        #log.debug('text %r at index %Xh' % (matchobject.group(), self.index))
        pass

    def _bin(self, matchobject, param):
        if param is None:
            log.info('Detected anti-analysis trick: \\bin object without length at index %X' % self.index)
            binlen = 0
        else:
            binlen = int(param)
        # handle negative length
        if binlen < 0:
            log.info('Detected anti-analysis trick: \\bin object with negative length at index %X' % self.index)
            # binlen = int(param.strip('-'))
            # According to my tests, if the bin length is negative,
            # it should be treated as a null length:
            binlen = 0
        # ignore optional space after \bin
        # (compare the 1-byte slice directly, which also avoids an error at end of data)
        if self.data[self.index:self.index + 1] == b' ':
            log.debug('\\bin: ignoring whitespace before data')
            self.index += 1
        log.debug('\\bin: reading %d bytes of binary data' % binlen)
        # TODO: handle length greater than data
        bindata = self.data[self.index:self.index + binlen]
        self.index += binlen
        self.bin(bindata)

    def bin(self, bindata):
        pass

    def _end_of_file(self):
        # log.debug('%Xh Reached End of File')
        # close any group/destination that is still open:
        while self.group_level > 0:
            log.debug('Group Level = %d, closing group' % self.group_level)
            self._close_group()
        self.end_of_file()

    def end_of_file(self):
        pass


class RtfObject(object):
    """
    An object or a file (OLE Package) embedded into an RTF document
    """
    def __init__(self):
        """
        RtfObject constructor
        """
        # start and end index in the RTF file:
        self.start = None
        self.end = None
        # raw object data encoded in hexadecimal, as found in the RTF file:
        self.hexdata = None
        # raw object data in binary form, decoded from hexadecimal
        self.rawdata = None
        # OLE object data (extracted from rawdata)
        self.is_ole = False
        self.oledata = None
        self.format_id = None
        self.class_name = None
        self.oledata_size = None
        # OLE Package data (extracted from oledata)
        self.is_package = False
        self.olepkgdata = None
        self.filename = None
        self.src_path = None
        self.temp_path = None
        self.ftg = None  # ftguess.FileTypeGuesser to identify file type
        # Additional OLE object data
        self.clsid = None
        self.clsid_desc = None




class RtfObjParser(RtfParser):
    """
    Specialized RTF parser to extract OLE objects
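
    After calling parse(), the extracted objects are available as a list of
    RtfObject instances in the objects attribute (see the module docstring for
    a short usage sketch).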
    """

    def __init__(self, data):
        super(RtfObjParser, self).__init__(data)
        # list of RtfObjects found
        self.objects = []

    def open_destination(self, destination):
        # TODO: detect when the destination is within an objdata, report as obfuscation
        if destination.cword == b'objdata':
            log.debug('*** Start object data at index %Xh' % destination.start)

    def close_destination(self, destination):
        if destination.cword == b'objdata':
            log.debug('*** Close object data at index %Xh' % self.index)
            rtfobj = RtfObject()
            self.objects.append(rtfobj)
            rtfobj.start = destination.start
            rtfobj.end = destination.end
            # Filter out all whitespaces first (just ignored):
            hexdata1 = destination.data.translate(None, b' \t\r\n\f\v')
            # Then filter out any other non-hex character:
            hexdata = re.sub(b'[^a-fA-F0-9]', b'', hexdata1)
            if len(hexdata) < len(hexdata1):
                # this is only for debugging:
                nonhex = re.sub(b'[a-fA-F0-9]', b'', hexdata1)
                log.debug('Found non-hex chars in hexdata: %r' % nonhex)
            # MS Word accepts an extra hex digit, so we need to trim it if present:
            if len(hexdata) & 1:
                log.debug('Odd length, trimmed last byte.')
                hexdata = hexdata[:-1]
            rtfobj.hexdata = hexdata
            object_data = binascii.unhexlify(hexdata)
            rtfobj.rawdata = object_data
            rtfobj.rawdata_md5 = hashlib.md5(object_data).hexdigest()                    
            # TODO: check if all hex data is extracted properly

            obj = oleobj.OleObject()
            try:
                obj.parse(object_data)
                rtfobj.format_id = obj.format_id
                rtfobj.class_name = obj.class_name
                rtfobj.oledata_size = obj.data_size
                rtfobj.oledata = obj.data
                rtfobj.oledata_md5 = hashlib.md5(obj.data).hexdigest()         
                rtfobj.is_ole = True
                if obj.class_name.lower().rstrip(b'\0') == b'package':
                    opkg = oleobj.OleNativeStream(bindata=obj.data,
                                                  package=True)
                    rtfobj.filename = opkg.filename
                    rtfobj.src_path = opkg.src_path
                    rtfobj.temp_path = opkg.temp_path
                    rtfobj.olepkgdata = opkg.data
                    rtfobj.olepkgdata_md5 = hashlib.md5(opkg.data).hexdigest()
                    # use ftguess to identify file type from content:
                    rtfobj.ftg = ftguess.FileTypeGuesser(data=rtfobj.olepkgdata)
                    rtfobj.is_package = True
                else:
                    if olefile.isOleFile(obj.data):
                        ole = olefile.OleFileIO(obj.data)
                        rtfobj.clsid = ole.root.clsid
                        rtfobj.clsid_desc = clsid.KNOWN_CLSIDS.get(rtfobj.clsid.upper(),
                            'unknown CLSID (please report at https://github.com/decalage2/oletools/issues)')
            except Exception:
                log.debug('*** Not an OLE 1.0 Object')

    def bin(self, bindata):
        if self.current_destination.cword == b'objdata':
            # TODO: keep track of this, because it is unusual and indicates potential obfuscation
            # trick: hexlify binary data, add it to hex data
            self.current_destination.data += binascii.hexlify(bindata)

    def control_word(self, matchobject, cword, param):
        # TODO: extract useful cwords such as objclass
        # TODO: keep track of cwords inside objdata, because it is unusual and indicates potential obfuscation
        # TODO: same with control symbols, and opening bracket
        # log.debug('- Control word "%s", param=%s, level=%d' % (cword, param, self.group_level))
        pass

    def control_symbol(self, matchobject):
        # log.debug('control symbol %r at index %Xh' % (matchobject.group(), self.index))
        symbol = matchobject.group()[1:2]
        if symbol == b"'":
            # read the two hex digits following "\'" - which can be any characters, not just hex digits
            # (because within an objdata destination, they are simply ignored)
            hexdigits = self.data[self.index+2:self.index+4]
            # print(hexdigits)
            # move the index two bytes forward
            self.index += 2
            if self.current_destination.cword == b'objdata':
                # Here's the tricky part: there is a bug in the MS Word RTF parser at least
                # until Word 2016, that removes the last hex digit before the \'hh control
                # symbol, ONLY IF the number of hex digits read so far is odd.
                # So to emulate that bug, we have to clean the data read so far by keeping
                # only the hex digits:
                # Filter out any non-hex character:
                self.current_destination.data = re.sub(b'[^a-fA-F0-9]', b'', self.current_destination.data)
                if len(self.current_destination.data) & 1 == 1:
                    # If the number of hex digits is odd, remove the last one:
                    self.current_destination.data = self.current_destination.data[:-1]


#=== FUNCTIONS ===============================================================

def rtf_iter_objects(filename, min_size=32):
    """
    [DEPRECATED] Backward-compatible API, for applications using the old rtfobj:
    Open an RTF file, extract each embedded object encoded in hexadecimal, and
    yield the index of the object in the RTF file, its original length in the
    RTF file, and the decoded object data in binary format.
    This is an iterator.

    :param filename: str, RTF file name/path to open on disk
    :param min_size: ignored, kept for backward compatibility
    :returns: iterator, yielding tuples (start index, original length, binary data)
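
    Example (sketch, 'document.rtf' is a placeholder):
        for index, orig_len, data in rtf_iter_objects('document.rtf'):
            print('found object of %d bytes at index %Xh' % (len(data), index))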
    """
    with open(filename, 'rb') as rtf_file:
        data = rtf_file.read()
    rtfp = RtfObjParser(data)
    rtfp.parse()
    for obj in rtfp.objects:
        orig_len = obj.end - obj.start
        yield obj.start, orig_len, obj.rawdata


def is_rtf(arg, treat_str_as_data=False):
    """ determine whether given file / stream / array represents an rtf file

    arg can be either a file name, a byte stream (positioned at the start), a
    list/tuple, or an iterable that contains bytes.

    For a str argument it is not clear whether it is a file name or the data
    itself (at least for py2 str, which is bytes). The treat_str_as_data
    argument disambiguates this.
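
    Examples (sketch; 'document.rtf' is a placeholder file name):
        is_rtf('document.rtf')                  # str: treated as a file name
        is_rtf(open('document.rtf', 'rb'))      # stream: checks the first bytes
        is_rtf(data, treat_str_as_data=True)    # str containing the data itself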
    """
    magic_len = len(RTF_MAGIC)
    if isinstance(arg, UNICODE_TYPE):
        with open(arg, 'rb') as reader:
            return reader.read(len(RTF_MAGIC)) == RTF_MAGIC
    if isinstance(arg, bytes) and not isinstance(arg, str):  # only in PY3
        return arg[:magic_len] == RTF_MAGIC
    if isinstance(arg, bytearray):
        return arg[:magic_len] == RTF_MAGIC
    if isinstance(arg, str):      # could be bytes, but we assume file name
        if treat_str_as_data:
            try:
                return arg[:magic_len].encode('ascii', errors='strict')\
                    == RTF_MAGIC
            except UnicodeError:
                return False
        else:
            with open(arg, 'rb') as reader:
                return reader.read(len(RTF_MAGIC)) == RTF_MAGIC
    if hasattr(arg, 'read'):      # a stream (i.e. file-like object)
        return arg.read(len(RTF_MAGIC)) == RTF_MAGIC
    if isinstance(arg, (list, tuple)):
        iter_arg = iter(arg)
    else:
        iter_arg = arg

    # check iterable
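    # (zip(RTF_MAGIC) yields one 1-tuple per magic byte, so the membership test
    # below works whether iterating arg yields ints (Python 3 bytes) or
    # single-character strings (Python 2 str))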
    for magic_byte in zip(RTF_MAGIC):
        try:
            if next(iter_arg) not in magic_byte:
                return False
        except StopIteration:
            return False

    return True  # checked the complete magic without returning False --> match


def sanitize_filename(filename, replacement='_', max_length=200):
    """compute basename of filename. Replaces all non-whitelisted characters.
       The returned filename is always a basename of the file."""
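    # example: sanitize_filename('evil:name?.exe') -> 'evil_name_.exe'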
    basepath = os.path.basename(filename).strip()
    sane_fname = re.sub(r'[^\w\.\- ]', replacement, basepath)

    while ".." in sane_fname:
        sane_fname = sane_fname.replace('..', '.')

    while "  " in sane_fname:
        sane_fname = sane_fname.replace('  ', ' ')

    # if the sanitized name is empty (e.g. empty or all-special-character input),
    # use a placeholder name:
    if not sane_fname:
        sane_fname = 'NONAME'

    # limit filename length
    if max_length:
        sane_fname = sane_fname[:max_length]

    return sane_fname


def process_file(container, filename, data, output_dir=None, save_object=False):
    if output_dir:
        if not os.path.isdir(output_dir):
            log.info('creating output directory %s' % output_dir)
            os.mkdir(output_dir)

        fname_prefix = os.path.join(output_dir,
                                    sanitize_filename(filename))
    else:
        base_dir = os.path.dirname(filename)
        sane_fname = sanitize_filename(filename)
        fname_prefix = os.path.join(base_dir, sane_fname)

    # TODO: option to extract objects to files (false by default)
    if data is None:
        with open(filename, 'rb') as rtf_file:
            data = rtf_file.read()
    print('='*79)
    print('File: %r - size: %d bytes' % (filename, len(data)))
    tstream = tablestream.TableStream(
        column_width=(3, 10, 63),
        header_row=('id', 'index', 'OLE Object'),
        style=tablestream.TableStyleSlim
    )
    rtfp = RtfObjParser(data)
    rtfp.parse()
    for rtfobj in rtfp.objects:
        ole_color = None
        if rtfobj.is_ole:
            ole_column = 'format_id: %d ' % rtfobj.format_id
            if rtfobj.format_id == oleobj.OleObject.TYPE_EMBEDDED:
                ole_column += '(Embedded)\n'
            elif rtfobj.format_id == oleobj.OleObject.TYPE_LINKED:
                ole_column += '(Linked)\n'
            else:
                ole_column += '(Unknown)\n'
            ole_column += 'class name: %r\n' % rtfobj.class_name
            # if the object is linked and not embedded, data_size=None:
            if rtfobj.oledata_size is None:
                ole_column += 'data size: N/A'
            else:
                ole_column += 'data size: %d' % rtfobj.oledata_size
            if rtfobj.is_package:
                ole_column += '\nOLE Package object:'
                ole_column += '\nFilename: %r' % rtfobj.filename
                ole_column += '\nSource path: %r' % rtfobj.src_path
                ole_column += '\nTemp path = %r' % rtfobj.temp_path
                ole_column += '\nMD5 = %r' % rtfobj.olepkgdata_md5
                ole_color = 'yellow'
                # check if the file extension is executable:

                _, temp_ext = os.path.splitext(rtfobj.temp_path)
                log.debug('Temp path extension: %r' % temp_ext)
                _, file_ext = os.path.splitext(rtfobj.filename)
                log.debug('File extension: %r' % file_ext)

                if temp_ext != file_ext:
                    ole_column += "\nMODIFIED FILE EXTENSION"

                if re_executable_extensions.match(temp_ext) or re_executable_extensions.match(file_ext):
                    ole_color = 'red'
                    ole_column += '\nEXECUTABLE FILE'
                ole_column += '\nFile Type: {}'.format(rtfobj.ftg.ftype.name)
            else:
                ole_column += '\nMD5 = %r' % rtfobj.oledata_md5
            if rtfobj.clsid is not None:
                ole_column += '\nCLSID: %s' % rtfobj.clsid
                ole_column += '\n%s' % rtfobj.clsid_desc
                if 'CVE' in rtfobj.clsid_desc:
                    ole_color = 'red'
            # Detect OLE2Link exploit
            # http://www.kb.cert.org/vuls/id/921560
            if rtfobj.class_name == b'OLE2Link':
                ole_color = 'red'
                ole_column += '\nPossibly an exploit for the OLE2Link vulnerability (VU#921560, CVE-2017-0199)\n'
                # https://bitbucket.org/snippets/Alexander_Hanel/7Adpp
                urls = []
                # We look for unicode strings of 3+ chars in the OLE object data:
                # Here the regex must be a bytes string (issue #692)
                # but Python 2.7 does not support rb'...' so we use b'...' and escape backslashes
                pat = re.compile(b'(?:[\\x20-\\x7E][\\x00]){3,}')
                words = [w.decode('utf-16le') for w in pat.findall(rtfobj.oledata)]
                for w in words:
                    # TODO: we could use the URL_RE regex from olevba to be more precise
                    if "http" in w:
                        urls.append(w)
                urls = sorted(set(urls))
                if urls:
                    ole_column += 'URL extracted: ' + ', '.join(urls)
            # Detect Equation Editor exploit
            # https://www.kb.cert.org/vuls/id/421280/
            elif rtfobj.class_name.lower().startswith(b'equation.3'):
                ole_color = 'red'
                ole_column += '\nPossibly an exploit for the Equation Editor vulnerability (VU#421280, CVE-2017-11882)'
        else:
            ole_column = 'Not a well-formed OLE object'
        tstream.write_row((
            rtfp.objects.index(rtfobj),
            # filename,
            '%08Xh' % rtfobj.start,
            ole_column
            ), colors=(None, None, ole_color)
        )
        tstream.write_sep()
    if save_object:
        if save_object == 'all':
            objects = rtfp.objects
        else:
            try:
                i = int(save_object)
                objects = [rtfp.objects[i]]
            except (ValueError, IndexError):
                log.error('The -s option must be followed by an object index or "all", such as "-s 2" or "-s all"')
                return
        for rtfobj in objects:
            # use the object's index in the full list, consistent with the table output:
            i = rtfp.objects.index(rtfobj)
            if rtfobj.is_package:
                print('Saving file from OLE Package in object #%d:' % i)
                print('  Filename = %r' % rtfobj.filename)
                print('  Source path = %r' % rtfobj.src_path)
                print('  Temp path = %r' % rtfobj.temp_path)
                if rtfobj.filename:
                    fname = '%s_%s' % (fname_prefix,
                                       sanitize_filename(rtfobj.filename))
                else:
                    fname = '%s_object_%08X.noname' % (fname_prefix, rtfobj.start)
                print('  saving to file %s' % fname)
                print('  md5 %s' % rtfobj.olepkgdata_md5)
                with open(fname, 'wb') as outfile:
                    outfile.write(rtfobj.olepkgdata)
            # When format_id=TYPE_LINKED, oledata_size=None
            elif rtfobj.is_ole and rtfobj.oledata_size is not None:
                print('Saving file embedded in OLE object #%d:' % i)
                print('  format_id  = %d' % rtfobj.format_id)
                print('  class name = %r' % rtfobj.class_name)
                print('  data size  = %d' % rtfobj.oledata_size)
                # set a file extension according to the class name:
                class_name = rtfobj.class_name.lower()
                if class_name.startswith(b'word'):
                    ext = 'doc'
                elif class_name.startswith(b'package'):
                    ext = 'package'
                else:
                    ext = 'bin'
                fname = '%s_object_%08X.%s' % (fname_prefix, rtfobj.start, ext)
                print('  saving to file %s' % fname)
                print('  md5 %s' % rtfobj.oledata_md5)
                with open(fname, 'wb') as outfile:
                    outfile.write(rtfobj.oledata)
            else:
                print('Saving raw data in object #%d:' % i)
                fname = '%s_object_%08X.raw' % (fname_prefix, rtfobj.start)
                print('  saving object to file %s' % fname)
                print('  md5 %s' % rtfobj.rawdata_md5)
                with open(fname, 'wb') as outfile:
                    outfile.write(rtfobj.rawdata)


#=== MAIN =================================================================

def main():
    # print banner with version
    python_version = '%d.%d.%d' % sys.version_info[0:3]
    print('rtfobj %s on Python %s - http://decalage.info/python/oletools' %
          (__version__, python_version))
    print('THIS IS WORK IN PROGRESS - Check updates regularly!')
    print('Please report any issue at https://github.com/decalage2/oletools/issues')
    print('')

    DEFAULT_LOG_LEVEL = "warning" # Default log level
    LOG_LEVELS = {
        'debug':    logging.DEBUG,
        'info':     logging.INFO,
        'warning':  logging.WARNING,
        'error':    logging.ERROR,
        'critical': logging.CRITICAL
        }

    usage = 'usage: %prog [options] <filename> [filename2 ...]'
    parser = optparse.OptionParser(usage=usage)
    # parser.add_option('-o', '--outfile', dest='outfile',
    #     help='output file')
    # parser.add_option('-c', '--csv', dest='csv',
    #     help='export results to a CSV file')
    parser.add_option("-r", action="store_true", dest="recursive",
        help='find files recursively in subdirectories.')
    parser.add_option("-z", "--zip", dest='zip_password', type='str', default=None,
        help='if the file is a zip archive, open first file from it, using the provided password (requires Python 2.6+)')
    parser.add_option("-f", "--zipfname", dest='zip_fname', type='str', default='*',
        help='if the file is a zip archive, file(s) to be opened within the zip. Wildcards * and ? are supported. (default:*)')
    parser.add_option('-l', '--loglevel', dest="loglevel", action="store", default=DEFAULT_LOG_LEVEL,
                            help="logging level debug/info/warning/error/critical (default=%default)")
    parser.add_option("-s", "--save", dest='save_object', type='str', default=None,
        help='Save the object corresponding to the provided number to a file, for example "-s 2". Use "-s all" to save all objects at once.')
    # parser.add_option("-o", "--outfile", dest='outfile', type='str', default=None,
    #     help='Filename to be used when saving an object to a file.')
    parser.add_option("-d", type="str", dest="output_dir",
        help='use specified directory to save output files.', default=None)
    # parser.add_option("--pkg", action="store_true", dest="save_pkg",
    #     help='Save OLE Package binary data of extracted objects (file embedded into an OLE Package).')
    # parser.add_option("--ole", action="store_true", dest="save_ole",
    #     help='Save OLE binary data of extracted objects (object data without the OLE container).')
    # parser.add_option("--raw", action="store_true", dest="save_raw",
    #     help='Save raw binary data of extracted objects (decoded from hex, including the OLE container).')
    # parser.add_option("--hex", action="store_true", dest="save_hex",
    #     help='Save raw hexadecimal data of extracted objects (including the OLE container).')


    (options, args) = parser.parse_args()

    # Print help if no arguments are passed
    if len(args) == 0:
        print(__doc__)
        parser.print_help()
        sys.exit()

    # Setup logging to the console:
    # here we use stdout instead of stderr by default, so that the output
    # can be redirected properly.
    logging.basicConfig(level=LOG_LEVELS[options.loglevel], stream=sys.stdout,
                        format='%(levelname)-8s %(message)s')
    # enable logging in the modules:
    log.setLevel(logging.NOTSET)
    oleobj.enable_logging()

    for container, filename, data in xglob.iter_files(args, recursive=options.recursive,
        zip_password=options.zip_password, zip_fname=options.zip_fname):
        # ignore directory names stored in zip files:
        if container and filename.endswith('/'):
            continue
        process_file(container, filename, data, output_dir=options.output_dir,
                     save_object=options.save_object)


if __name__ == '__main__':
    main()

# This code was developed while listening to The Mary Onettes "Lost"
