Source code for excelrd.book

# Copyright (c) 2005-2012 Stephen John Machin, Lingfo Pty Ltd
# This module is part of the excelrd package, which is released under a
# BSD-style licence.


import gc
import struct
import sys
from time import perf_counter

from . import compdoc, formatting, sheet
from .biffh import *
from .formula import *
from .timemachine import *


unpack = struct.unpack

empty_cell = sheet.empty_cell  # for exposure to the world ...

DEBUG = 0

USE_FANCY_CD = 1

TOGGLE_GC = 0
# gc.set_debug(gc.DEBUG_STATS)

try:
    import mmap

    MMAP_AVAILABLE = 1
except ImportError:
    MMAP_AVAILABLE = 0
USE_MMAP = MMAP_AVAILABLE

MY_EOF = 0xF00BAAA  # not a 16-bit number

SUPBOOK_UNK, SUPBOOK_INTERNAL, SUPBOOK_EXTERNAL, SUPBOOK_ADDIN, SUPBOOK_DDEOLE = list(range(5))

SUPPORTED_VERSIONS = (80, 70, 50, 45, 40, 30, 21, 20)

_code_from_builtin_name = {
    "Consolidate_Area": "\x00",
    "Auto_Open": "\x01",
    "Auto_Close": "\x02",
    "Extract": "\x03",
    "Database": "\x04",
    "Criteria": "\x05",
    "Print_Area": "\x06",
    "Print_Titles": "\x07",
    "Recorder": "\x08",
    "Data_Form": "\x09",
    "Auto_Activate": "\x0A",
    "Auto_Deactivate": "\x0B",
    "Sheet_Title": "\x0C",
    "_FilterDatabase": "\x0D",
}
builtin_name_from_code = {}
code_from_builtin_name = {}
for _bin, _bic in _code_from_builtin_name.items():
    _bin = _bin
    _bic = _bic
    code_from_builtin_name[_bin] = _bic
    builtin_name_from_code[_bic] = _bin
del _bin, _bic, _code_from_builtin_name


def open_workbook_xls(
    filename=None,
    logfile=sys.stdout,
    verbosity=0,
    use_mmap=USE_MMAP,
    file_contents=None,
    encoding_override=None,
    formatting_info=False,
    on_demand=False,
    ragged_rows=False,
    ignore_workbook_corruption=False,
):
    t0 = perf_counter()
    if TOGGLE_GC:
        orig_gc_enabled = gc.isenabled()
        if orig_gc_enabled:
            gc.disable()
    bk = Book()
    try:
        bk.biff2_8_load(
            filename=filename,
            file_contents=file_contents,
            logfile=logfile,
            verbosity=verbosity,
            use_mmap=use_mmap,
            encoding_override=encoding_override,
            formatting_info=formatting_info,
            on_demand=on_demand,
            ragged_rows=ragged_rows,
            ignore_workbook_corruption=ignore_workbook_corruption,
        )
        t1 = perf_counter()
        bk.load_time_stage_1 = t1 - t0
        biff_version = bk.getbof(XL_WORKBOOK_GLOBALS)
        if not biff_version:
            raise XLRDError("Can't determine file's BIFF version")
        if biff_version not in SUPPORTED_VERSIONS:
            raise XLRDError("BIFF version %s is not supported" % biff_text_from_num[biff_version])
        bk.biff_version = biff_version
        if biff_version <= 40:
            # no workbook globals, only 1 worksheet
            if on_demand:
                fprintf(
                    bk.logfile,
                    "*** WARNING: on_demand is not supported for this Excel version.\n"
                    "*** Setting on_demand to False.\n",
                )
                bk.on_demand = on_demand = False
            bk.fake_globals_get_sheet()
        elif biff_version == 45:
            # worksheet(s) embedded in global stream
            bk.parse_globals()
            if on_demand:
                fprintf(
                    bk.logfile,
                    "*** WARNING: on_demand is not supported for this Excel version.\n"
                    "*** Setting on_demand to False.\n",
                )
                bk.on_demand = on_demand = False
        else:
            bk.parse_globals()
            bk._sheet_list = [None for sh in bk._sheet_names]
            if not on_demand:
                bk.get_sheets()
        bk.nsheets = len(bk._sheet_list)
        if biff_version == 45 and bk.nsheets > 1:
            fprintf(
                bk.logfile,
                "*** WARNING: Excel 4.0 workbook (.XLW) file contains %d worksheets.\n"
                "*** Book-level data will be that of the last worksheet.\n",
                bk.nsheets,
            )
        if TOGGLE_GC:
            if orig_gc_enabled:
                gc.enable()
        t2 = perf_counter()
        bk.load_time_stage_2 = t2 - t1
    except:
        bk.release_resources()
        raise
    # normal exit
    if not on_demand:
        bk.release_resources()
    return bk


[docs]class Name(BaseObject): """ Information relating to a named reference, formula, macro, etc. .. note:: Name information is **not** extracted from files older than Excel 5.0 (``Book.biff_version < 50``) """ _repr_these = ["stack"] book = None # parent #: 0 = Visible; 1 = Hidden hidden = 0 #: 0 = Command macro; 1 = Function macro. Relevant only if macro == 1 func = 0 #: 0 = Sheet macro; 1 = VisualBasic macro. Relevant only if macro == 1 vbasic = 0 #: 0 = Standard name; 1 = Macro name macro = 0 #: 0 = Simple formula; 1 = Complex formula (array formula or user defined). #: #: .. note:: No examples have been sighted. complex = 0 #: 0 = User-defined name; 1 = Built-in name #: #: Common examples: ``Print_Area``, ``Print_Titles``; see OOo docs for #: full list builtin = 0 #: Function group. Relevant only if macro == 1; see OOo docs for values. funcgroup = 0 #: 0 = Formula definition; 1 = Binary data #: #: .. note:: No examples have been sighted. binary = 0 #: The index of this object in book.name_obj_list name_index = 0 # A Unicode string. If builtin, decoded as per OOo docs. name = "" #: An 8-bit string. raw_formula = b"" #: ``-1``: #: The name is global (visible in all calculation sheets). #: ``-2``: #: The name belongs to a macro sheet or VBA sheet. #: ``-3``: #: The name is invalid. #: ``0 <= scope < book.nsheets``: #: The name is local to the sheet whose index is scope. scope = -1 #: The result of evaluating the formula, if any. #: If no formula, or evaluation of the formula encountered problems, #: the result is ``None``. Otherwise the result is a single instance of the #: :class:`~excelrd.formula.Operand` class. # result = None
[docs] def cell(self): """ This is a convenience method for the frequent use case where the name refers to a single cell. :returns: An instance of the :class:`~excelrd.sheet.Cell` class. :raises excelrd.biffh.XLRDError: The name is not a constant absolute reference to a single cell. """ res = self.result if res: # result should be an instance of the Operand class kind = res.kind value = res.value if kind == oREF and len(value) == 1: ref3d = value[0] if ( 0 <= ref3d.shtxlo == ref3d.shtxhi - 1 and ref3d.rowxlo == ref3d.rowxhi - 1 and ref3d.colxlo == ref3d.colxhi - 1 ): sh = self.book.sheet_by_index(ref3d.shtxlo) return sh.cell(ref3d.rowxlo, ref3d.colxlo) self.dump( self.book.logfile, header="=== Dump of Name object ===", footer="======= End of dump =======", ) raise XLRDError("Not a constant absolute reference to a single cell")
[docs] def area2d(self, clipped=True): """ This is a convenience method for the use case where the name refers to one rectangular area in one worksheet. :param clipped: If ``True``, the default, the returned rectangle is clipped to fit in ``(0, sheet.nrows, 0, sheet.ncols)``. it is guaranteed that ``0 <= rowxlo <= rowxhi <= sheet.nrows`` and that the number of usable rows in the area (which may be zero) is ``rowxhi - rowxlo``; likewise for columns. :returns: a tuple ``(sheet_object, rowxlo, rowxhi, colxlo, colxhi)``. :raises excelrd.biffh.XLRDError: The name is not a constant absolute reference to a single area in a single sheet. """ res = self.result if res: # result should be an instance of the Operand class kind = res.kind value = res.value if kind == oREF and len(value) == 1: # only 1 reference ref3d = value[0] if 0 <= ref3d.shtxlo == ref3d.shtxhi - 1: # only 1 usable sheet sh = self.book.sheet_by_index(ref3d.shtxlo) if not clipped: return sh, ref3d.rowxlo, ref3d.rowxhi, ref3d.colxlo, ref3d.colxhi rowxlo = min(ref3d.rowxlo, sh.nrows) rowxhi = max(rowxlo, min(ref3d.rowxhi, sh.nrows)) colxlo = min(ref3d.colxlo, sh.ncols) colxhi = max(colxlo, min(ref3d.colxhi, sh.ncols)) assert 0 <= rowxlo <= rowxhi <= sh.nrows assert 0 <= colxlo <= colxhi <= sh.ncols return sh, rowxlo, rowxhi, colxlo, colxhi self.dump( self.book.logfile, header="=== Dump of Name object ===", footer="======= End of dump =======", ) raise XLRDError("Not a constant absolute reference to a single area in a single sheet")
[docs]class Book(BaseObject): """ Contents of a "workbook". .. warning:: You should not instantiate this class yourself. You use the :class:`Book` object that was returned when you called :func:`~excelrd.open_workbook`. """ #: The number of worksheets present in the workbook file. #: This information is available even when no sheets have yet been loaded. nsheets = 0 #: Which date system was in force when this file was last saved. #: #: 0: #: 1900 system (the Excel for Windows default). #: #: 1: #: 1904 system (the Excel for Macintosh default). #: #: Defaults to 0 in case it's not specified in the file. datemode = 0 #: Version of BIFF (Binary Interchange File Format) used to create the file. #: Latest is 8.0 (represented here as 80), introduced with Excel 97. #: Earliest supported by this module: 2.0 (represented as 20). biff_version = 0 #: List containing a :class:`Name` object for each ``NAME`` record in the #: workbook. #: #: .. versionadded:: 0.6.0 name_obj_list = [] #: An integer denoting the character set used for strings in this file. #: For BIFF 8 and later, this will be 1200, meaning Unicode; #: more precisely, UTF_16_LE. #: For earlier versions, this is used to derive the appropriate Python #: encoding to be used to convert to Unicode. #: Examples: ``1252 -> 'cp1252'``, ``10000 -> 'mac_roman'`` codepage = None #: The encoding that was derived from the codepage. encoding = None #: A tuple containing the telephone country code for: #: #: ``[0]``: #: the user-interface setting when the file was created. #: #: ``[1]``: #: the regional settings. #: #: Example: ``(1, 61)`` meaning ``(USA, Australia)``. #: #: This information may give a clue to the correct encoding for an #: unknown codepage. For a long list of observed values, refer to the #: OpenOffice.org documentation for the ``COUNTRY`` record. countries = (0, 0) #: What (if anything) is recorded as the name of the last user to #: save the file. user_name = "" #: A list of :class:`~excelrd.formatting.Font` class instances, #: each corresponding to a FONT record. #: #: .. versionadded:: 0.6.1 font_list = [] #: A list of :class:`~excelrd.formatting.XF` class instances, #: each corresponding to an ``XF`` record. #: #: .. versionadded:: 0.6.1 xf_list = [] #: A list of :class:`~excelrd.formatting.Format` objects, each corresponding to #: a ``FORMAT`` record, in the order that they appear in the input file. #: It does *not* contain builtin formats. #: #: If you are creating an output file using (for example) :mod:`xlwt`, #: use this list. #: #: The collection to be used for all visual rendering purposes is #: :attr:`format_map`. #: #: .. versionadded:: 0.6.1 format_list = [] ## #: The mapping from :attr:`~excelrd.formatting.XF.format_key` to #: :class:`~excelrd.formatting.Format` object. #: #: .. versionadded:: 0.6.1 format_map = {} #: This provides access via name to the extended format information for #: both built-in styles and user-defined styles. #: #: It maps ``name`` to ``(built_in, xf_index)``, where #: ``name`` is either the name of a user-defined style, #: or the name of one of the built-in styles. Known built-in names are #: Normal, RowLevel_1 to RowLevel_7, #: ColLevel_1 to ColLevel_7, Comma, Currency, Percent, "Comma [0]", #: "Currency [0]", Hyperlink, and "Followed Hyperlink". #: #: ``built_in`` has the following meanings #: #: 1: #: built-in style #: #: 0: #: user-defined #: #: ``xf_index`` is an index into :attr:`Book.xf_list`. #: #: References: OOo docs s6.99 (``STYLE`` record); Excel UI Format/Style #: #: .. versionadded:: 0.6.1 #: #: Extracted only if ``open_workbook(..., formatting_info=True)`` #: #: .. versionadded:: 0.7.4 style_name_map = {} #: This provides definitions for colour indexes. Please refer to #: :ref:`palette` for an explanation #: of how colours are represented in Excel. #: #: Colour indexes into the palette map into ``(red, green, blue)`` tuples. #: "Magic" indexes e.g. ``0x7FFF`` map to ``None``. #: #: :attr:`colour_map` is what you need if you want to render cells on screen #: or in a PDF file. If you are writing an output XLS file, use #: :attr:`palette_record`. #: #: .. note:: Extracted only if ``open_workbook(..., formatting_info=True)`` #: #: .. versionadded:: 0.6.1 colour_map = {} #: If the user has changed any of the colours in the standard palette, the #: XLS file will contain a ``PALETTE`` record with 56 (16 for Excel 4.0 and #: earlier) RGB values in it, and this list will be e.g. #: ``[(r0, b0, g0), ..., (r55, b55, g55)]``. #: Otherwise this list will be empty. This is what you need if you are #: writing an output XLS file. If you want to render cells on screen or in a #: PDF file, use :attr:`colour_map`. #: #: .. note:: Extracted only if ``open_workbook(..., formatting_info=True)`` #: #: .. versionadded:: 0.6.1 palette_record = [] #: Time in seconds to extract the XLS image as a contiguous string #: (or mmap equivalent). load_time_stage_1 = -1.0 #: Time in seconds to parse the data from the contiguous string #: (or mmap equivalent). load_time_stage_2 = -1.0
[docs] def sheets(self): """ :returns: A list of all sheets in the book. All sheets not already loaded will be loaded. """ for sheetx in range(self.nsheets): if not self._sheet_list[sheetx]: self.get_sheet(sheetx) return self._sheet_list[:]
[docs] def sheet_by_index(self, sheetx): """ :param sheetx: Sheet index in ``range(nsheets)`` :returns: A :class:`~excelrd.sheet.Sheet`. """ return self._sheet_list[sheetx] or self.get_sheet(sheetx)
def __iter__(self): """ Makes iteration through sheets of a book a little more straightforward. Don't free resources after use since it can be called like `list(book)` """ for i in range(self.nsheets): yield self.sheet_by_index(i)
[docs] def sheet_by_name(self, sheet_name): """ :param sheet_name: Name of the sheet required. :returns: A :class:`~excelrd.sheet.Sheet`. """ try: sheetx = self._sheet_names.index(sheet_name) except ValueError: raise XLRDError("No sheet named <%r>" % sheet_name) return self.sheet_by_index(sheetx)
def __getitem__(self, item): """ Allow indexing with sheet name or index. :param item: Name or index of sheet enquired upon :return: :class:`~excelrd.sheet.Sheet`. """ if isinstance(item, int): return self.sheet_by_index(item) else: return self.sheet_by_name(item)
[docs] def sheet_names(self): """ :returns: A list of the names of all the worksheets in the workbook file. This information is available even when no sheets have yet been loaded. """ return self._sheet_names[:]
[docs] def sheet_loaded(self, sheet_name_or_index): """ :param sheet_name_or_index: Name or index of sheet enquired upon :returns: ``True`` if sheet is loaded, ``False`` otherwise. .. versionadded:: 0.7.1 """ if isinstance(sheet_name_or_index, int): sheetx = sheet_name_or_index else: try: sheetx = self._sheet_names.index(sheet_name_or_index) except ValueError: raise XLRDError("No sheet named <%r>" % sheet_name_or_index) return bool(self._sheet_list[sheetx])
[docs] def unload_sheet(self, sheet_name_or_index): """ :param sheet_name_or_index: Name or index of sheet to be unloaded. .. versionadded:: 0.7.1 """ if isinstance(sheet_name_or_index, int): sheetx = sheet_name_or_index else: try: sheetx = self._sheet_names.index(sheet_name_or_index) except ValueError: raise XLRDError("No sheet named <%r>" % sheet_name_or_index) self._sheet_list[sheetx] = None
[docs] def release_resources(self): """ This method has a dual purpose. You can call it to release memory-consuming objects and (possibly) a memory-mapped file (:class:`mmap.mmap` object) when you have finished loading sheets in ``on_demand`` mode, but still require the :class:`Book` object to examine the loaded sheets. It is also called automatically (a) when :func:`~excelrd.open_workbook` raises an exception and (b) if you are using a ``with`` statement, when the ``with`` block is exited. Calling this method multiple times on the same object has no ill effect. """ self._resources_released = 1 if hasattr(self.mem, "close"): # must be a mmap.mmap object self.mem.close() self.mem = None if hasattr(self.filestr, "close"): self.filestr.close() self.filestr = None self._sharedstrings = None self._rich_text_runlist_map = None
def __enter__(self): return self def __exit__(self, exc_type, exc_value, exc_tb): self.release_resources() # return false #: A mapping from ``(lower_case_name, scope)`` to a single :class:`Name` #: object. #: #: .. versionadded:: 0.6.0 name_and_scope_map = {} #: A mapping from `lower_case_name` to a list of :class:`Name` objects. #: The list is sorted in scope order. Typically there will be one item #: (of global scope) in the list. #: #: .. versionadded:: 0.6.0 name_map = {} def __init__(self): self._sheet_list = [] self._sheet_names = [] self._sheet_visibility = [] # from BOUNDSHEET record self.nsheets = 0 self._sh_abs_posn = [] # sheet's absolute position in the stream self._sharedstrings = [] self._rich_text_runlist_map = {} self.raw_user_name = False self._sheethdr_count = 0 # BIFF 4W only self.builtinfmtcount = -1 # unknown as yet. BIFF 3, 4S, 4W self.initialise_format_info() self._all_sheets_count = 0 # includes macro & VBA sheets self._supbook_count = 0 self._supbook_locals_inx = None self._supbook_addins_inx = None self._all_sheets_map = [] # maps an all_sheets index to a calc-sheets index (or -1) self._externsheet_info = [] self._externsheet_type_b57 = [] self._extnsht_name_from_num = {} self._sheet_num_from_name = {} self._extnsht_count = 0 self._supbook_types = [] self._resources_released = 0 self.addin_func_names = [] self.name_obj_list = [] self.colour_map = {} self.palette_record = [] self.xf_list = [] self.style_name_map = {} self.mem = b"" self.filestr = b"" def biff2_8_load( self, filename=None, file_contents=None, logfile=sys.stdout, verbosity=0, use_mmap=USE_MMAP, encoding_override=None, formatting_info=False, on_demand=False, ragged_rows=False, ignore_workbook_corruption=False, ): # DEBUG = 0 self.logfile = logfile self.verbosity = verbosity self.use_mmap = use_mmap and MMAP_AVAILABLE self.encoding_override = encoding_override self.formatting_info = formatting_info self.on_demand = on_demand self.ragged_rows = ragged_rows if not file_contents: with open(filename, "rb") as f: f.seek(0, 2) # EOF size = f.tell() f.seek(0, 0) # BOF if size == 0: raise XLRDError("File size is 0 bytes") if self.use_mmap: self.filestr = mmap.mmap(f.fileno(), size, access=mmap.ACCESS_READ) self.stream_len = size else: self.filestr = f.read() self.stream_len = len(self.filestr) else: self.filestr = file_contents self.stream_len = len(file_contents) self.base = 0 if self.filestr[:8] != compdoc.SIGNATURE: # got this one at the antique store self.mem = self.filestr else: cd = compdoc.CompDoc( self.filestr, logfile=self.logfile, ignore_workbook_corruption=ignore_workbook_corruption, ) if USE_FANCY_CD: for qname in ["Workbook", "Book"]: self.mem, self.base, self.stream_len = cd.locate_named_stream(qname) if self.mem: break else: raise XLRDError("Can't find workbook in OLE2 compound document") else: for qname in ["Workbook", "Book"]: self.mem = cd.get_named_stream(qname) if self.mem: break else: raise XLRDError("Can't find workbook in OLE2 compound document") self.stream_len = len(self.mem) del cd if self.mem is not self.filestr: if hasattr(self.filestr, "close"): self.filestr.close() self.filestr = b"" self._position = self.base if DEBUG: print( "mem: %s, base: %d, len: %d" % (type(self.mem), self.base, self.stream_len), file=self.logfile, ) def initialise_format_info(self): # needs to be done once per sheet for BIFF 4W :-( self.format_map = {} self.format_list = [] self.xfcount = 0 self.actualfmtcount = 0 # number of FORMAT records seen so far self._xf_index_to_xl_type_map = {0: XL_CELL_NUMBER} self._xf_epilogue_done = 0 self.xf_list = [] self.font_list = [] def get2bytes(self): pos = self._position buff_two = self.mem[pos : pos + 2] lenbuff = len(buff_two) self._position += lenbuff if lenbuff < 2: return MY_EOF lo, hi = buff_two return (BYTES_ORD(hi) << 8) | BYTES_ORD(lo) def get_record_parts(self): pos = self._position mem = self.mem code, length = unpack("<HH", mem[pos : pos + 4]) pos += 4 data = mem[pos : pos + length] self._position = pos + length return (code, length, data) def get_record_parts_conditional(self, reqd_record): pos = self._position mem = self.mem code, length = unpack("<HH", mem[pos : pos + 4]) if code != reqd_record: return (None, 0, b"") pos += 4 data = mem[pos : pos + length] self._position = pos + length return (code, length, data) def get_sheet(self, sh_number, update_pos=True): if self._resources_released: raise XLRDError("Can't load sheets after releasing resources.") if update_pos: self._position = self._sh_abs_posn[sh_number] self.getbof(XL_WORKSHEET) # assert biff_version == self.biff_version ### FAILS # Have an example where book is v7 but sheet reports v8!!! # It appears to work OK if the sheet version is ignored. # Confirmed by Daniel Rentz: happens when Excel does "save as" # creating an old version file; ignore version details on sheet BOF. sh = sheet.Sheet( self, self._position, self._sheet_names[sh_number], sh_number, ) sh.read(self) self._sheet_list[sh_number] = sh return sh def get_sheets(self): # DEBUG = 0 if DEBUG: print("GET_SHEETS:", self._sheet_names, self._sh_abs_posn, file=self.logfile) for sheetno in range(len(self._sheet_names)): if DEBUG: print( "GET_SHEETS: sheetno =", sheetno, self._sheet_names, self._sh_abs_posn, file=self.logfile, ) self.get_sheet(sheetno) def fake_globals_get_sheet(self): # for BIFF 4.0 and earlier formatting.initialise_book(self) fake_sheet_name = "Sheet 1" self._sheet_names = [fake_sheet_name] self._sh_abs_posn = [0] self._sheet_visibility = [0] # one sheet, visible self._sheet_list.append(None) # get_sheet updates _sheet_list but needs a None beforehand self.get_sheets() def handle_boundsheet(self, data): # DEBUG = 1 bv = self.biff_version self.derive_encoding() if DEBUG: fprintf(self.logfile, "BOUNDSHEET: bv=%d data %r\n", bv, data) if bv == 45: # BIFF4W #### Not documented in OOo docs ... # In fact, the *only* data is the name of the sheet. sheet_name = unpack_string(data, 0, self.encoding, lenlen=1) visibility = 0 sheet_type = XL_BOUNDSHEET_WORKSHEET # guess, patch later if len(self._sh_abs_posn) == 0: abs_posn = self._sheetsoffset + self.base # Note (a) this won't be used # (b) it's the position of the SHEETHDR record # (c) add 11 to get to the worksheet BOF record else: abs_posn = -1 # unknown else: offset, visibility, sheet_type = unpack("<iBB", data[0:6]) abs_posn = offset + self.base # because global BOF is always at posn 0 in the stream if bv < BIFF_FIRST_UNICODE: sheet_name = unpack_string(data, 6, self.encoding, lenlen=1) else: sheet_name = unpack_unicode(data, 6, lenlen=1) if DEBUG or self.verbosity >= 2: fprintf( self.logfile, "BOUNDSHEET: inx=%d vis=%r sheet_name=%r abs_posn=%d sheet_type=0x%02x\n", self._all_sheets_count, visibility, sheet_name, abs_posn, sheet_type, ) self._all_sheets_count += 1 if sheet_type != XL_BOUNDSHEET_WORKSHEET: self._all_sheets_map.append(-1) descr = { 1: "Macro sheet", 2: "Chart", 6: "Visual Basic module", }.get(sheet_type, "UNKNOWN") if DEBUG or self.verbosity >= 1: fprintf( self.logfile, "NOTE *** Ignoring non-worksheet data named %r (type 0x%02x = %s)\n", sheet_name, sheet_type, descr, ) else: snum = len(self._sheet_names) self._all_sheets_map.append(snum) self._sheet_names.append(sheet_name) self._sh_abs_posn.append(abs_posn) self._sheet_visibility.append(visibility) self._sheet_num_from_name[sheet_name] = snum def handle_builtinfmtcount(self, data): ### N.B. This count appears to be utterly useless. # DEBUG = 1 builtinfmtcount = unpack("<H", data[0:2])[0] if DEBUG: fprintf(self.logfile, "BUILTINFMTCOUNT: %r\n", builtinfmtcount) self.builtinfmtcount = builtinfmtcount def derive_encoding(self): if self.encoding_override: self.encoding = self.encoding_override elif self.codepage is None: if self.biff_version < 80: fprintf( self.logfile, "*** No CODEPAGE record, no encoding_override: will use 'iso-8859-1'\n", ) self.encoding = "iso-8859-1" else: self.codepage = 1200 # utf16le if self.verbosity >= 2: fprintf(self.logfile, "*** No CODEPAGE record; assuming 1200 (utf_16_le)\n") else: codepage = self.codepage if codepage in encoding_from_codepage: encoding = encoding_from_codepage[codepage] elif 300 <= codepage <= 1999: encoding = "cp" + str(codepage) elif self.biff_version >= 80: self.codepage = 1200 encoding = "utf_16_le" else: encoding = "unknown_codepage_" + str(codepage) if DEBUG or (self.verbosity and encoding != self.encoding): fprintf(self.logfile, "CODEPAGE: codepage %r -> encoding %r\n", codepage, encoding) self.encoding = encoding if self.codepage != 1200: # utf_16_le # If we don't have a codec that can decode ASCII into Unicode, # we're well & truly stuffed -- let the punter know ASAP. try: str(b"trial", self.encoding) except BaseException as e: fprintf( self.logfile, "ERROR *** codepage %r -> encoding %r -> %s: %s\n", self.codepage, self.encoding, type(e).__name__.split(".")[-1], e, ) raise if self.raw_user_name: strg = unpack_string(self.user_name, 0, self.encoding, lenlen=1) strg = strg.rstrip() # if DEBUG: # print "CODEPAGE: user name decoded from %r to %r" % (self.user_name, strg) self.user_name = strg self.raw_user_name = False return self.encoding def handle_codepage(self, data): # DEBUG = 0 codepage = unpack("<H", data[0:2])[0] self.codepage = codepage self.derive_encoding() def handle_country(self, data): countries = unpack("<HH", data[0:4]) if self.verbosity: print("Countries:", countries, file=self.logfile) # Note: in BIFF7 and earlier, country record was put (redundantly?) in each worksheet. assert self.countries == (0, 0) or self.countries == countries self.countries = countries def handle_datemode(self, data): datemode = unpack("<H", data[0:2])[0] if DEBUG or self.verbosity: fprintf(self.logfile, "DATEMODE: datemode %r\n", datemode) assert datemode in (0, 1) self.datemode = datemode def handle_externname(self, data): blah = DEBUG or self.verbosity >= 2 if self.biff_version >= 80: option_flags, other_info = unpack("<HI", data[:6]) pos = 6 name, pos = unpack_unicode_update_pos(data, pos, lenlen=1) extra = data[pos:] if self._supbook_types[-1] == SUPBOOK_ADDIN: self.addin_func_names.append(name) if blah: fprintf( self.logfile, "EXTERNNAME: sbktype=%d oflags=0x%04x oinfo=0x%08x name=%r extra=%r\n", self._supbook_types[-1], option_flags, other_info, name, extra, ) def handle_externsheet(self, data): self.derive_encoding() # in case CODEPAGE record missing/out of order/wrong self._extnsht_count += 1 # for use as a 1-based index blah1 = DEBUG or self.verbosity >= 1 blah2 = DEBUG or self.verbosity >= 2 if self.biff_version >= 80: num_refs = unpack("<H", data[0:2])[0] bytes_reqd = num_refs * 6 + 2 while len(data) < bytes_reqd: if blah1: fprintf( self.logfile, "INFO: EXTERNSHEET needs %d bytes, have %d\n", bytes_reqd, len(data), ) code2, length2, data2 = self.get_record_parts() if code2 != XL_CONTINUE: raise XLRDError("Missing CONTINUE after EXTERNSHEET record") data += data2 pos = 2 for k in range(num_refs): info = unpack("<HHH", data[pos : pos + 6]) ref_recordx, ref_first_sheetx, ref_last_sheetx = info self._externsheet_info.append(info) pos += 6 if blah2: fprintf( self.logfile, "EXTERNSHEET(b8): k = %2d, record = %2d, first_sheet = %5d, last sheet = %5d\n", k, ref_recordx, ref_first_sheetx, ref_last_sheetx, ) else: nc, ty = unpack("<BB", data[:2]) if blah2: print("EXTERNSHEET(b7-):", file=self.logfile) hex_char_dump(data, 0, len(data), fout=self.logfile) msg = { 1: "Encoded URL", 2: "Current sheet!!", 3: "Specific sheet in own doc't", 4: "Nonspecific sheet in own doc't!!", }.get(ty, "Not encoded") print(" %3d chars, type is %d (%s)" % (nc, ty, msg), file=self.logfile) if ty == 3: sheet_name = str(data[2 : nc + 2], self.encoding) self._extnsht_name_from_num[self._extnsht_count] = sheet_name if blah2: print(self._extnsht_name_from_num, file=self.logfile) if not (1 <= ty <= 4): ty = 0 self._externsheet_type_b57.append(ty) def handle_filepass(self, data): if self.verbosity >= 2: logf = self.logfile fprintf(logf, "FILEPASS:\n") hex_char_dump(data, 0, len(data), base=0, fout=logf) if self.biff_version >= 80: (kind1,) = unpack("<H", data[:2]) if kind1 == 0: # weak XOR encryption key, hash_value = unpack("<HH", data[2:]) fprintf(logf, "weak XOR: key=0x%04x hash=0x%04x\n", key, hash_value) elif kind1 == 1: (kind2,) = unpack("<H", data[4:6]) if kind2 == 1: # BIFF8 standard encryption caption = "BIFF8 std" elif kind2 == 2: caption = "BIFF8 strong" else: caption = "** UNKNOWN ENCRYPTION METHOD **" fprintf(logf, "%s\n", caption) raise XLRDError("Workbook is encrypted") def handle_name(self, data): blah = DEBUG or self.verbosity >= 2 bv = self.biff_version if bv < 50: return self.derive_encoding() # print # hex_char_dump(data, 0, len(data), fout=self.logfile) ( option_flags, kb_shortcut, name_len, fmla_len, extsht_index, sheet_index, menu_text_len, description_text_len, help_topic_text_len, status_bar_text_len, ) = unpack("<HBBHHH4B", data[0:14]) nobj = Name() nobj.book = self ### CIRCULAR ### name_index = len(self.name_obj_list) nobj.name_index = name_index self.name_obj_list.append(nobj) nobj.option_flags = option_flags attrs = [ ("hidden", 1, 0), ("func", 2, 1), ("vbasic", 4, 2), ("macro", 8, 3), ("complex", 0x10, 4), ("builtin", 0x20, 5), ("funcgroup", 0xFC0, 6), ("binary", 0x1000, 12), ] for attr, mask, nshift in attrs: setattr(nobj, attr, (option_flags & mask) >> nshift) macro_flag = " M"[nobj.macro] if bv < 80: internal_name, pos = unpack_string_update_pos( data, 14, self.encoding, known_len=name_len ) else: internal_name, pos = unpack_unicode_update_pos(data, 14, known_len=name_len) nobj.extn_sheet_num = extsht_index nobj.excel_sheet_index = sheet_index nobj.scope = None # patched up in the names_epilogue() method if blah: fprintf( self.logfile, "NAME[%d]:%s oflags=%d, name_len=%d, fmla_len=%d, extsht_index=%d, sheet_index=%d, name=%r\n", name_index, macro_flag, option_flags, name_len, fmla_len, extsht_index, sheet_index, internal_name, ) name = internal_name if nobj.builtin: name = builtin_name_from_code.get(name, "??Unknown??") if blah: print(" builtin: %s" % name, file=self.logfile) nobj.name = name nobj.raw_formula = data[pos:] nobj.basic_formula_len = fmla_len nobj.evaluated = 0 if blah: nobj.dump( self.logfile, header="--- handle_name: name[%d] ---" % name_index, footer="-------------------", ) def names_epilogue(self): blah = self.verbosity >= 2 f = self.logfile if blah: print("+++++ names_epilogue +++++", file=f) print("_all_sheets_map", REPR(self._all_sheets_map), file=f) print("_extnsht_name_from_num", REPR(self._extnsht_name_from_num), file=f) print("_sheet_num_from_name", REPR(self._sheet_num_from_name), file=f) num_names = len(self.name_obj_list) for namex in range(num_names): nobj = self.name_obj_list[namex] # Convert from excel_sheet_index to scope. # This is done here because in BIFF7 and earlier, the # BOUNDSHEET records (from which _all_sheets_map is derived) # come after the NAME records. if self.biff_version >= 80: sheet_index = nobj.excel_sheet_index if sheet_index == 0: intl_sheet_index = -1 # global elif 1 <= sheet_index <= len(self._all_sheets_map): intl_sheet_index = self._all_sheets_map[sheet_index - 1] if intl_sheet_index == -1: # maps to a macro or VBA sheet intl_sheet_index = -2 # valid sheet reference but not useful else: # huh? intl_sheet_index = -3 # invalid elif 50 <= self.biff_version <= 70: sheet_index = nobj.extn_sheet_num if sheet_index == 0: intl_sheet_index = -1 # global else: sheet_name = self._extnsht_name_from_num[sheet_index] intl_sheet_index = self._sheet_num_from_name.get(sheet_name, -2) nobj.scope = intl_sheet_index for namex in range(num_names): nobj = self.name_obj_list[namex] # Parse the formula ... if nobj.macro or nobj.binary: continue if nobj.evaluated: continue evaluate_name_formula(self, nobj, namex, blah=blah) if self.verbosity >= 2: print("---------- name object dump ----------", file=f) for namex in range(num_names): nobj = self.name_obj_list[namex] nobj.dump(f, header="--- name[%d] ---" % namex) print("--------------------------------------", file=f) # # Build some dicts for access to the name objects # name_and_scope_map = {} # (name.lower(), scope): Name_object name_map = {} # name.lower() : list of Name_objects (sorted in scope order) for namex in range(num_names): nobj = self.name_obj_list[namex] name_lcase = nobj.name.lower() key = (name_lcase, nobj.scope) if key in name_and_scope_map and self.verbosity: fprintf(f, "Duplicate entry %r in name_and_scope_map\n", key) name_and_scope_map[key] = nobj sort_data = (nobj.scope, namex, nobj) # namex (a temp unique ID) ensures the Name objects will not # be compared (fatal in py3) if name_lcase in name_map: name_map[name_lcase].append(sort_data) else: name_map[name_lcase] = [sort_data] for key in name_map.keys(): alist = name_map[key] alist.sort() name_map[key] = [x[2] for x in alist] self.name_and_scope_map = name_and_scope_map self.name_map = name_map def handle_obj(self, data): # Not doing much handling at all. # Worrying about embedded (BOF ... EOF) substreams is done elsewhere. # DEBUG = 1 obj_type, obj_id = unpack("<HI", data[4:10]) # if DEBUG: print "---> handle_obj type=%d id=0x%08x" % (obj_type, obj_id) def handle_supbook(self, data): # aka EXTERNALBOOK in OOo docs self._supbook_types.append(None) blah = DEBUG or self.verbosity >= 2 if blah: print("SUPBOOK:", file=self.logfile) hex_char_dump(data, 0, len(data), fout=self.logfile) num_sheets = unpack("<H", data[0:2])[0] if blah: print("num_sheets = %d" % num_sheets, file=self.logfile) sbn = self._supbook_count self._supbook_count += 1 if data[2:4] == b"\x01\x04": self._supbook_types[-1] = SUPBOOK_INTERNAL self._supbook_locals_inx = self._supbook_count - 1 if blah: print( "SUPBOOK[%d]: internal 3D refs; %d sheets" % (sbn, num_sheets), file=self.logfile, ) print(" _all_sheets_map", self._all_sheets_map, file=self.logfile) return if data[0:4] == b"\x01\x00\x01\x3A": self._supbook_types[-1] = SUPBOOK_ADDIN self._supbook_addins_inx = self._supbook_count - 1 if blah: print("SUPBOOK[%d]: add-in functions" % sbn, file=self.logfile) return url, pos = unpack_unicode_update_pos(data, 2, lenlen=2) if num_sheets == 0: self._supbook_types[-1] = SUPBOOK_DDEOLE if blah: fprintf(self.logfile, "SUPBOOK[%d]: DDE/OLE document = %r\n", sbn, url) return self._supbook_types[-1] = SUPBOOK_EXTERNAL if blah: fprintf(self.logfile, "SUPBOOK[%d]: url = %r\n", sbn, url) sheet_names = [] for x in range(num_sheets): try: shname, pos = unpack_unicode_update_pos(data, pos, lenlen=2) except struct.error: # #### FIX ME #### # Should implement handling of CONTINUE record(s) ... if self.verbosity: print( "*** WARNING: unpack failure in sheet %d of %d in SUPBOOK record for file %r" % (x, num_sheets, url), file=self.logfile, ) break sheet_names.append(shname) if blah: fprintf( self.logfile, " sheetx=%d namelen=%d name=%r (next pos=%d)\n", x, len(shname), shname, pos, ) def handle_sheethdr(self, data): # This a BIFF 4W special. # The SHEETHDR record is followed by a (BOF ... EOF) substream containing # a worksheet. # DEBUG = 1 self.derive_encoding() sheet_len = unpack("<i", data[:4])[0] sheet_name = unpack_string(data, 4, self.encoding, lenlen=1) sheetno = self._sheethdr_count assert sheet_name == self._sheet_names[sheetno] self._sheethdr_count += 1 BOF_posn = self._position posn = BOF_posn - 4 - len(data) if DEBUG: fprintf( self.logfile, "SHEETHDR %d at posn %d: len=%d name=%r\n", sheetno, posn, sheet_len, sheet_name, ) self.initialise_format_info() if DEBUG: print("SHEETHDR: xf epilogue flag is %d" % self._xf_epilogue_done, file=self.logfile) self._sheet_list.append(None) # get_sheet updates _sheet_list but needs a None beforehand self.get_sheet(sheetno, update_pos=False) if DEBUG: print("SHEETHDR: posn after get_sheet() =", self._position, file=self.logfile) self._position = BOF_posn + sheet_len def handle_sheetsoffset(self, data): # DEBUG = 0 posn = unpack("<i", data)[0] if DEBUG: print("SHEETSOFFSET:", posn, file=self.logfile) self._sheetsoffset = posn def handle_sst(self, data): # DEBUG = 1 if DEBUG: print("SST Processing", file=self.logfile) t0 = perf_counter() nbt = len(data) strlist = [data] uniquestrings = unpack("<i", data[4:8])[0] if DEBUG or self.verbosity >= 2: fprintf(self.logfile, "SST: unique strings: %d\n", uniquestrings) while 1: code, nb, data = self.get_record_parts_conditional(XL_CONTINUE) if code is None: break nbt += nb if DEBUG >= 2: fprintf(self.logfile, "CONTINUE: adding %d bytes to SST -> %d\n", nb, nbt) strlist.append(data) self._sharedstrings, rt_runlist = unpack_SST_table(strlist, uniquestrings) if self.formatting_info: self._rich_text_runlist_map = rt_runlist if DEBUG: t1 = perf_counter() print(f"SST processing took {t1 - t0:.2f} seconds", file=self.logfile) def handle_writeaccess(self, data): DEBUG = 0 if self.biff_version < 80: if not self.encoding: self.raw_user_name = True self.user_name = data return strg = unpack_string(data, 0, self.encoding, lenlen=1) else: try: strg = unpack_unicode(data, 0, lenlen=2) except UnicodeDecodeError: # may have invalid trailing characters strg = unpack_unicode(data.strip(), 0, lenlen=2) if DEBUG: fprintf( self.logfile, "WRITEACCESS: %d bytes; raw=%s %r\n", len(data), self.raw_user_name, strg, ) strg = strg.rstrip() self.user_name = strg def parse_globals(self): # DEBUG = 0 # no need to position, just start reading (after the BOF) formatting.initialise_book(self) while 1: rc, length, data = self.get_record_parts() if DEBUG: print("parse_globals: record code is 0x%04x" % rc, file=self.logfile) if rc == XL_SST: self.handle_sst(data) elif rc == XL_FONT or rc == XL_FONT_B3B4: self.handle_font(data) elif rc == XL_FORMAT: # XL_FORMAT2 is BIFF <= 3.0, can't appear in globals self.handle_format(data) elif rc == XL_XF: self.handle_xf(data) elif rc == XL_BOUNDSHEET: self.handle_boundsheet(data) elif rc == XL_DATEMODE: self.handle_datemode(data) elif rc == XL_CODEPAGE: self.handle_codepage(data) elif rc == XL_COUNTRY: self.handle_country(data) elif rc == XL_EXTERNNAME: self.handle_externname(data) elif rc == XL_EXTERNSHEET: self.handle_externsheet(data) elif rc == XL_FILEPASS: self.handle_filepass(data) elif rc == XL_WRITEACCESS: self.handle_writeaccess(data) elif rc == XL_SHEETSOFFSET: self.handle_sheetsoffset(data) elif rc == XL_SHEETHDR: self.handle_sheethdr(data) elif rc == XL_SUPBOOK: self.handle_supbook(data) elif rc == XL_NAME: self.handle_name(data) elif rc == XL_PALETTE: self.handle_palette(data) elif rc == XL_STYLE: self.handle_style(data) elif rc & 0xFF == 9 and self.verbosity: fprintf( self.logfile, "*** Unexpected BOF at posn %d: 0x%04x len=%d data=%r\n", self._position - length - 4, rc, length, data, ) elif rc == XL_EOF: self.xf_epilogue() self.names_epilogue() self.palette_epilogue() if not self.encoding: self.derive_encoding() if self.biff_version == 45: # DEBUG = 0 if DEBUG: print("global EOF: position", self._position, file=self.logfile) # if DEBUG: # pos = self._position - 4 # print repr(self.mem[pos:pos+40]) return else: # if DEBUG: # print >> self.logfile, "parse_globals: ignoring record code 0x%04x" % rc pass def read(self, pos, length): data = self.mem[pos : pos + length] self._position = pos + len(data) return data def getbof(self, rqd_stream): # DEBUG = 1 # if DEBUG: print >> self.logfile, "getbof(): position", self._position if DEBUG: print("reqd: 0x%04x" % rqd_stream, file=self.logfile) def bof_error(msg): raise XLRDError("Unsupported format, or corrupt file: " + msg) savpos = self._position opcode = self.get2bytes() if opcode == MY_EOF: bof_error("Expected BOF record; met end of file") if opcode not in bofcodes: bof_error("Expected BOF record; found %r" % self.mem[savpos : savpos + 8]) length = self.get2bytes() if length == MY_EOF: bof_error("Incomplete BOF record[1]; met end of file") if not (4 <= length <= 20): bof_error("Invalid length (%d) for BOF record type 0x%04x" % (length, opcode)) padding = b"\0" * max(0, boflen[opcode] - length) data = self.read(self._position, length) if DEBUG: fprintf(self.logfile, "\ngetbof(): data=%r\n", data) if len(data) < length: bof_error("Incomplete BOF record[2]; met end of file") data += padding version1 = opcode >> 8 version2, streamtype = unpack("<HH", data[0:4]) if DEBUG: print( "getbof(): op=0x%04x version2=0x%04x streamtype=0x%04x" % (opcode, version2, streamtype), file=self.logfile, ) bof_offset = self._position - 4 - length if DEBUG: print( "getbof(): BOF found at offset %d; savpos=%d" % (bof_offset, savpos), file=self.logfile, ) version = build = year = 0 if version1 == 0x08: build, year = unpack("<HH", data[4:8]) if version2 == 0x0600: version = 80 elif version2 == 0x0500: if year < 1994 or build in (2412, 3218, 3321): version = 50 else: version = 70 else: # dodgy one, created by a 3rd-party tool version = { 0x0000: 21, 0x0007: 21, 0x0200: 21, 0x0300: 30, 0x0400: 40, }.get(version2, 0) elif version1 in (0x04, 0x02, 0x00): version = {0x04: 40, 0x02: 30, 0x00: 21}[version1] if version == 40 and streamtype == XL_WORKBOOK_GLOBALS_4W: version = 45 # i.e. 4W if DEBUG or self.verbosity >= 2: print( "BOF: op=0x%04x vers=0x%04x stream=0x%04x buildid=%d buildyr=%d -> BIFF%d" % (opcode, version2, streamtype, build, year, version), file=self.logfile, ) got_globals = streamtype == XL_WORKBOOK_GLOBALS or ( version == 45 and streamtype == XL_WORKBOOK_GLOBALS_4W ) if (rqd_stream == XL_WORKBOOK_GLOBALS and got_globals) or streamtype == rqd_stream: return version if version < 50 and streamtype == XL_WORKSHEET: return version if version >= 50 and streamtype == 0x0100: bof_error("Workspace file -- no spreadsheet data") bof_error( "BOF not workbook/worksheet: op=0x%04x vers=0x%04x strm=0x%04x build=%d year=%d -> BIFF%d" % (opcode, version2, streamtype, build, year, version) )
# === helper functions def expand_cell_address(inrow, incol): # Ref : OOo docs, "4.3.4 Cell Addresses in BIFF8" outrow = inrow if incol & 0x8000: if outrow >= 32768: outrow -= 65536 relrow = 1 else: relrow = 0 outcol = incol & 0xFF if incol & 0x4000: if outcol >= 128: outcol -= 256 relcol = 1 else: relcol = 0 return outrow, outcol, relrow, relcol def colname(colx, _A2Z="ABCDEFGHIJKLMNOPQRSTUVWXYZ"): assert colx >= 0 name = "" while 1: quot, rem = divmod(colx, 26) name = _A2Z[rem] + name if not quot: return name colx = quot - 1 def display_cell_address(rowx, colx, relrow, relcol): if relrow: rowpart = "(*%s%d)" % ("+-"[rowx < 0], abs(rowx)) else: rowpart = "$%d" % (rowx + 1,) if relcol: colpart = "(*%s%d)" % ("+-"[colx < 0], abs(colx)) else: colpart = "$" + colname(colx) return colpart + rowpart
[docs]def unpack_SST_table(datatab, nstrings): "Return list of strings" datainx = 0 ndatas = len(datatab) data = datatab[0] datalen = len(data) pos = 8 strings = [] strappend = strings.append richtext_runs = {} local_unpack = unpack local_min = min local_BYTES_ORD = BYTES_ORD latin_1 = "latin_1" for _unused_i in range(nstrings): nchars = local_unpack("<H", data[pos : pos + 2])[0] pos += 2 options = local_BYTES_ORD(data[pos]) pos += 1 rtcount = 0 phosz = 0 if options & 0x08: # richtext rtcount = local_unpack("<H", data[pos : pos + 2])[0] pos += 2 if options & 0x04: # phonetic phosz = local_unpack("<i", data[pos : pos + 4])[0] pos += 4 accstrg = "" charsgot = 0 while 1: charsneed = nchars - charsgot if options & 0x01: # Uncompressed UTF-16 charsavail = local_min((datalen - pos) >> 1, charsneed) rawstrg = data[pos : pos + 2 * charsavail] # if DEBUG: print "SST U16: nchars=%d pos=%d rawstrg=%r" % (nchars, pos, rawstrg) try: accstrg += str(rawstrg, "utf_16_le") except: # print "SST U16: nchars=%d pos=%d rawstrg=%r" % (nchars, pos, rawstrg) # Probable cause: dodgy data e.g. unfinished surrogate pair. # E.g. file unicode2.xls in pyExcelerator's examples has cells containing # unichr(i) for i in range(0x100000) # so this will include 0xD800 etc raise pos += 2 * charsavail else: # Note: this is COMPRESSED (not ASCII!) encoding!!! charsavail = local_min(datalen - pos, charsneed) rawstrg = data[pos : pos + charsavail] # if DEBUG: print "SST CMPRSD: nchars=%d pos=%d rawstrg=%r" % (nchars, pos, rawstrg) accstrg += str(rawstrg, latin_1) pos += charsavail charsgot += charsavail if charsgot == nchars: break datainx += 1 data = datatab[datainx] datalen = len(data) options = local_BYTES_ORD(data[0]) pos = 1 if rtcount: runs = [] for runindex in range(rtcount): if pos == datalen: pos = 0 datainx += 1 data = datatab[datainx] datalen = len(data) runs.append(local_unpack("<HH", data[pos : pos + 4])) pos += 4 richtext_runs[len(strings)] = runs pos += phosz # size of the phonetic stuff to skip if pos >= datalen: # adjust to correct position in next record pos = pos - datalen datainx += 1 if datainx < ndatas: data = datatab[datainx] datalen = len(data) else: assert _unused_i == nstrings - 1 strappend(accstrg) return strings, richtext_runs