Lib.xlrd.book.py Maven / Gradle / Ivy
# Copyright (c) 2005-2012 Stephen John Machin, Lingfo Pty Ltd
# This module is part of the xlrd package, which is released under a
# BSD-style licence.
from __future__ import print_function
from .timemachine import *
from .biffh import *
import struct; unpack = struct.unpack
import sys
import time
from . import sheet
from . import compdoc
from .formula import *
from . import formatting
if sys.version.startswith("IronPython"):
# print >> sys.stderr, "...importing encodings"
import encodings
empty_cell = sheet.empty_cell # for exposure to the world ...
DEBUG = 0
USE_FANCY_CD = 1
TOGGLE_GC = 0
import gc
# gc.set_debug(gc.DEBUG_STATS)
try:
import mmap
MMAP_AVAILABLE = 1
except ImportError:
MMAP_AVAILABLE = 0
USE_MMAP = MMAP_AVAILABLE
MY_EOF = 0xF00BAAA # not a 16-bit number
SUPBOOK_UNK, SUPBOOK_INTERNAL, SUPBOOK_EXTERNAL, SUPBOOK_ADDIN, SUPBOOK_DDEOLE = range(5)
SUPPORTED_VERSIONS = (80, 70, 50, 45, 40, 30, 21, 20)
_code_from_builtin_name = {
"Consolidate_Area": "\x00",
"Auto_Open": "\x01",
"Auto_Close": "\x02",
"Extract": "\x03",
"Database": "\x04",
"Criteria": "\x05",
"Print_Area": "\x06",
"Print_Titles": "\x07",
"Recorder": "\x08",
"Data_Form": "\x09",
"Auto_Activate": "\x0A",
"Auto_Deactivate": "\x0B",
"Sheet_Title": "\x0C",
"_FilterDatabase": "\x0D",
}
builtin_name_from_code = {}
code_from_builtin_name = {}
for _bin, _bic in _code_from_builtin_name.items():
_bin = UNICODE_LITERAL(_bin)
_bic = UNICODE_LITERAL(_bic)
code_from_builtin_name[_bin] = _bic
builtin_name_from_code[_bic] = _bin
del _bin, _bic, _code_from_builtin_name
def open_workbook_xls(filename=None,
logfile=sys.stdout, verbosity=0, use_mmap=USE_MMAP,
file_contents=None,
encoding_override=None,
formatting_info=False, on_demand=False, ragged_rows=False,
):
t0 = time.clock()
if TOGGLE_GC:
orig_gc_enabled = gc.isenabled()
if orig_gc_enabled:
gc.disable()
bk = Book()
try:
bk.biff2_8_load(
filename=filename, file_contents=file_contents,
logfile=logfile, verbosity=verbosity, use_mmap=use_mmap,
encoding_override=encoding_override,
formatting_info=formatting_info,
on_demand=on_demand,
ragged_rows=ragged_rows,
)
t1 = time.clock()
bk.load_time_stage_1 = t1 - t0
biff_version = bk.getbof(XL_WORKBOOK_GLOBALS)
if not biff_version:
raise XLRDError("Can't determine file's BIFF version")
if biff_version not in SUPPORTED_VERSIONS:
raise XLRDError(
"BIFF version %s is not supported"
% biff_text_from_num[biff_version]
)
bk.biff_version = biff_version
if biff_version <= 40:
# no workbook globals, only 1 worksheet
if on_demand:
fprintf(bk.logfile,
"*** WARNING: on_demand is not supported for this Excel version.\n"
"*** Setting on_demand to False.\n")
bk.on_demand = on_demand = False
bk.fake_globals_get_sheet()
elif biff_version == 45:
# worksheet(s) embedded in global stream
bk.parse_globals()
if on_demand:
fprintf(bk.logfile, "*** WARNING: on_demand is not supported for this Excel version.\n"
"*** Setting on_demand to False.\n")
bk.on_demand = on_demand = False
else:
bk.parse_globals()
bk._sheet_list = [None for sh in bk._sheet_names]
if not on_demand:
bk.get_sheets()
bk.nsheets = len(bk._sheet_list)
if biff_version == 45 and bk.nsheets > 1:
fprintf(bk.logfile,
"*** WARNING: Excel 4.0 workbook (.XLW) file contains %d worksheets.\n"
"*** Book-level data will be that of the last worksheet.\n",
bk.nsheets
)
if TOGGLE_GC:
if orig_gc_enabled:
gc.enable()
t2 = time.clock()
bk.load_time_stage_2 = t2 - t1
except:
bk.release_resources()
raise
# normal exit
if not on_demand:
bk.release_resources()
return bk
##
# For debugging: dump the file's BIFF records in char & hex.
# @param filename The path to the file to be dumped.
# @param outfile An open file, to which the dump is written.
# @param unnumbered If true, omit offsets (for meaningful diffs).
def dump(filename, outfile=sys.stdout, unnumbered=False):
bk = Book()
bk.biff2_8_load(filename=filename, logfile=outfile, )
biff_dump(bk.mem, bk.base, bk.stream_len, 0, outfile, unnumbered)
##
# For debugging and analysis: summarise the file's BIFF records.
# I.e. produce a sorted file of (record_name, count).
# @param filename The path to the file to be summarised.
# @param outfile An open file, to which the summary is written.
def count_records(filename, outfile=sys.stdout):
bk = Book()
bk.biff2_8_load(filename=filename, logfile=outfile, )
biff_count_records(bk.mem, bk.base, bk.stream_len, outfile)
##
# Information relating to a named reference, formula, macro, etc.
#
-- New in version 0.6.0
#
-- Name information is not extracted from files older than
# Excel 5.0 (Book.biff_version < 50)
class Name(BaseObject):
_repr_these = ['stack']
book = None # parent
##
# 0 = Visible; 1 = Hidden
hidden = 0
##
# 0 = Command macro; 1 = Function macro. Relevant only if macro == 1
func = 0
##
# 0 = Sheet macro; 1 = VisualBasic macro. Relevant only if macro == 1
vbasic = 0
##
# 0 = Standard name; 1 = Macro name
macro = 0
##
# 0 = Simple formula; 1 = Complex formula (array formula or user defined)
# No examples have been sighted.
complex = 0
##
# 0 = User-defined name; 1 = Built-in name
# (common examples: Print_Area, Print_Titles; see OOo docs for full list)
builtin = 0
##
# Function group. Relevant only if macro == 1; see OOo docs for values.
funcgroup = 0
##
# 0 = Formula definition; 1 = Binary data
No examples have been sighted.
binary = 0
##
# The index of this object in book.name_obj_list
name_index = 0
##
# A Unicode string. If builtin, decoded as per OOo docs.
name = UNICODE_LITERAL("")
##
# An 8-bit string.
raw_formula = b''
##
# -1: The name is global (visible in all calculation sheets).
# -2: The name belongs to a macro sheet or VBA sheet.
# -3: The name is invalid.
# 0 <= scope < book.nsheets: The name is local to the sheet whose index is scope.
scope = -1
##
# The result of evaluating the formula, if any.
# If no formula, or evaluation of the formula encountered problems,
# the result is None. Otherwise the result is a single instance of the
# Operand class.
#
result = None
##
# This is a convenience method for the frequent use case where the name
# refers to a single cell.
# @return An instance of the Cell class.
# @throws XLRDError The name is not a constant absolute reference
# to a single cell.
def cell(self):
res = self.result
if res:
# result should be an instance of the Operand class
kind = res.kind
value = res.value
if kind == oREF and len(value) == 1:
ref3d = value[0]
if (0 <= ref3d.shtxlo == ref3d.shtxhi - 1
and ref3d.rowxlo == ref3d.rowxhi - 1
and ref3d.colxlo == ref3d.colxhi - 1):
sh = self.book.sheet_by_index(ref3d.shtxlo)
return sh.cell(ref3d.rowxlo, ref3d.colxlo)
self.dump(self.book.logfile,
header="=== Dump of Name object ===",
footer="======= End of dump =======",
)
raise XLRDError("Not a constant absolute reference to a single cell")
##
# This is a convenience method for the use case where the name
# refers to one rectangular area in one worksheet.
# @param clipped If true (the default), the returned rectangle is clipped
# to fit in (0, sheet.nrows, 0, sheet.ncols) -- it is guaranteed that
# 0 <= rowxlo <= rowxhi <= sheet.nrows and that the number of usable rows
# in the area (which may be zero) is rowxhi - rowxlo; likewise for columns.
# @return a tuple (sheet_object, rowxlo, rowxhi, colxlo, colxhi).
# @throws XLRDError The name is not a constant absolute reference
# to a single area in a single sheet.
def area2d(self, clipped=True):
res = self.result
if res:
# result should be an instance of the Operand class
kind = res.kind
value = res.value
if kind == oREF and len(value) == 1: # only 1 reference
ref3d = value[0]
if 0 <= ref3d.shtxlo == ref3d.shtxhi - 1: # only 1 usable sheet
sh = self.book.sheet_by_index(ref3d.shtxlo)
if not clipped:
return sh, ref3d.rowxlo, ref3d.rowxhi, ref3d.colxlo, ref3d.colxhi
rowxlo = min(ref3d.rowxlo, sh.nrows)
rowxhi = max(rowxlo, min(ref3d.rowxhi, sh.nrows))
colxlo = min(ref3d.colxlo, sh.ncols)
colxhi = max(colxlo, min(ref3d.colxhi, sh.ncols))
assert 0 <= rowxlo <= rowxhi <= sh.nrows
assert 0 <= colxlo <= colxhi <= sh.ncols
return sh, rowxlo, rowxhi, colxlo, colxhi
self.dump(self.book.logfile,
header="=== Dump of Name object ===",
footer="======= End of dump =======",
)
raise XLRDError("Not a constant absolute reference to a single area in a single sheet")
##
# Contents of a "workbook".
# WARNING: You don't call this class yourself. You use the Book object that
# was returned when you called xlrd.open_workbook("myfile.xls").
class Book(BaseObject):
##
# The number of worksheets present in the workbook file.
# This information is available even when no sheets have yet been loaded.
nsheets = 0
##
# Which date system was in force when this file was last saved.
# 0 => 1900 system (the Excel for Windows default).
# 1 => 1904 system (the Excel for Macintosh default).
datemode = 0 # In case it's not specified in the file.
##
# Version of BIFF (Binary Interchange File Format) used to create the file.
# Latest is 8.0 (represented here as 80), introduced with Excel 97.
# Earliest supported by this module: 2.0 (represented as 20).
biff_version = 0
##
# List containing a Name object for each NAME record in the workbook.
#
-- New in version 0.6.0
name_obj_list = []
##
# An integer denoting the character set used for strings in this file.
# For BIFF 8 and later, this will be 1200, meaning Unicode; more precisely, UTF_16_LE.
# For earlier versions, this is used to derive the appropriate Python encoding
# to be used to convert to Unicode.
# Examples: 1252 -> 'cp1252', 10000 -> 'mac_roman'
codepage = None
##
# The encoding that was derived from the codepage.
encoding = None
##
# A tuple containing the (telephone system) country code for:
# [0]: the user-interface setting when the file was created.
# [1]: the regional settings.
# Example: (1, 61) meaning (USA, Australia).
# This information may give a clue to the correct encoding for an unknown codepage.
# For a long list of observed values, refer to the OpenOffice.org documentation for
# the COUNTRY record.
countries = (0, 0)
##
# What (if anything) is recorded as the name of the last user to save the file.
user_name = UNICODE_LITERAL('')
##
# A list of Font class instances, each corresponding to a FONT record.
#
-- New in version 0.6.1
font_list = []
##
# A list of XF class instances, each corresponding to an XF record.
#
-- New in version 0.6.1
xf_list = []
##
# A list of Format objects, each corresponding to a FORMAT record, in
# the order that they appear in the input file.
# It does not contain builtin formats.
# If you are creating an output file using (for example) pyExcelerator,
# use this list.
# The collection to be used for all visual rendering purposes is format_map.
#
-- New in version 0.6.1
format_list = []
##
# The mapping from XF.format_key to Format object.
#
-- New in version 0.6.1
format_map = {}
##
# This provides access via name to the extended format information for
# both built-in styles and user-defined styles.
# It maps name to (built_in, xf_index), where:
# name is either the name of a user-defined style,
# or the name of one of the built-in styles. Known built-in names are
# Normal, RowLevel_1 to RowLevel_7,
# ColLevel_1 to ColLevel_7, Comma, Currency, Percent, "Comma [0]",
# "Currency [0]", Hyperlink, and "Followed Hyperlink".
# built_in 1 = built-in style, 0 = user-defined
# xf_index is an index into Book.xf_list.
# References: OOo docs s6.99 (STYLE record); Excel UI Format/Style
#
-- New in version 0.6.1; since 0.7.4, extracted only if
# open_workbook(..., formatting_info=True)
style_name_map = {}
##
# This provides definitions for colour indexes. Please refer to the
# above section "The Palette; Colour Indexes" for an explanation
# of how colours are represented in Excel.
# Colour indexes into the palette map into (red, green, blue) tuples.
# "Magic" indexes e.g. 0x7FFF map to None.
# colour_map is what you need if you want to render cells on screen or in a PDF
# file. If you are writing an output XLS file, use palette_record.
#
-- New in version 0.6.1. Extracted only if open_workbook(..., formatting_info=True)
colour_map = {}
##
# If the user has changed any of the colours in the standard palette, the XLS
# file will contain a PALETTE record with 56 (16 for Excel 4.0 and earlier)
# RGB values in it, and this list will be e.g. [(r0, b0, g0), ..., (r55, b55, g55)].
# Otherwise this list will be empty. This is what you need if you are
# writing an output XLS file. If you want to render cells on screen or in a PDF
# file, use colour_map.
#
-- New in version 0.6.1. Extracted only if open_workbook(..., formatting_info=True)
palette_record = []
##
# Time in seconds to extract the XLS image as a contiguous string (or mmap equivalent).
load_time_stage_1 = -1.0
##
# Time in seconds to parse the data from the contiguous string (or mmap equivalent).
load_time_stage_2 = -1.0
##
# @return A list of all sheets in the book.
# All sheets not already loaded will be loaded.
def sheets(self):
for sheetx in xrange(self.nsheets):
if not self._sheet_list[sheetx]:
self.get_sheet(sheetx)
return self._sheet_list[:]
##
# @param sheetx Sheet index in range(nsheets)
# @return An object of the Sheet class
def sheet_by_index(self, sheetx):
return self._sheet_list[sheetx] or self.get_sheet(sheetx)
##
# @param sheet_name Name of sheet required
# @return An object of the Sheet class
def sheet_by_name(self, sheet_name):
try:
sheetx = self._sheet_names.index(sheet_name)
except ValueError:
raise XLRDError('No sheet named <%r>' % sheet_name)
return self.sheet_by_index(sheetx)
##
# @return A list of the names of all the worksheets in the workbook file.
# This information is available even when no sheets have yet been loaded.
def sheet_names(self):
return self._sheet_names[:]
##
# @param sheet_name_or_index Name or index of sheet enquired upon
# @return true if sheet is loaded, false otherwise
#
-- New in version 0.7.1
def sheet_loaded(self, sheet_name_or_index):
if isinstance(sheet_name_or_index, int):
sheetx = sheet_name_or_index
else:
try:
sheetx = self._sheet_names.index(sheet_name_or_index)
except ValueError:
raise XLRDError('No sheet named <%r>' % sheet_name_or_index)
return bool(self._sheet_list[sheetx])
##
# @param sheet_name_or_index Name or index of sheet to be unloaded.
#
-- New in version 0.7.1
def unload_sheet(self, sheet_name_or_index):
if isinstance(sheet_name_or_index, int):
sheetx = sheet_name_or_index
else:
try:
sheetx = self._sheet_names.index(sheet_name_or_index)
except ValueError:
raise XLRDError('No sheet named <%r>' % sheet_name_or_index)
self._sheet_list[sheetx] = None
##
# This method has a dual purpose. You can call it to release
# memory-consuming objects and (possibly) a memory-mapped file
# (mmap.mmap object) when you have finished loading sheets in
# on_demand mode, but still require the Book object to examine the
# loaded sheets. It is also called automatically (a) when open_workbook
# raises an exception and (b) if you are using a "with" statement, when
# the "with" block is exited. Calling this method multiple times on the
# same object has no ill effect.
def release_resources(self):
self._resources_released = 1
if hasattr(self.mem, "close"):
# must be a mmap.mmap object
self.mem.close()
self.mem = None
if hasattr(self.filestr, "close"):
self.filestr.close()
self.filestr = None
self._sharedstrings = None
self._rich_text_runlist_map = None
def __enter__(self):
return self
def __exit__(self, exc_type, exc_value, exc_tb):
self.release_resources()
# return false
##
# A mapping from (lower_case_name, scope) to a single Name object.
#
-- New in version 0.6.0
name_and_scope_map = {}
##
# A mapping from lower_case_name to a list of Name objects. The list is
# sorted in scope order. Typically there will be one item (of global scope)
# in the list.
#
-- New in version 0.6.0
name_map = {}
def __init__(self):
self._sheet_list = []
self._sheet_names = []
self._sheet_visibility = [] # from BOUNDSHEET record
self.nsheets = 0
self._sh_abs_posn = [] # sheet's absolute position in the stream
self._sharedstrings = []
self._rich_text_runlist_map = {}
self.raw_user_name = False
self._sheethdr_count = 0 # BIFF 4W only
self.builtinfmtcount = -1 # unknown as yet. BIFF 3, 4S, 4W
self.initialise_format_info()
self._all_sheets_count = 0 # includes macro & VBA sheets
self._supbook_count = 0
self._supbook_locals_inx = None
self._supbook_addins_inx = None
self._all_sheets_map = [] # maps an all_sheets index to a calc-sheets index (or -1)
self._externsheet_info = []
self._externsheet_type_b57 = []
self._extnsht_name_from_num = {}
self._sheet_num_from_name = {}
self._extnsht_count = 0
self._supbook_types = []
self._resources_released = 0
self.addin_func_names = []
self.name_obj_list = []
self.colour_map = {}
self.palette_record = []
self.xf_list = []
self.style_name_map = {}
self.mem = b''
self.filestr = b''
def biff2_8_load(self, filename=None, file_contents=None,
logfile=sys.stdout, verbosity=0, use_mmap=USE_MMAP,
encoding_override=None,
formatting_info=False,
on_demand=False,
ragged_rows=False,
):
# DEBUG = 0
self.logfile = logfile
self.verbosity = verbosity
self.use_mmap = use_mmap and MMAP_AVAILABLE
self.encoding_override = encoding_override
self.formatting_info = formatting_info
self.on_demand = on_demand
self.ragged_rows = ragged_rows
if not file_contents:
with open(filename, "rb") as f:
f.seek(0, 2) # EOF
size = f.tell()
f.seek(0, 0) # BOF
if size == 0:
raise XLRDError("File size is 0 bytes")
if self.use_mmap:
self.filestr = mmap.mmap(f.fileno(), size, access=mmap.ACCESS_READ)
self.stream_len = size
else:
self.filestr = f.read()
self.stream_len = len(self.filestr)
else:
self.filestr = file_contents
self.stream_len = len(file_contents)
self.base = 0
if self.filestr[:8] != compdoc.SIGNATURE:
# got this one at the antique store
self.mem = self.filestr
else:
cd = compdoc.CompDoc(self.filestr, logfile=self.logfile)
if USE_FANCY_CD:
for qname in ['Workbook', 'Book']:
self.mem, self.base, self.stream_len = \
cd.locate_named_stream(UNICODE_LITERAL(qname))
if self.mem: break
else:
raise XLRDError("Can't find workbook in OLE2 compound document")
else:
for qname in ['Workbook', 'Book']:
self.mem = cd.get_named_stream(UNICODE_LITERAL(qname))
if self.mem: break
else:
raise XLRDError("Can't find workbook in OLE2 compound document")
self.stream_len = len(self.mem)
del cd
if self.mem is not self.filestr:
if hasattr(self.filestr, "close"):
self.filestr.close()
self.filestr = b''
self._position = self.base
if DEBUG:
print("mem: %s, base: %d, len: %d" % (type(self.mem), self.base, self.stream_len), file=self.logfile)
def initialise_format_info(self):
# needs to be done once per sheet for BIFF 4W :-(
self.format_map = {}
self.format_list = []
self.xfcount = 0
self.actualfmtcount = 0 # number of FORMAT records seen so far
self._xf_index_to_xl_type_map = {0: XL_CELL_NUMBER}
self._xf_epilogue_done = 0
self.xf_list = []
self.font_list = []
def get2bytes(self):
pos = self._position
buff_two = self.mem[pos:pos+2]
lenbuff = len(buff_two)
self._position += lenbuff
if lenbuff < 2:
return MY_EOF
lo, hi = buff_two
return (BYTES_ORD(hi) << 8) | BYTES_ORD(lo)
def get_record_parts(self):
pos = self._position
mem = self.mem
code, length = unpack('= 2:
fprintf(self.logfile,
"BOUNDSHEET: inx=%d vis=%r sheet_name=%r abs_posn=%d sheet_type=0x%02x\n",
self._all_sheets_count, visibility, sheet_name, abs_posn, sheet_type)
self._all_sheets_count += 1
if sheet_type != XL_BOUNDSHEET_WORKSHEET:
self._all_sheets_map.append(-1)
descr = {
1: 'Macro sheet',
2: 'Chart',
6: 'Visual Basic module',
}.get(sheet_type, 'UNKNOWN')
if DEBUG or self.verbosity >= 1:
fprintf(self.logfile,
"NOTE *** Ignoring non-worksheet data named %r (type 0x%02x = %s)\n",
sheet_name, sheet_type, descr)
else:
snum = len(self._sheet_names)
self._all_sheets_map.append(snum)
self._sheet_names.append(sheet_name)
self._sh_abs_posn.append(abs_posn)
self._sheet_visibility.append(visibility)
self._sheet_num_from_name[sheet_name] = snum
def handle_builtinfmtcount(self, data):
### N.B. This count appears to be utterly useless.
# DEBUG = 1
builtinfmtcount = unpack('= 2:
fprintf(self.logfile, "*** No CODEPAGE record; assuming 1200 (utf_16_le)\n")
else:
codepage = self.codepage
if codepage in encoding_from_codepage:
encoding = encoding_from_codepage[codepage]
elif 300 <= codepage <= 1999:
encoding = 'cp' + str(codepage)
else:
encoding = 'unknown_codepage_' + str(codepage)
if DEBUG or (self.verbosity and encoding != self.encoding) :
fprintf(self.logfile, "CODEPAGE: codepage %r -> encoding %r\n", codepage, encoding)
self.encoding = encoding
if self.codepage != 1200: # utf_16_le
# If we don't have a codec that can decode ASCII into Unicode,
# we're well & truly stuffed -- let the punter know ASAP.
try:
_unused = unicode(b'trial', self.encoding)
except BaseException as e:
fprintf(self.logfile,
"ERROR *** codepage %r -> encoding %r -> %s: %s\n",
self.codepage, self.encoding, type(e).__name__.split(".")[-1], e)
raise
if self.raw_user_name:
strg = unpack_string(self.user_name, 0, self.encoding, lenlen=1)
strg = strg.rstrip()
# if DEBUG:
# print "CODEPAGE: user name decoded from %r to %r" % (self.user_name, strg)
self.user_name = strg
self.raw_user_name = False
return self.encoding
def handle_codepage(self, data):
# DEBUG = 0
codepage = unpack('= 2
if self.biff_version >= 80:
option_flags, other_info =unpack("= 1
blah2 = DEBUG or self.verbosity >= 2
if self.biff_version >= 80:
num_refs = unpack("= 2:
logf = self.logfile
fprintf(logf, "FILEPASS:\n")
hex_char_dump(data, 0, len(data), base=0, fout=logf)
if self.biff_version >= 80:
kind1, = unpack('= 2
bv = self.biff_version
if bv < 50:
return
self.derive_encoding()
# print
# hex_char_dump(data, 0, len(data), fout=self.logfile)
(
option_flags, kb_shortcut, name_len, fmla_len, extsht_index, sheet_index,
menu_text_len, description_text_len, help_topic_text_len, status_bar_text_len,
) = unpack("> nshift)
macro_flag = " M"[nobj.macro]
if bv < 80:
internal_name, pos = unpack_string_update_pos(data, 14, self.encoding, known_len=name_len)
else:
internal_name, pos = unpack_unicode_update_pos(data, 14, known_len=name_len)
nobj.extn_sheet_num = extsht_index
nobj.excel_sheet_index = sheet_index
nobj.scope = None # patched up in the names_epilogue() method
if blah:
fprintf(
self.logfile,
"NAME[%d]:%s oflags=%d, name_len=%d, fmla_len=%d, extsht_index=%d, sheet_index=%d, name=%r\n",
name_index, macro_flag, option_flags, name_len,
fmla_len, extsht_index, sheet_index, internal_name)
name = internal_name
if nobj.builtin:
name = builtin_name_from_code.get(name, "??Unknown??")
if blah: print(" builtin: %s" % name, file=self.logfile)
nobj.name = name
nobj.raw_formula = data[pos:]
nobj.basic_formula_len = fmla_len
nobj.evaluated = 0
if blah:
nobj.dump(
self.logfile,
header="--- handle_name: name[%d] ---" % name_index,
footer="-------------------",
)
def names_epilogue(self):
blah = self.verbosity >= 2
f = self.logfile
if blah:
print("+++++ names_epilogue +++++", file=f)
print("_all_sheets_map", REPR(self._all_sheets_map), file=f)
print("_extnsht_name_from_num", REPR(self._extnsht_name_from_num), file=f)
print("_sheet_num_from_name", REPR(self._sheet_num_from_name), file=f)
num_names = len(self.name_obj_list)
for namex in range(num_names):
nobj = self.name_obj_list[namex]
# Convert from excel_sheet_index to scope.
# This is done here because in BIFF7 and earlier, the
# BOUNDSHEET records (from which _all_sheets_map is derived)
# come after the NAME records.
if self.biff_version >= 80:
sheet_index = nobj.excel_sheet_index
if sheet_index == 0:
intl_sheet_index = -1 # global
elif 1 <= sheet_index <= len(self._all_sheets_map):
intl_sheet_index = self._all_sheets_map[sheet_index-1]
if intl_sheet_index == -1: # maps to a macro or VBA sheet
intl_sheet_index = -2 # valid sheet reference but not useful
else:
# huh?
intl_sheet_index = -3 # invalid
elif 50 <= self.biff_version <= 70:
sheet_index = nobj.extn_sheet_num
if sheet_index == 0:
intl_sheet_index = -1 # global
else:
sheet_name = self._extnsht_name_from_num[sheet_index]
intl_sheet_index = self._sheet_num_from_name.get(sheet_name, -2)
nobj.scope = intl_sheet_index
for namex in range(num_names):
nobj = self.name_obj_list[namex]
# Parse the formula ...
if nobj.macro or nobj.binary: continue
if nobj.evaluated: continue
evaluate_name_formula(self, nobj, namex, blah=blah)
if self.verbosity >= 2:
print("---------- name object dump ----------", file=f)
for namex in range(num_names):
nobj = self.name_obj_list[namex]
nobj.dump(f, header="--- name[%d] ---" % namex)
print("--------------------------------------", file=f)
#
# Build some dicts for access to the name objects
#
name_and_scope_map = {} # (name.lower(), scope): Name_object
name_map = {} # name.lower() : list of Name_objects (sorted in scope order)
for namex in range(num_names):
nobj = self.name_obj_list[namex]
name_lcase = nobj.name.lower()
key = (name_lcase, nobj.scope)
if key in name_and_scope_map and self.verbosity:
fprintf(f, 'Duplicate entry %r in name_and_scope_map\n', key)
name_and_scope_map[key] = nobj
sort_data = (nobj.scope, namex, nobj)
# namex (a temp unique ID) ensures the Name objects will not
# be compared (fatal in py3)
if name_lcase in name_map:
name_map[name_lcase].append(sort_data)
else:
name_map[name_lcase] = [sort_data]
for key in name_map.keys():
alist = name_map[key]
alist.sort()
name_map[key] = [x[2] for x in alist]
self.name_and_scope_map = name_and_scope_map
self.name_map = name_map
def handle_obj(self, data):
# Not doing much handling at all.
# Worrying about embedded (BOF ... EOF) substreams is done elsewhere.
# DEBUG = 1
obj_type, obj_id = unpack(' handle_obj type=%d id=0x%08x" % (obj_type, obj_id)
def handle_supbook(self, data):
# aka EXTERNALBOOK in OOo docs
self._supbook_types.append(None)
blah = DEBUG or self.verbosity >= 2
if blah:
print("SUPBOOK:", file=self.logfile)
hex_char_dump(data, 0, len(data), fout=self.logfile)
num_sheets = unpack("= 2:
fprintf(self.logfile, "SST: unique strings: %d\n", uniquestrings)
while 1:
code, nb, data = self.get_record_parts_conditional(XL_CONTINUE)
if code is None:
break
nbt += nb
if DEBUG >= 2:
fprintf(self.logfile, "CONTINUE: adding %d bytes to SST -> %d\n", nb, nbt)
strlist.append(data)
self._sharedstrings, rt_runlist = unpack_SST_table(strlist, uniquestrings)
if self.formatting_info:
self._rich_text_runlist_map = rt_runlist
if DEBUG:
t1 = time.time()
print("SST processing took %.2f seconds" % (t1 - t0, ), file=self.logfile)
def handle_writeaccess(self, data):
DEBUG = 0
if self.biff_version < 80:
if not self.encoding:
self.raw_user_name = True
self.user_name = data
return
strg = unpack_string(data, 0, self.encoding, lenlen=1)
else:
strg = unpack_unicode(data, 0, lenlen=2)
if DEBUG: fprintf(self.logfile, "WRITEACCESS: %d bytes; raw=%s %r\n", len(data), self.raw_user_name, strg)
strg = strg.rstrip()
self.user_name = strg
def parse_globals(self):
# DEBUG = 0
# no need to position, just start reading (after the BOF)
formatting.initialise_book(self)
while 1:
rc, length, data = self.get_record_parts()
if DEBUG: print("parse_globals: record code is 0x%04x" % rc, file=self.logfile)
if rc == XL_SST:
self.handle_sst(data)
elif rc == XL_FONT or rc == XL_FONT_B3B4:
self.handle_font(data)
elif rc == XL_FORMAT: # XL_FORMAT2 is BIFF <= 3.0, can't appear in globals
self.handle_format(data)
elif rc == XL_XF:
self.handle_xf(data)
elif rc == XL_BOUNDSHEET:
self.handle_boundsheet(data)
elif rc == XL_DATEMODE:
self.handle_datemode(data)
elif rc == XL_CODEPAGE:
self.handle_codepage(data)
elif rc == XL_COUNTRY:
self.handle_country(data)
elif rc == XL_EXTERNNAME:
self.handle_externname(data)
elif rc == XL_EXTERNSHEET:
self.handle_externsheet(data)
elif rc == XL_FILEPASS:
self.handle_filepass(data)
elif rc == XL_WRITEACCESS:
self.handle_writeaccess(data)
elif rc == XL_SHEETSOFFSET:
self.handle_sheetsoffset(data)
elif rc == XL_SHEETHDR:
self.handle_sheethdr(data)
elif rc == XL_SUPBOOK:
self.handle_supbook(data)
elif rc == XL_NAME:
self.handle_name(data)
elif rc == XL_PALETTE:
self.handle_palette(data)
elif rc == XL_STYLE:
self.handle_style(data)
elif rc & 0xff == 9 and self.verbosity:
fprintf(self.logfile, "*** Unexpected BOF at posn %d: 0x%04x len=%d data=%r\n",
self._position - length - 4, rc, length, data)
elif rc == XL_EOF:
self.xf_epilogue()
self.names_epilogue()
self.palette_epilogue()
if not self.encoding:
self.derive_encoding()
if self.biff_version == 45:
# DEBUG = 0
if DEBUG: print("global EOF: position", self._position, file=self.logfile)
# if DEBUG:
# pos = self._position - 4
# print repr(self.mem[pos:pos+40])
return
else:
# if DEBUG:
# print >> self.logfile, "parse_globals: ignoring record code 0x%04x" % rc
pass
def read(self, pos, length):
data = self.mem[pos:pos+length]
self._position = pos + len(data)
return data
def getbof(self, rqd_stream):
# DEBUG = 1
# if DEBUG: print >> self.logfile, "getbof(): position", self._position
if DEBUG: print("reqd: 0x%04x" % rqd_stream, file=self.logfile)
def bof_error(msg):
raise XLRDError('Unsupported format, or corrupt file: ' + msg)
savpos = self._position
opcode = self.get2bytes()
if opcode == MY_EOF:
bof_error('Expected BOF record; met end of file')
if opcode not in bofcodes:
bof_error('Expected BOF record; found %r' % self.mem[savpos:savpos+8])
length = self.get2bytes()
if length == MY_EOF:
bof_error('Incomplete BOF record[1]; met end of file')
if not (4 <= length <= 20):
bof_error(
'Invalid length (%d) for BOF record type 0x%04x'
% (length, opcode))
padding = b'\0' * max(0, boflen[opcode] - length)
data = self.read(self._position, length);
if DEBUG: fprintf(self.logfile, "\ngetbof(): data=%r\n", data)
if len(data) < length:
bof_error('Incomplete BOF record[2]; met end of file')
data += padding
version1 = opcode >> 8
version2, streamtype = unpack('= 2:
print("BOF: op=0x%04x vers=0x%04x stream=0x%04x buildid=%d buildyr=%d -> BIFF%d" \
% (opcode, version2, streamtype, build, year, version), file=self.logfile)
got_globals = streamtype == XL_WORKBOOK_GLOBALS or (
version == 45 and streamtype == XL_WORKBOOK_GLOBALS_4W)
if (rqd_stream == XL_WORKBOOK_GLOBALS and got_globals) or streamtype == rqd_stream:
return version
if version < 50 and streamtype == XL_WORKSHEET:
return version
if version >= 50 and streamtype == 0x0100:
bof_error("Workspace file -- no spreadsheet data")
bof_error(
'BOF not workbook/worksheet: op=0x%04x vers=0x%04x strm=0x%04x build=%d year=%d -> BIFF%d' \
% (opcode, version2, streamtype, build, year, version)
)
# === helper functions
def expand_cell_address(inrow, incol):
# Ref : OOo docs, "4.3.4 Cell Addresses in BIFF8"
outrow = inrow
if incol & 0x8000:
if outrow >= 32768:
outrow -= 65536
relrow = 1
else:
relrow = 0
outcol = incol & 0xFF
if incol & 0x4000:
if outcol >= 128:
outcol -= 256
relcol = 1
else:
relcol = 0
return outrow, outcol, relrow, relcol
def colname(colx, _A2Z="ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
assert colx >= 0
name = UNICODE_LITERAL('')
while 1:
quot, rem = divmod(colx, 26)
name = _A2Z[rem] + name
if not quot:
return name
colx = quot - 1
def display_cell_address(rowx, colx, relrow, relcol):
if relrow:
rowpart = "(*%s%d)" % ("+-"[rowx < 0], abs(rowx))
else:
rowpart = "$%d" % (rowx+1,)
if relcol:
colpart = "(*%s%d)" % ("+-"[colx < 0], abs(colx))
else:
colpart = "$" + colname(colx)
return colpart + rowpart
def unpack_SST_table(datatab, nstrings):
"Return list of strings"
datainx = 0
ndatas = len(datatab)
data = datatab[0]
datalen = len(data)
pos = 8
strings = []
strappend = strings.append
richtext_runs = {}
local_unpack = unpack
local_min = min
local_BYTES_ORD = BYTES_ORD
latin_1 = "latin_1"
for _unused_i in xrange(nstrings):
nchars = local_unpack('> 1, charsneed)
rawstrg = data[pos:pos+2*charsavail]
# if DEBUG: print "SST U16: nchars=%d pos=%d rawstrg=%r" % (nchars, pos, rawstrg)
try:
accstrg += unicode(rawstrg, "utf_16_le")
except:
# print "SST U16: nchars=%d pos=%d rawstrg=%r" % (nchars, pos, rawstrg)
# Probable cause: dodgy data e.g. unfinished surrogate pair.
# E.g. file unicode2.xls in pyExcelerator's examples has cells containing
# unichr(i) for i in range(0x100000)
# so this will include 0xD800 etc
raise
pos += 2*charsavail
else:
# Note: this is COMPRESSED (not ASCII!) encoding!!!
charsavail = local_min(datalen - pos, charsneed)
rawstrg = data[pos:pos+charsavail]
# if DEBUG: print "SST CMPRSD: nchars=%d pos=%d rawstrg=%r" % (nchars, pos, rawstrg)
accstrg += unicode(rawstrg, latin_1)
pos += charsavail
charsgot += charsavail
if charsgot == nchars:
break
datainx += 1
data = datatab[datainx]
datalen = len(data)
options = local_BYTES_ORD(data[0])
pos = 1
if rtcount:
runs = []
for runindex in xrange(rtcount):
if pos == datalen:
pos = 0
datainx += 1
data = datatab[datainx]
datalen = len(data)
runs.append(local_unpack("= datalen:
# adjust to correct position in next record
pos = pos - datalen
datainx += 1
if datainx < ndatas:
data = datatab[datainx]
datalen = len(data)
else:
assert _unused_i == nstrings - 1
strappend(accstrg)
return strings, richtext_runs
© 2015 - 2025 Weber Informatics LLC | Privacy Policy