# Source code for wolframclient.deserializers.wxf.wxfparser

from __future__ import absolute_import, print_function, unicode_literals

from wolframclient.exception import WolframParserException
from wolframclient.serializers.wxfencoder import constants
from wolframclient.serializers.wxfencoder.serializer import (
    WXF_HEADER_COMPRESS,
    WXF_HEADER_SEPARATOR,
    WXF_VERSION,
    SerializationContext,
)
from wolframclient.serializers.wxfencoder.streaming import ExactSizeReader, ZipCompressedReader
from wolframclient.utils import six


class WXFParser(object):
    """Parse a WXF input.

    This class is initialized with a WXF input, and exposes a generator of
    :class:`~wolframclient.deserializers.wxf.wxfparser.WXFToken`.
    The input `wxf_input` is either a binary string (or buffer) holding the serialized
    expression, or an object implementing a `read` method. Any other input is rejected
    with :class:`TypeError`; in particular a text string is *not* opened as a filename
    by this class.

    The generator outputs WXF tokens one by one::

        with open('/tmp/data.wxf', 'rb') as fp:
            parser = WXFParser(fp)
            gen = parser.tokens()
            print(next(gen))

    This low level class is providing intermediary objects to ease the parsing of WXF. Most of
    the time one should directly use high level interface such as
    :func:`~wolframclient.deserializers.wxf.wxfparser.binary_deserialize`.

    The token generator is generally consumed by an instance of
    :class:`~wolframclient.deserializers.wxf.wxfconsumer.WXFConsumer`.
    """

    # Maps the one-byte WXF type marker to the name of the method that finishes
    # reading the corresponding token. Names (not bound methods) are stored so
    # that the lookup goes through `getattr` on the instance in `next_token`.
    _mapping = {
        constants.WXF_CONSTANTS.Symbol: "token_for_string",
        constants.WXF_CONSTANTS.String: "token_for_string",
        constants.WXF_CONSTANTS.BigInteger: "token_for_string",
        constants.WXF_CONSTANTS.BigReal: "token_for_string",
        constants.WXF_CONSTANTS.Function: "token_for_function",
        constants.WXF_CONSTANTS.BinaryString: "token_for_binary_string",
        constants.WXF_CONSTANTS.Integer8: "token_for_integer8",
        constants.WXF_CONSTANTS.Integer16: "token_for_integer16",
        constants.WXF_CONSTANTS.Integer32: "token_for_integer32",
        constants.WXF_CONSTANTS.Integer64: "token_for_integer64",
        constants.WXF_CONSTANTS.Real64: "token_for_real64",
        constants.WXF_CONSTANTS.PackedArray: "token_for_packed_array",
        constants.WXF_CONSTANTS.NumericArray: "token_for_numeric_array",
        constants.WXF_CONSTANTS.Association: "token_for_association",
        constants.WXF_CONSTANTS.Rule: "token_for_rule",
        constants.WXF_CONSTANTS.RuleDelayed: "token_for_rule",
    }

    def __init__(self, wxf_input):
        """Wrap `wxf_input` in a reader and consume the WXF header.

        :param wxf_input: a binary string / buffer with the serialized expression,
            or an object exposing a `read` method.
        :raises TypeError: if `wxf_input` is neither of the above.
        :raises WolframParserException: if the header is malformed
            (see :meth:`parse_header`).
        """
        self.context = SerializationContext()
        if isinstance(wxf_input, (six.binary_type, six.buffer_types)):
            self.reader = six.BytesIO(wxf_input)
        elif hasattr(wxf_input, "read"):
            self.reader = wxf_input
        else:
            raise TypeError(
                "Class %s neither implements a read method nor is a binary type."
                % wxf_input.__class__.__name__
            )
        # The header is always read from the raw reader; only the payload that
        # follows it goes through the (possibly decompressing) wrapper.
        _version, compress = self.parse_header()
        if compress:
            self.reader = ZipCompressedReader(self.reader)
        else:
            self.reader = ExactSizeReader(self.reader)

    def tokens(self):
        """Generate instances :class:`~wolframclient.deserializers.wxf.wxfparser.WXFToken` from a WXF input."""
        # At least one token is always read; afterwards the context decides
        # whether the expression is complete.
        yield self.next_token()
        while not self.context.is_valid_final_state():
            yield self.next_token()

    def parse_header(self):
        """Read and validate the WXF header from the current reader.

        The header is the version byte, an optional compression flag byte and
        the separator byte.

        :return: a tuple ``(version, compress)`` where `version` is the integer
            version and `compress` is a boolean.
        :raises WolframParserException: if the version byte differs from
            `WXF_VERSION` or the separator is missing.
        """
        next_byte = self.reader.read(1)
        if next_byte != WXF_VERSION:
            raise WolframParserException("Invalid version %s." % next_byte)
        version = int(next_byte)

        next_byte = self.reader.read(1)
        compress = next_byte == WXF_HEADER_COMPRESS
        if compress:
            # The compression flag is optional; skip it to reach the separator.
            next_byte = self.reader.read(1)

        if next_byte != WXF_HEADER_SEPARATOR:
            raise WolframParserException(
                "Invalid header. Failed to find header separator ':'."
            )
        return (version, compress)

    def parse_array(self, token):
        """Read rank, dimensions and raw data of an array into `token`.

        `token.array_type` must already be set; it selects the element size
        used to compute how many bytes of data to read.

        :raises WolframParserException: if the rank or any dimension is zero.
        """
        # Parsing array rank and dimensions
        rank = parse_varint(self.reader)
        if rank == 0:
            raise WolframParserException("Array rank cannot be zero.")
        dimensions = []
        for _ in range(rank):
            dim = parse_varint(self.reader)
            if dim == 0:
                raise WolframParserException("Array dimensions cannot be zero.")
            dimensions.append(dim)
        # Assign the complete list once, so the token never caches an element
        # count computed from partially filled dimensions.
        token.dimensions = dimensions
        # reading values
        bytecount = constants.ARRAY_TYPES_ELEM_SIZE[token.array_type] * token.element_count
        token.data = self.reader.read(bytecount)

    def token_for_string(self, token):
        """Read a varint length followed by that many bytes decoded as UTF-8 text."""
        self.context.add_part()
        token.length = parse_varint(self.reader)
        if token.length == 0:
            token.data = ""
        else:
            token.data = self.reader.read(token.length).decode("utf8")

        return token

    def token_for_integer8(self, token):
        """Read one byte and unpack it with `constants.StructInt8LE`."""
        self.context.add_part()
        token.data = constants.StructInt8LE.unpack(self.reader.read(1))[0]
        return token

    def token_for_integer16(self, token):
        """Read two bytes and unpack them with `constants.StructInt16LE`."""
        self.context.add_part()
        token.data = constants.StructInt16LE.unpack(self.reader.read(2))[0]
        return token

    def token_for_integer32(self, token):
        """Read four bytes and unpack them with `constants.StructInt32LE`."""
        self.context.add_part()
        token.data = constants.StructInt32LE.unpack(self.reader.read(4))[0]
        return token

    def token_for_integer64(self, token):
        """Read eight bytes and unpack them with `constants.StructInt64LE`."""
        self.context.add_part()
        token.data = constants.StructInt64LE.unpack(self.reader.read(8))[0]
        return token

    def token_for_real64(self, token):
        """Read eight bytes and unpack them with `constants.StructDouble`."""
        self.context.add_part()
        token.data = constants.StructDouble.unpack(self.reader.read(8))[0]
        return token

    def token_for_function(self, token):
        """Read the varint length of a function and step the context into it."""
        token.length = parse_varint(self.reader)
        self.context.step_into_new_function(token.length)
        return token

    def token_for_association(self, token):
        """Read the varint length of an association and step the context into it."""
        token.length = parse_varint(self.reader)
        self.context.step_into_new_assoc(token.length)
        return token

    def token_for_rule(self, token):
        """Step the context into a rule, after checking a rule is allowed here.

        :raises WolframParserException: if the context reports that a rule is
            not valid at the current position.
        """
        if not self.context.is_rule_valid():
            raise WolframParserException(
                "Rule and RuleDelayed must be parts of an Association."
            )
        self.context.step_into_new_rule()
        return token

    def token_for_packed_array(self, token):
        """Read a packed array: value type byte, then rank, dimensions and data.

        :raises WolframParserException: if the value type is not one of
            `constants.VALID_PACKED_ARRAY_TYPES`.
        """
        self.context.add_part()
        token.array_type = self.reader.read(1)
        if token.array_type not in constants.VALID_PACKED_ARRAY_TYPES:
            raise WolframParserException(
                "Invalid PackedArray value type: %s" % token.array_type
            )
        self.parse_array(token)
        return token

    def token_for_numeric_array(self, token):
        """Read a numeric array: value type byte, then rank, dimensions and data.

        :raises WolframParserException: if the value type is not a key of
            `constants.ARRAY_TYPES_ELEM_SIZE`.
        """
        self.context.add_part()
        token.array_type = self.reader.read(1)
        if token.array_type not in constants.ARRAY_TYPES_ELEM_SIZE:
            raise WolframParserException(
                "Invalid NumericArray value type: %s" % token.array_type
            )
        self.parse_array(token)
        return token

    def token_for_binary_string(self, token):
        """Read a varint length followed by that many raw bytes (no decoding)."""
        self.context.add_part()
        token.length = parse_varint(self.reader)
        if token.length == 0:
            token.data = b""
        else:
            token.data = self.reader.read(token.length)
        return token

    def next_token(self):
        """Read the next type byte and return the fully populated token.

        :raises WolframParserException: if the type byte has no entry in
            `_mapping`.
        """
        next_byte = self.reader.read(1)

        try:
            handler = self._mapping[next_byte]
        except KeyError:
            raise WolframParserException("Unexpected token %s" % next_byte)

        return getattr(self, handler)(WXFToken(next_byte))


class WXFToken(object):
    """Represent a WXF element, often referred as WXF tokens.

    A token carries the type marker it was created with (`wxf_type`) and,
    depending on that type, some of: `data`, `length`, `array_type` and
    `dimensions`. Attributes that do not apply to a token are left to `None`.
    `element_count` is derived from `dimensions` and cached.
    """

    __slots__ = "wxf_type", "array_type", "length", "_dimensions", "_element_count", "data"

    def __init__(self, wxf_type):
        self.wxf_type = wxf_type
        # Initialised like every other slot, so that reading it on a non-array
        # token yields None instead of raising AttributeError.
        self.array_type = None
        self._dimensions = None
        self._element_count = None
        self.data = None
        self.length = None

    @property
    def element_count(self):
        """Product of the dimensions, or `None` when no dimensions are set.

        The value is computed on first access and then cached.
        """
        if self._element_count is None and self._dimensions is not None:
            self._update_element_count()
        return self._element_count

    @property
    def dimensions(self):
        """List of dimensions of an array token, or `None` if unset."""
        return self._dimensions

    @dimensions.setter
    def dimensions(self, value):
        if not isinstance(value, list):
            raise TypeError("Dimensions must be a list of positive integers.")
        self._dimensions = value
        # Only refresh a count that was already computed; otherwise stay lazy.
        if self._element_count is not None:
            self._update_element_count()

    def _update_element_count(self):
        """Recompute the cached element count from the current dimensions.

        :raises TypeError: if the product is not a strictly positive integer.
        """
        count = 1
        for dim in self._dimensions:
            count = count * dim
        if not isinstance(count, six.integer_types) or count <= 0:
            raise TypeError("Dimensions must be strictly positive integers.")
        self._element_count = count

    def __str__(self):
        if self.length is not None:
            return "WXFToken<%s, data=%s, len=%i>" % (self.wxf_type, self.data, self.length)
        else:
            return "WXFToken<%s, data=%s>" % (self.wxf_type, self.data)


def _read_varint_byte(reader):
    """Read one byte from `reader` and return it as an integer, or raise EOFError."""
    chunk = reader.read(1)
    # At EOF a reader returns an empty sequence; `ord` would then fail with a
    # TypeError, so the condition is detected explicitly.
    if not chunk:
        raise EOFError("EOF reached while parsing varint encoded integer.")
    return ord(chunk)


def parse_varint(reader):
    """Parse a readable binary buffer for a positive varint encoded integer.

    Up to eight bytes each contribute their seven low bits, least significant
    group first; the high bit of a byte signals that another byte follows. If
    the eighth byte still has its high bit set, a ninth and last byte is read.

    :param reader: object with a `read(n)` method returning binary data.
    :return: the decoded non-negative integer.
    :raises EOFError: if the input ends before the varint is complete.
    :raises WolframParserException: if the ninth byte carries no payload bits.
    """
    length = 0
    shift = 0
    continuation = True
    for _ in range(8):
        next_byte = _read_varint_byte(reader)
        length |= (next_byte & 0x7F) << shift
        shift += 7
        continuation = (next_byte & 0x80) != 0
        if not continuation:
            break

    if continuation:
        next_byte = _read_varint_byte(reader) & 0x7F
        # A ninth byte is only emitted when it has something to add.
        if next_byte == 0:
            raise WolframParserException("Invalid last varint byte.")
        length |= next_byte << shift

    return length