From 6147820602d99afa5862b7fced44fb47ae74bd17 Mon Sep 17 00:00:00 2001
From: Joseph Spiros
Date: Sat, 9 Apr 2011 07:28:52 -0400
Subject: [PATCH] Implemented basic EBMLFile class for parsing files. Needs
 improvement, as it parses the entire file into memory at once.

---
 ebml/__init__.py    |   2 +
 ebml/core.py        |  88 +++++++++++++++++++++++++++-
 ebml/files.py       | 139 ++++++++++++++++++++++++++++++++++++++++++++
 ebml/schema/base.py |  10 ---
 4 files changed, 227 insertions(+), 12 deletions(-)
 create mode 100644 ebml/files.py

diff --git a/ebml/__init__.py b/ebml/__init__.py
index e69de29..cd5b84b 100644
--- a/ebml/__init__.py
+++ b/ebml/__init__.py
@@ -0,0 +1,2 @@
+from .files import *
+from .exceptions import *
\ No newline at end of file
diff --git a/ebml/core.py b/ebml/core.py
index 2e0702c..6e8d338 100644
--- a/ebml/core.py
+++ b/ebml/core.py
@@ -1,4 +1,5 @@
-import warnings
+import struct
+import datetime
 
 from .exceptions import *
 
@@ -124,4 +125,87 @@ def read_element_id(stream, max_width=EBMLMaxIDLength):
     elif min_bytes == vint_len:
         raise ReservedElementIDError('All value bits set to 0')
 
-    return value, vint_len
\ No newline at end of file
+    return value, vint_len
+
+
+def read_int(stream, size):
+    """Read a big-endian two's-complement signed integer of `size` bytes.
+
+    A `size` of 0 yields 0.
+    """
+    value = 0
+    if size > 0:
+        byte = ord(stream.read(1))
+        if (byte & 0b10000000) == 0b10000000:
+            # Sign bit set: seed with all-ones above the first byte so the
+            # result comes out sign-extended.
+            value = -1 << 8
+        value |= byte
+        for i in range(1, size):
+            byte = ord(stream.read(1))
+            # Each further byte is a whole octet, so shift by 8 (not 1).
+            value = (value << 8) | byte
+    return value
+
+
+def read_uint(stream, size):
+    """Read a big-endian unsigned integer of `size` bytes (0 for size 0)."""
+    value = 0
+    for i in range(0, size):
+        byte = ord(stream.read(1))
+        value = (value << 8) | byte
+    return value
+
+
+def read_float(stream, size):
+    """Read a big-endian 4-byte float or 8-byte double; size 0 yields 0.0.
+
+    Raises ValueError for any other size.
+    """
+    if size not in (0, 4, 8):
+        # http://www.matroska.org/technical/specs/rfc/index.html allows for 10-byte floats.
+        # http://www.matroska.org/technical/specs/index.html specifies 4-byte and 8-byte only.
+        # I'm following the latter due to it being more up-to-date than the former, and because it's easier to implement.
+        raise ValueError('floats must be 0, 4, or 8 bytes long')
+    value = 0.0
+    if size in (4, 8):
+        data = stream.read(size)
+        value = struct.unpack({
+            4: '>f',
+            8: '>d'
+        }[size], data)[0]
+    return value
+
+
+def read_string(stream, size):
+    """Read `size` raw bytes as a string ('' for size 0)."""
+    value = ''
+    if size > 0:
+        value = stream.read(size)
+    return value
+
+
+def read_unicode(stream, size):
+    """Read `size` bytes and decode them as UTF-8 (u'' for size 0)."""
+    value = u''
+    if size > 0:
+        data = stream.read(size)
+        value = unicode(data, 'utf_8')
+    return value
+
+
+def read_date(stream):
+    """Read an 8-byte signed nanosecond offset from 2001-01-01 as a datetime.
+
+    Sub-microsecond precision is dropped.
+    """
+    size = 8 # date is always an 8-byte signed integer
+    data = stream.read(size)
+    nanoseconds = struct.unpack('>q', data)[0]
+    delta = datetime.timedelta(microseconds=(nanoseconds // 1000))
+    return datetime.datetime(2001, 1, 1) + delta
+
+
+def read_binary(stream, size):
+    """Read and return `size` raw bytes."""
+    return stream.read(size)
\ No newline at end of file
diff --git a/ebml/files.py b/ebml/files.py
new file mode 100644
index 0000000..c8cc983
--- /dev/null
+++ b/ebml/files.py
@@ -0,0 +1,139 @@
+from .schema import *
+from .core import read_element_id, read_element_size, read_int, read_uint, read_float, read_string, read_unicode, read_date
+
+
+__all__ = ('EBMLFile', 'MatroskaFile')
+
+
+# Readers for leaf data types. BINARY has no entry, so binary payloads are
+# skipped by seeking past them instead of being read into memory.
+TYPE_READERS = {
+    INT: read_int,
+    UINT: read_uint,
+    FLOAT: read_float,
+    STRING: read_string,
+    UNICODE: read_unicode,
+    DATE: lambda stream, size: read_date(stream)
+}
+
+
+class EBMLFileElement(object):
+    """One element parsed from a stream, together with its children.
+
+    Construction does the parsing: it reads the element ID and size, looks
+    the ID up in `schema`, then decodes or skips the payload. `element` is
+    the schema's entry for the ID, or None when the lookup fails or the
+    entry is not allowed under `parent`. `contents` is the decoded value, a
+    list of child EBMLFileElements for containers, or None when skipped.
+    """
+
+    def __init__(self, stream, schema, parent=None):
+        self.stream = stream
+        self.schema = schema
+        self.parent = parent
+        self.class_id, self.class_id_len = read_element_id(self.stream)
+        try:
+            self.element = schema.element_with_class_id(self.class_id)
+        except Exception:
+            # Lookup failed (presumably an ID the schema does not define).
+            self.element = None
+        else:
+            if self.parent is None:
+                if not self.element in self.schema.root_elements():
+                    self.element = None
+            else:
+                if not self.element in self.schema.child_elements_of_element(self.parent):
+                    self.element = None
+        self.size, self.size_len = read_element_size(self.stream)
+        self.offset = self.stream.tell()
+        self._read_contents()
+
+    def _read_contents(self):
+        """Decode the payload into self.contents, or seek past it."""
+        contents = None
+        if self.element is not None:
+            if self.element.data_type in TYPE_READERS:
+                contents = TYPE_READERS[self.element.data_type](self.stream, self.size)
+            elif self.element.data_type == CONTAINER:
+                read_len = 0
+                contents = []
+                while self.size > read_len:
+                    sub_el = EBMLFileElement(self.stream, self.schema, self.element)
+                    read_len += (sub_el.class_id_len + sub_el.size_len + sub_el.size)
+                    contents.append(sub_el)
+            else:
+                self.stream.seek(self.offset + self.size, 0)
+        else:
+            self.stream.seek(self.offset + self.size, 0)
+        self.contents = contents
+
+    def pprint(self, indent=0):
+        """Print this element and its children as tab-indented XML-like text."""
+        sargs = {
+            'class_name': getattr(self.element, 'class_name', None) or 'Unknown',
+            'class_id': self.class_id,
+            'size': self.size,
+            'value': self.contents
+        }
+        is_container = self.element is not None and self.element.data_type == CONTAINER
+        def pprint_(foo):
+            print ('\t' * indent) + foo
+        if self.contents is None or (is_container and not self.contents):
+            pprint_('<%(class_name)s id=\'%(class_id)x\' size=\'%(size)i\' />' % sargs)
+        elif is_container:
+            pprint_('<%(class_name)s id=\'%(class_id)x\' size=\'%(size)i\'>' % sargs)
+            for sub_el in self.contents:
+                sub_el.pprint(indent + 1)
+            pprint_('</%(class_name)s>' % sargs)
+        else:
+            pprint_('<%(class_name)s id=\'%(class_id)x\' size=\'%(size)i\'>%(value)s</%(class_name)s>' % sargs)
+
+    def __repr__(self):
+        return '<%(class_name)s id=%(class_id)x size=%(size)i>' % {
+            'class_name': getattr(self.element, 'class_name', None) or '?',
+            'class_id': getattr(self.element, 'class_id', None) or self.class_id,
+            'size': self.size
+        }
+
+
+class EBMLFile(object):
+    """Parses every top-level element of an EBML stream into `contents`.
+
+    `name_or_stream` is either a path, which is opened in binary mode, or an
+    already-open stream that supports read/seek/tell. The whole file is
+    parsed eagerly by the constructor.
+    """
+    default_schema = EBML
+
+    def __init__(self, name_or_stream, schema=None):
+        if schema is None:
+            schema = self.default_schema
+        self.schema = schema
+
+        if isinstance(name_or_stream, basestring):
+            self.stream = open(name_or_stream, 'rb')
+        else:
+            self.stream = name_or_stream
+
+        self._read_contents()
+
+    def _read_contents(self):
+        """Read top-level elements until one fails to parse."""
+        self.contents = []
+        while True:
+            try:
+                self.contents.append(EBMLFileElement(self.stream, self.schema, None))
+            except Exception:
+                # No explicit EOF test: parsing is expected to raise once the
+                # stream is exhausted. Any other parse error also stops here.
+                break
+
+    def pprint(self):
+        """Pretty-print every top-level element."""
+        for el in self.contents:
+            el.pprint()
+
+
+class MatroskaFile(EBMLFile):
+    """EBMLFile that defaults to the Matroska schema."""
+    default_schema = Matroska
\ No newline at end of file
diff --git a/ebml/schema/base.py b/ebml/schema/base.py
index 2686732..8c4ecdf 100644
--- a/ebml/schema/base.py
+++ b/ebml/schema/base.py
@@ -4,16 +4,6 @@ __all__ = ('INT', 'UINT', 'FLOAT', 'STRING', 'UNICODE', 'DATE', 'BINARY', 'CONTA
 INT, UINT, FLOAT, STRING, UNICODE, DATE, BINARY, CONTAINER = range(0, 8)
 
 
-SCHEMA_TYPES = {
-    'int': INT,
-    'uint': UINT,
-    'float': FLOAT,
-    'string': STRING,
-    'date': DATE,
-    'binary': BINARY
-}
-
-
 class Element(object):
     class_id = None
     class_name = 'Unknown'
-- 
2.20.1