Commit 112ac3c7 authored by Dustyn Gibson
Browse files

Update enzyme to work with subliminal

parent fae8a12d
# -*- coding: utf-8 -*-
# enzyme - Video metadata parser
# Copyright 2011-2012 Antoine Bertin <[email protected]>
# Copyright 2003-2006 Thomas Schueppel <[email protected]>
# Copyright 2003-2006 Dirk Meyer <[email protected]>
#
# This file is part of enzyme.
#
# enzyme is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# enzyme is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with enzyme. If not, see <http://www.gnu.org/licenses/>.
import mimetypes
import os
import sys
from exceptions import *
# Package metadata exposed as module-level dunder attributes.
__title__ = 'enzyme'
__version__ = '0.4.2'
__author__ = 'Antoine Bertin'
# NOTE(review): the comment header of this file says GPL v3 (or later) while
# this value says Apache 2.0 -- confirm which license actually applies.
__license__ = 'Apache 2.0'
__copyright__ = 'Copyright 2013 Antoine Bertin'
import logging
from .exceptions import *
from .mkv import *
# Lookup table used by parse(). Every entry is a 3-tuple:
#   (parser module name, list of mimetypes, list of file extensions)
# Extensions are written without the leading dot.
PARSERS = [
    (
        'asf',
        ['video/asf'],
        ['asf', 'wmv', 'wma'],
    ),
    (
        'flv',
        ['video/flv'],
        ['flv'],
    ),
    (
        'mkv',
        ['video/x-matroska', 'application/mkv'],
        ['mkv', 'mka', 'webm'],
    ),
    (
        'mp4',
        ['video/quicktime', 'video/mp4'],
        ['mov', 'qt', 'mp4', 'mp4a', 'm4v', '3gp', '3gp2', '3g2', 'mk2'],
    ),
    (
        'mpeg',
        ['video/mpeg'],
        ['mpeg', 'mpg', 'mp4', 'ts'],
    ),
    (
        'ogm',
        ['application/ogg'],
        ['ogm', 'ogg', 'ogv'],
    ),
    (
        'real',
        ['video/real'],
        ['rm', 'ra', 'ram'],
    ),
    (
        'riff',
        ['video/avi'],
        ['wav', 'avi'],
    ),
]
def parse(path):
    """Parse metadata of the given video

    The parser is chosen from ``PARSERS``: a match on the guessed mimetype
    takes precedence over a match on the file extension. When several
    entries match, the last matching entry of the table is used.

    :param string path: path to the video file to parse
    :return: a parser corresponding to the video's mimetype or extension
    :rtype: :class:`~enzyme.core.AVContainer`
    :raises ValueError: if `path` is not an existing regular file
    :raises NoParserError: if neither the mimetype nor the extension is
        listed in ``PARSERS``

    """
    if not os.path.isfile(path):
        raise ValueError('Invalid path')
    # PARSERS lists lower-case extensions only, so normalise the case here;
    # otherwise 'MOVIE.MKV' is missed whenever the mimetype is unknown.
    extension = os.path.splitext(path)[1][1:].lower()
    mimetype = mimetypes.guess_type(path)[0]
    parser_ext = None
    parser_mime = None
    for (parser_name, parser_mimetypes, parser_extensions) in PARSERS:
        if mimetype in parser_mimetypes:
            parser_mime = parser_name
        if extension in parser_extensions:
            parser_ext = parser_name
    parser = parser_mime or parser_ext
    if not parser:
        raise NoParserError()
    # Explicit relative import of the sibling parser module. The former
    # level=-1 (implicit relative import) is rejected by Python 3, while
    # level=1 is accepted by both Python 2 and Python 3.
    mod = __import__(parser, globals(), locals(), [], 1)
    with open(path, 'rb') as f:
        p = mod.Parser(f)
    return p
# Attach a no-op handler to this package's logger at import time, so log
# records emitted here always find at least one handler.
logging.getLogger(__name__).addHandler(logging.NullHandler())
# -*- coding: utf-8 -*-
# enzyme - Video metadata parser
# Copyright 2011-2012 Antoine Bertin <[email protected]>
# Copyright 2003-2006 Thomas Schueppel <[email protected]>
# Copyright 2003-2006 Dirk Meyer <[email protected]>
#
# This file is part of enzyme.
#
# enzyme is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# enzyme is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with enzyme. If not, see <http://www.gnu.org/licenses/>.
from exceptions import ParseError
import core
import logging
import string
import struct
# Only the Parser factory is exported by ``from <module> import *``.
__all__ = ['Parser']
# get logging object (named after this module)
log = logging.getLogger(__name__)
def _guid(input):
# Remove any '-'
s = string.join(string.split(input, '-'), '')
r = ''
if len(s) != 32:
return ''
for i in range(0, 16):
r += chr(int(s[2 * i:2 * i + 2], 16))
guid = struct.unpack('>IHHBB6s', r)
return guid
# Maps a symbolic ASF object / media type name to its GUID in the tuple
# form produced by _guid(), so parsed GUIDs can be compared with ``==``.
GUIDS = dict(
    (name, _guid(text))
    for name, text in (
        ('ASF_Header_Object', '75B22630-668E-11CF-A6D9-00AA0062CE6C'),
        ('ASF_Data_Object', '75B22636-668E-11CF-A6D9-00AA0062CE6C'),
        ('ASF_Simple_Index_Object', '33000890-E5B1-11CF-89F4-00A0C90349CB'),
        ('ASF_Index_Object', 'D6E229D3-35DA-11D1-9034-00A0C90349BE'),
        ('ASF_Media_Object_Index_Object', 'FEB103F8-12AD-4C64-840F-2A1D2F7AD48C'),
        ('ASF_Timecode_Index_Object', '3CB73FD0-0C4A-4803-953D-EDF7B6228F0C'),
        ('ASF_File_Properties_Object', '8CABDCA1-A947-11CF-8EE4-00C00C205365'),
        ('ASF_Stream_Properties_Object', 'B7DC0791-A9B7-11CF-8EE6-00C00C205365'),
        ('ASF_Header_Extension_Object', '5FBF03B5-A92E-11CF-8EE3-00C00C205365'),
        ('ASF_Codec_List_Object', '86D15240-311D-11D0-A3A4-00A0C90348F6'),
        ('ASF_Script_Command_Object', '1EFB1A30-0B62-11D0-A39B-00A0C90348F6'),
        ('ASF_Marker_Object', 'F487CD01-A951-11CF-8EE6-00C00C205365'),
        ('ASF_Bitrate_Mutual_Exclusion_Object', 'D6E229DC-35DA-11D1-9034-00A0C90349BE'),
        ('ASF_Error_Correction_Object', '75B22635-668E-11CF-A6D9-00AA0062CE6C'),
        ('ASF_Content_Description_Object', '75B22633-668E-11CF-A6D9-00AA0062CE6C'),
        ('ASF_Extended_Content_Description_Object', 'D2D0A440-E307-11D2-97F0-00A0C95EA850'),
        ('ASF_Content_Branding_Object', '2211B3FA-BD23-11D2-B4B7-00A0C955FC6E'),
        ('ASF_Stream_Bitrate_Properties_Object', '7BF875CE-468D-11D1-8D82-006097C9A2B2'),
        ('ASF_Content_Encryption_Object', '2211B3FB-BD23-11D2-B4B7-00A0C955FC6E'),
        ('ASF_Extended_Content_Encryption_Object', '298AE614-2622-4C17-B935-DAE07EE9289C'),
        ('ASF_Alt_Extended_Content_Encryption_Obj', 'FF889EF1-ADEE-40DA-9E71-98704BB928CE'),
        ('ASF_Digital_Signature_Object', '2211B3FC-BD23-11D2-B4B7-00A0C955FC6E'),
        ('ASF_Padding_Object', '1806D474-CADF-4509-A4BA-9AABCB96AAE8'),
        ('ASF_Extended_Stream_Properties_Object', '14E6A5CB-C672-4332-8399-A96952065B5A'),
        ('ASF_Advanced_Mutual_Exclusion_Object', 'A08649CF-4775-4670-8A16-6E35357566CD'),
        ('ASF_Group_Mutual_Exclusion_Object', 'D1465A40-5A79-4338-B71B-E36B8FD6C249'),
        ('ASF_Stream_Prioritization_Object', 'D4FED15B-88D3-454F-81F0-ED5C45999E24'),
        ('ASF_Bandwidth_Sharing_Object', 'A69609E6-517B-11D2-B6AF-00C04FD908E9'),
        ('ASF_Language_List_Object', '7C4346A9-EFE0-4BFC-B229-393EDE415C85'),
        ('ASF_Metadata_Object', 'C5F8CBEA-5BAF-4877-8467-AA8C44FA4CCA'),
        ('ASF_Metadata_Library_Object', '44231C94-9498-49D1-A141-1D134E457054'),
        ('ASF_Index_Parameters_Object', 'D6E229DF-35DA-11D1-9034-00A0C90349BE'),
        ('ASF_Media_Object_Index_Parameters_Obj', '6B203BAD-3F11-4E84-ACA8-D7613DE2CFA7'),
        ('ASF_Timecode_Index_Parameters_Object', 'F55E496D-9797-4B5D-8C8B-604DFE9BFB24'),
        ('ASF_Audio_Media', 'F8699E40-5B4D-11CF-A8FD-00805F5C442B'),
        ('ASF_Video_Media', 'BC19EFC0-5B4D-11CF-A8FD-00805F5C442B'),
        ('ASF_Command_Media', '59DACFC0-59E6-11D0-A3AC-00A0C90348F6'),
        ('ASF_JFIF_Media', 'B61BE100-5B4E-11CF-A8FD-00805F5C442B'),
        ('ASF_Degradable_JPEG_Media', '35907DE0-E415-11CF-A917-00805F5C442B'),
        ('ASF_File_Transfer_Media', '91BD222C-F21C-497A-8B6D-5AA86BFC0185'),
        ('ASF_Binary_Media', '3AFB65E2-47EF-40F2-AC2C-70A90D71D343'),
        ('ASF_Web_Stream_Media_Subtype', '776257D4-C627-41CB-8F81-7AC7FF1C40CC'),
        ('ASF_Web_Stream_Format', 'DA1E6B13-8359-4050-B398-388E965BF00C'),
        ('ASF_No_Error_Correction', '20FB5700-5B55-11CF-A8FD-00805F5C442B'),
        ('ASF_Audio_Spread', 'BFC3CD50-618F-11CF-8BB2-00AA00B4E220'),
    )
)
class Asf(core.AVContainer):
    """
    ASF video parser. The ASF format is also used for Microsoft Windows
    Media files like wmv.

    The constructor reads the 30 byte top-level header object from the given
    file object, validates it, and then walks the announced number of header
    sub-objects. Each recognised sub-object fills in container attributes or
    appends a video / audio stream.

    Raises ParseError when the top-level header is too short, does not carry
    the ASF header GUID, has unexpected reserved bytes, or declares a size
    smaller than the 30 bytes already read.
    """
    def __init__(self, file):
        core.AVContainer.__init__(self)
        self.mime = 'video/x-ms-asf'
        self.type = 'asf format'
        # Scratch state, only needed while the header is being walked; both
        # attributes are deleted again at the end of the constructor.
        self._languages = []
        self._extinfo = {}
        h = file.read(30)
        if len(h) < 30:
            raise ParseError()
        (guidstr, objsize, objnum, reserved1,
         reserved2) = struct.unpack('<16sQIBB', h)
        guid = self._parseguid(guidstr)
        if guid != GUIDS['ASF_Header_Object']:
            raise ParseError()
        if reserved1 != 0x01 or reserved2 != 0x02:
            raise ParseError()
        if objsize < 30:
            # The size includes the 30 bytes read above. A smaller value
            # would hand a negative length to file.read(), which reads up
            # to the end of the file instead of failing.
            raise ParseError()
        log.debug(u'Header size: %d / %d objects' % (objsize, objnum))
        header = file.read(objsize - 30)
        # objnum is an untrusted 32-bit value: count down instead of
        # building range(0, objnum), which is a real list on Python 2.
        remaining = objnum
        while remaining > 0:
            remaining -= 1
            h = self._getnextheader(header)
            if h[1] < 24:
                # Every object starts with a 24 byte GUID + size prefix; a
                # smaller size cannot advance past the current object.
                log.debug(u'Invalid object size %d, stopping' % h[1])
                break
            header = header[h[1]:]
        del self._languages
        del self._extinfo

    def _findstream(self, id):
        """
        Return the video or audio stream whose ``id`` equals the given id,
        or None when no stream matches.
        """
        for stream in self.video + self.audio:
            if stream.id == id:
                return stream

    def _apply_extinfo(self, streamid):
        """
        Copy the collected extended information onto the matching stream.

        Does nothing unless both the stream and an ``_extinfo`` entry for
        ``streamid`` already exist. It is called after every object that
        adds a stream or extended information, so whichever arrives last
        triggers the copy.
        """
        stream = self._findstream(streamid)
        if not stream or streamid not in self._extinfo:
            return
        # NOTE(review): entries are seeded with None for bitrate and fps, so
        # an entry holding only metadata replaces a bitrate that was already
        # set from the Stream Properties object with None -- confirm whether
        # that is intended.
        stream.bitrate, stream.fps, langid, metadata = self._extinfo[streamid]
        if langid is not None and langid >= 0 and langid < len(self._languages):
            stream.language = self._languages[langid]
        if metadata:
            stream._appendtable('ASFMETADATA', metadata)

    def _parseguid(self, string):
        """
        Unpack the first 16 bytes of ``string`` little-endian into the same
        6-tuple layout as the values stored in GUIDS.
        """
        return struct.unpack('<IHHBB6s', string[:16])

    def _parsekv(self, s):
        """
        Parse one descriptor as read by the Extended Content Description
        branch of _getnextheader.

        Layout read here: 16-bit name length, name, 16-bit value type,
        16-bit value length, value.

        Returns ``(bytes consumed, raw name, decoded value)``. The value is
        None for an unknown type; string and byte array values are returned
        as the raw bytes.
        """
        pos = 0
        (descriptorlen,) = struct.unpack('<H', s[pos:pos + 2])
        pos += 2
        descriptorname = s[pos:pos + descriptorlen]
        pos += descriptorlen
        descriptortype, valuelen = struct.unpack('<HH', s[pos:pos + 4])
        pos += 4
        descriptorvalue = s[pos:pos + valuelen]
        pos += valuelen
        value = None
        if descriptortype == 0x0000:
            # Unicode string
            value = descriptorvalue
        elif descriptortype == 0x0001:
            # Byte Array
            value = descriptorvalue
        elif descriptortype == 0x0002:
            # Bool (?)
            value = struct.unpack('<I', descriptorvalue)[0] != 0
        elif descriptortype == 0x0003:
            # DWORD
            value = struct.unpack('<I', descriptorvalue)[0]
        elif descriptortype == 0x0004:
            # QWORD
            value = struct.unpack('<Q', descriptorvalue)[0]
        elif descriptortype == 0x0005:
            # WORD
            value = struct.unpack('<H', descriptorvalue)[0]
        else:
            log.debug(u'Unknown Descriptor Type %d' % descriptortype)
        return (pos, descriptorname, value)

    def _parsekv2(self, s):
        """
        Parse one descriptor as read by the Metadata branch of
        _getnextheader.

        Layout read here: 2 skipped bytes, 16-bit stream number, 16-bit name
        length, 16-bit value type, 32-bit value length, name, value. Unlike
        _parsekv, a bool value is read as a 16-bit integer.

        Returns ``(bytes consumed, raw name, decoded value, stream number)``.
        """
        pos = 0
        strno, descriptorlen, descriptortype, valuelen = struct.unpack('<2xHHHI', s[pos:pos + 12])
        pos += 12
        descriptorname = s[pos:pos + descriptorlen]
        pos += descriptorlen
        descriptorvalue = s[pos:pos + valuelen]
        pos += valuelen
        value = None
        if descriptortype == 0x0000:
            # Unicode string
            value = descriptorvalue
        elif descriptortype == 0x0001:
            # Byte Array
            value = descriptorvalue
        elif descriptortype == 0x0002:
            # Bool
            value = struct.unpack('<H', descriptorvalue)[0] != 0
        elif descriptortype == 0x0003:
            # DWORD
            value = struct.unpack('<I', descriptorvalue)[0]
        elif descriptortype == 0x0004:
            # QWORD
            value = struct.unpack('<Q', descriptorvalue)[0]
        elif descriptortype == 0x0005:
            # WORD
            value = struct.unpack('<H', descriptorvalue)[0]
        else:
            log.debug(u'Unknown Descriptor Type %d' % descriptortype)
        return (pos, descriptorname, value, strno)

    def _getnextheader(self, s):
        """
        Parse the object found at the start of ``s`` and store whatever it
        describes on this container.

        Every object starts with a 16 byte GUID followed by a 64-bit size;
        the GUID selects the branch below. Objects that are not handled are
        only logged.

        Returns ``(raw guid bytes, object size)``; callers use the size to
        skip to the next object. struct.error propagates when ``s`` is too
        short for the fields being read.
        """
        r = struct.unpack('<16sQ', s[:24])
        (guidstr, objsize) = r
        guid = self._parseguid(guidstr)
        if guid == GUIDS['ASF_File_Properties_Object']:
            log.debug(u'File Properties Object')
            val = struct.unpack('<16s6Q4I', s[24:24 + 80])
            (fileid, size, date, packetcount, duration,
             senddur, preroll, flags, minpack, maxpack, maxbr) = val
            # FIXME: parse date to timestamp
            # presumably duration is in 100 ns units, making length seconds
            self.length = duration / 10000000.0
        elif guid == GUIDS['ASF_Stream_Properties_Object']:
            log.debug(u'Stream Properties Object [%d]' % objsize)
            streamtype = self._parseguid(s[24:40])
            errortype = self._parseguid(s[40:56])
            offset, typelen, errorlen, flags = struct.unpack('<QIIH', s[56:74])
            # The low 7 bits of the flags are used as the stream number, the
            # top bit as the encryption marker.
            strno = flags & 0x7f
            encrypted = flags >> 15
            if encrypted:
                self._set('encrypted', True)
            if streamtype == GUIDS['ASF_Video_Media']:
                vi = core.VideoStream()
                vi.width, vi.height, depth, codec, = struct.unpack('<4xII2xH4s', s[89:89 + 20])
                vi.codec = codec
                vi.id = strno
                self.video.append(vi)
            elif streamtype == GUIDS['ASF_Audio_Media']:
                ai = core.AudioStream()
                twocc, ai.channels, ai.samplerate, bitrate, block, \
                    ai.samplebits, = struct.unpack('<HHIIHH', s[78:78 + 16])
                ai.bitrate = 8 * bitrate
                ai.codec = twocc
                ai.id = strno
                self.audio.append(ai)
            self._apply_extinfo(strno)
        elif guid == GUIDS['ASF_Extended_Stream_Properties_Object']:
            streamid, langid, frametime = struct.unpack('<HHQ', s[72:84])
            (bitrate,) = struct.unpack('<I', s[40:40 + 4])
            if streamid not in self._extinfo:
                self._extinfo[streamid] = [None, None, None, {}]
            if frametime == 0:
                # Probably VFR, report as 1000fps (which is what MPlayer does)
                frametime = 10000.0
            self._extinfo[streamid][:3] = [bitrate, 10000000.0 / frametime, langid]
            self._apply_extinfo(streamid)
        elif guid == GUIDS['ASF_Header_Extension_Object']:
            log.debug(u'ASF_Header_Extension_Object %d' % objsize)
            size = struct.unpack('<I', s[42:46])[0]
            data = s[46:46 + size]
            while len(data):
                log.debug(u'Sub:')
                h = self._getnextheader(data)
                if h[1] < 24:
                    # A size below the 24 byte object prefix (0 in
                    # particular) would leave ``data`` unchanged or
                    # misaligned, so the loop could never finish.
                    log.debug(u'Invalid sub-object size %d, stopping' % h[1])
                    break
                data = data[h[1]:]
        elif guid == GUIDS['ASF_Codec_List_Object']:
            log.debug(u'List Object')
        elif guid == GUIDS['ASF_Error_Correction_Object']:
            log.debug(u'Error Correction')
        elif guid == GUIDS['ASF_Content_Description_Object']:
            log.debug(u'Content Description Object')
            # Five 16-bit lengths, followed by the five strings themselves.
            val = struct.unpack('<5H', s[24:24 + 10])
            pos = 34
            strings = []
            for i in val:
                # Byte literals keep this working on a Python 3 bytes buffer
                # and are plain str on Python 2.
                ss = s[pos:pos + i].replace(b'\0', b'').strip()
                strings.append(ss)
                pos += i
            # Set empty strings to None
            strings = [x or None for x in strings]
            self.title, self.artist, self.copyright, self.caption, rating = strings
        elif guid == GUIDS['ASF_Extended_Content_Description_Object']:
            (count,) = struct.unpack('<H', s[24:26])
            pos = 26
            descriptor = {}
            for i in range(0, count):
                # Read additional content descriptors
                d = self._parsekv(s[pos:])
                pos += d[0]
                descriptor[d[1]] = d[2]
            self._appendtable('ASFDESCRIPTOR', descriptor)
        elif guid == GUIDS['ASF_Metadata_Object']:
            (count,) = struct.unpack('<H', s[24:26])
            pos = 26
            streams = {}
            for i in range(0, count):
                # Read additional content descriptors
                size, key, value, strno = self._parsekv2(s[pos:])
                if strno not in streams:
                    streams[strno] = {}
                streams[strno][key] = value
                pos += size
            for strno, metadata in streams.items():
                if strno not in self._extinfo:
                    self._extinfo[strno] = [None, None, None, {}]
                self._extinfo[strno][3].update(metadata)
                self._apply_extinfo(strno)
        elif guid == GUIDS['ASF_Language_List_Object']:
            count = struct.unpack('<H', s[24:26])[0]
            pos = 26
            for i in range(0, count):
                idlen = struct.unpack('<B', s[pos:pos + 1])[0]
                idstring = s[pos + 1:pos + 1 + idlen]
                # .decode() gives the same text as unicode(idstring, 'utf-16')
                # and also exists on Python 3.
                idstring = idstring.decode('utf-16').replace('\0', '')
                log.debug(u'Language: %d/%d: %r' % (i + 1, count, idstring))
                self._languages.append(idstring)
                pos += 1 + idlen
        elif guid == GUIDS['ASF_Stream_Bitrate_Properties_Object']:
            # This record contains stream bitrate with payload overhead. For
            # audio streams, we should have the average bitrate from
            # ASF_Stream_Properties_Object. For video streams, we get it from
            # ASF_Extended_Stream_Properties_Object. So this record is not
            # used.
            pass
        elif guid == GUIDS['ASF_Content_Encryption_Object'] or \
                guid == GUIDS['ASF_Extended_Content_Encryption_Object']:
            self._set('encrypted', True)
        else:
            # Just print the type:
            for h in GUIDS.keys():
                if GUIDS[h] == guid:
                    log.debug(u'Unparsed %r [%d]' % (h, objsize))
                    break
            else:
                # GUID not present in the table at all
                u = "%.8X-%.4X-%.4X-%.2X%.2X-%s" % guid
                log.debug(u'unknown: len=%d [%d]' % (len(u), objsize))
        return r
class AsfAudio(core.AudioStream):
    """
    Audio-only flavour of an ASF file (wma files).

    Adds nothing to core.AudioStream beyond presetting the ``type`` and
    ``mime`` attributes.
    """

    def __init__(self):
        core.AudioStream.__init__(self)
        self.type = 'asf format'
        self.mime = 'audio/x-ms-asf'
def Parser(file):
    """
    Wrapper around audio and av content.

    Parses ``file`` with Asf. The Asf container itself is returned unless
    the file has audio streams and no video stream. In that audio-only case
    an AsfAudio object is returned instead, filled with every attribute
    that is listed in both objects' ``_keys`` and still falsy on the
    AsfAudio object.
    """
    container = Asf(file)
    audio_only = len(container.audio) and not len(container.video)
    if not audio_only:
        # AV container
        return container

    # No video but audio streams. Handle as audio core
    result = AsfAudio()
    for key in result._keys:
        if key not in container._keys:
            continue
        if getattr(result, key, None):
            # already has a value, keep it
            continue
        setattr(result, key, getattr(container, key))
    return result
# -*- coding: utf-8 -*-
import sys
# Interpreter major-version flags shared by the package.
_ver = sys.version_info
is_py2, is_py3 = (_ver[0] == 2), (_ver[0] == 3)

# ``bytes`` is defined as a module attribute for both major versions:
# on Python 3 it is the builtin, on Python 2 a helper that converts the
# first item of a sequence with chr().
if is_py3:
    bytes = bytes  # @ReservedAssignment
elif is_py2:
    bytes = lambda x: chr(x[0])  # @ReservedAssignment
# -*- coding: utf-8 -*-
# enzyme - Video metadata parser
# Copyright 2011-2012 Antoine Bertin <[email protected]>
# Copyright 2003-2006 Thomas Schueppel <[email protected]>
# Copyright 2003-2006 Dirk Meyer <[email protected]>
#
# This file is part of enzyme.
#
# enzyme is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# enzyme is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with enzyme. If not, see <http://www.gnu.org/licenses/>.
import re
import logging
import fourcc
import language
from strutils import str_to_unicode, unicode_to_str
# Keys whose value is summarised as "<unprintable data, size=N>" when a
# Media object is rendered as text.
UNPRINTABLE_KEYS = ['thumbnail', 'url', 'codec_private']

# Attribute names declared by the different kinds of metadata objects.
MEDIACORE = [
    'title', 'caption', 'comment', 'size', 'type', 'subtype', 'timestamp',
    'keywords', 'country', 'language', 'langcode', 'url', 'artist',
    'mime', 'datetime', 'tags', 'hash',
]
AUDIOCORE = [
    'channels', 'samplerate', 'length', 'encoder', 'codec', 'format',
    'samplebits', 'bitrate', 'fourcc', 'trackno', 'id', 'userdate',
    'enabled', 'default', 'codec_private',
]
MUSICCORE = [
    'trackof', 'album', 'genre', 'discs', 'thumbnail',
]
VIDEOCORE = [
    'length', 'encoder', 'bitrate', 'samplerate', 'codec', 'format',
    'samplebits', 'width', 'height', 'fps', 'aspect', 'trackno',
    'fourcc', 'id', 'enabled', 'default', 'codec_private',
]
AVCORE = [
    'length', 'encoder', 'trackno', 'trackof', 'copyright', 'product',
    'genre', 'writer', 'producer', 'studio', 'rating', 'actors', 'thumbnail',
    'delay', 'image', 'video', 'audio', 'subtitles', 'chapters', 'software',
    'summary', 'synopsis', 'season', 'episode', 'series',
]

# Module-level logger, named after this module.
log = logging.getLogger(__name__)
class Media(object):
"""
Media is the base class to all Media Metadata Containers. It defines
the basic structures that handle metadata. Media and its derivates
contain a common set of metadata attributes that is listed in keys.
Specific derivates contain additional keys to the dublin core set that is
defined in Media.
"""
media = None
_keys = MEDIACORE
table_mapping = {}
def __init__(self, hash=None):
if hash is not None:
# create Media based on dict
for key, value in hash.items():
if isinstance(value, list) and value and isinstance(value[0], dict):
value = [Media(x) for x in value]
self._set(key, value)
return
self._keys = self._keys[:]
self.tables = {}
# Tags, unlike tables, are more well-defined dicts whose values are
# either Tag objects, other dicts (for nested tags), or lists of either
# (for multiple instances of the tag, e.g. actor). Where possible,
# parsers should transform tag names to conform to the Official
# Matroska tags defined at http://www.matroska.org/technical/specs/tagging/index.html
# All tag names will be lower-cased.
self.tags = Tags()
for key in set(self._keys) - set(['media', 'tags']):
setattr(self, key, None)
#
# unicode and string convertion for debugging
#
#TODO: Fix that mess
def __unicode__(self):
result = u''
# print normal attributes
lists = []
for key in self._keys:
value = getattr(self, key, None)
if value == None or key == 'url':
continue
if isinstance(value, list):
if not value:
continue
elif isinstance(value[0], basestring):
# Just a list of strings (keywords?), so don't treat it specially.
value = u', '.join(value)
else:
lists.append((key, value))
continue
elif isinstance(value, dict):
# Tables or tags treated separately.
continue
if key in UNPRINTABLE_KEYS:
value = '<unprintable data, size=%d>' % len(value)
result += u'| %10s: %s\n' % (unicode(key), unicode(value))
# print tags (recursively, to support nested tags).
def print_tags(tags, suffix, show_label):
result = ''
for n, (name, tag) in enumerate(tags.items()):
result += u'| %12s%s%s = ' % (u'tags: ' if n == 0 and show_label else '', suffix, name)
if isinstance(tag, list):
# TODO: doesn't support lists/dicts within lists.
result += u'%s\n' % ', '.join(subtag.value for subtag in tag)
else:
result += u'%s\n' % (tag.value or '')
if isinstance(tag, dict):
result += print_tags(tag, ' ', False)
return result
result += print_tags(self.tags, '', True)
# print lists
for key, l in lists:
for n, item in enumerate(l):
label = '+-- ' + key.rstrip('s').capitalize()
if key not in ['tracks', 'subtitles', 'chapters']:
label += ' Track'
result += u'%s #%d\n' % (label, n + 1)