From 8c45165c1924448cdd130bbb39bb1131acb78e5d Mon Sep 17 00:00:00 2001 From: Andreas Zweili Date: Wed, 9 May 2018 08:20:38 +0200 Subject: [PATCH] initial push --- movie_spy/docopt.py | 581 +++++++++++++++++++++ movie_spy/exifread/__init__.py | 262 ++++++++++ movie_spy/exifread/classes.py | 560 ++++++++++++++++++++ movie_spy/exifread/exif_log.py | 76 +++ movie_spy/exifread/utils.py | 116 ++++ movie_spy/jpgSorter.py | 126 +++++ movie_spy/moviespy.py | 102 ++++ movie_spy/numberOfFilesPerFolderLimiter.py | 27 + movie_spy/recovery.py | 126 +++++ 9 files changed, 1976 insertions(+) create mode 100644 movie_spy/docopt.py create mode 100644 movie_spy/exifread/__init__.py create mode 100644 movie_spy/exifread/classes.py create mode 100644 movie_spy/exifread/exif_log.py create mode 100644 movie_spy/exifread/utils.py create mode 100644 movie_spy/jpgSorter.py create mode 100644 movie_spy/moviespy.py create mode 100644 movie_spy/numberOfFilesPerFolderLimiter.py create mode 100644 movie_spy/recovery.py diff --git a/movie_spy/docopt.py b/movie_spy/docopt.py new file mode 100644 index 0000000..7c6a52d --- /dev/null +++ b/movie_spy/docopt.py @@ -0,0 +1,581 @@ +"""Pythonic command-line interface parser that will make you smile. 
+ + * http://docopt.org + * Repository and issue-tracker: https://github.com/docopt/docopt + * Licensed under terms of MIT license (see LICENSE-MIT) + * Copyright (c) 2013 Vladimir Keleshev, vladimir@keleshev.com + +""" +import sys +import re + + +__all__ = ['docopt'] +__version__ = '0.6.2' + + +class DocoptLanguageError(Exception): + + """Error in construction of usage-message by developer.""" + + +class DocoptExit(SystemExit): + + """Exit in case user invoked program with incorrect arguments.""" + + usage = '' + + def __init__(self, message=''): + SystemExit.__init__(self, (message + '\n' + self.usage).strip()) + + +class Pattern(object): + + def __eq__(self, other): + return repr(self) == repr(other) + + def __hash__(self): + return hash(repr(self)) + + def fix(self): + self.fix_identities() + self.fix_repeating_arguments() + return self + + def fix_identities(self, uniq=None): + """Make pattern-tree tips point to same object if they are equal.""" + if not hasattr(self, 'children'): + return self + uniq = list(set(self.flat())) if uniq is None else uniq + for i, child in enumerate(self.children): + if not hasattr(child, 'children'): + assert child in uniq + self.children[i] = uniq[uniq.index(child)] + else: + child.fix_identities(uniq) + + def fix_repeating_arguments(self): + """Fix elements that should accumulate/increment values.""" + either = [list(child.children) for child in transform(self).children] + for case in either: + for e in [child for child in case if case.count(child) > 1]: + if type(e) is Argument or type(e) is Option and e.argcount: + if e.value is None: + e.value = [] + elif type(e.value) is not list: + e.value = e.value.split() + if type(e) is Command or type(e) is Option and e.argcount == 0: + e.value = 0 + return self + + +def transform(pattern): + """Expand pattern into an (almost) equivalent one, but with single Either. + + Example: ((-a | -b) (-c | -d)) => (-a -c | -a -d | -b -c | -b -d) + Quirks: [-a] => (-a), (-a...) 
=> (-a -a) + + """ + result = [] + groups = [[pattern]] + while groups: + children = groups.pop(0) + parents = [Required, Optional, OptionsShortcut, Either, OneOrMore] + if any(t in map(type, children) for t in parents): + child = [c for c in children if type(c) in parents][0] + children.remove(child) + if type(child) is Either: + for c in child.children: + groups.append([c] + children) + elif type(child) is OneOrMore: + groups.append(child.children * 2 + children) + else: + groups.append(child.children + children) + else: + result.append(children) + return Either(*[Required(*e) for e in result]) + + +class LeafPattern(Pattern): + + """Leaf/terminal node of a pattern tree.""" + + def __init__(self, name, value=None): + self.name, self.value = name, value + + def __repr__(self): + return '%s(%r, %r)' % (self.__class__.__name__, self.name, self.value) + + def flat(self, *types): + return [self] if not types or type(self) in types else [] + + def match(self, left, collected=None): + collected = [] if collected is None else collected + pos, match = self.single_match(left) + if match is None: + return False, left, collected + left_ = left[:pos] + left[pos + 1:] + same_name = [a for a in collected if a.name == self.name] + if type(self.value) in (int, list): + if type(self.value) is int: + increment = 1 + else: + increment = ([match.value] if type(match.value) is str + else match.value) + if not same_name: + match.value = increment + return True, left_, collected + [match] + same_name[0].value += increment + return True, left_, collected + return True, left_, collected + [match] + + +class BranchPattern(Pattern): + + """Branch/inner node of a pattern tree.""" + + def __init__(self, *children): + self.children = list(children) + + def __repr__(self): + return '%s(%s)' % (self.__class__.__name__, + ', '.join(repr(a) for a in self.children)) + + def flat(self, *types): + if type(self) in types: + return [self] + return sum([child.flat(*types) for child in self.children], 
[]) + + +class Argument(LeafPattern): + + def single_match(self, left): + for n, pattern in enumerate(left): + if type(pattern) is Argument: + return n, Argument(self.name, pattern.value) + return None, None + + @classmethod + def parse(class_, source): + name = re.findall('(<\S*?>)', source)[0] + value = re.findall('\[default: (.*)\]', source, flags=re.I) + return class_(name, value[0] if value else None) + + +class Command(Argument): + + def __init__(self, name, value=False): + self.name, self.value = name, value + + def single_match(self, left): + for n, pattern in enumerate(left): + if type(pattern) is Argument: + if pattern.value == self.name: + return n, Command(self.name, True) + else: + break + return None, None + + +class Option(LeafPattern): + + def __init__(self, short=None, long=None, argcount=0, value=False): + assert argcount in (0, 1) + self.short, self.long, self.argcount = short, long, argcount + self.value = None if value is False and argcount else value + + @classmethod + def parse(class_, option_description): + short, long, argcount, value = None, None, 0, False + options, _, description = option_description.strip().partition(' ') + options = options.replace(',', ' ').replace('=', ' ') + for s in options.split(): + if s.startswith('--'): + long = s + elif s.startswith('-'): + short = s + else: + argcount = 1 + if argcount: + matched = re.findall('\[default: (.*)\]', description, flags=re.I) + value = matched[0] if matched else None + return class_(short, long, argcount, value) + + def single_match(self, left): + for n, pattern in enumerate(left): + if self.name == pattern.name: + return n, pattern + return None, None + + @property + def name(self): + return self.long or self.short + + def __repr__(self): + return 'Option(%r, %r, %r, %r)' % (self.short, self.long, + self.argcount, self.value) + + +class Required(BranchPattern): + + def match(self, left, collected=None): + collected = [] if collected is None else collected + l = left + c = 
collected + for pattern in self.children: + matched, l, c = pattern.match(l, c) + if not matched: + return False, left, collected + return True, l, c + + +class Optional(BranchPattern): + + def match(self, left, collected=None): + collected = [] if collected is None else collected + for pattern in self.children: + m, left, collected = pattern.match(left, collected) + return True, left, collected + + +class OptionsShortcut(Optional): + + """Marker/placeholder for [options] shortcut.""" + + +class OneOrMore(BranchPattern): + + def match(self, left, collected=None): + assert len(self.children) == 1 + collected = [] if collected is None else collected + l = left + c = collected + l_ = None + matched = True + times = 0 + while matched: + # could it be that something didn't match but changed l or c? + matched, l, c = self.children[0].match(l, c) + times += 1 if matched else 0 + if l_ == l: + break + l_ = l + if times >= 1: + return True, l, c + return False, left, collected + + +class Either(BranchPattern): + + def match(self, left, collected=None): + collected = [] if collected is None else collected + outcomes = [] + for pattern in self.children: + matched, _, _ = outcome = pattern.match(left, collected) + if matched: + outcomes.append(outcome) + if outcomes: + return min(outcomes, key=lambda outcome: len(outcome[1])) + return False, left, collected + + +class Tokens(list): + + def __init__(self, source, error=DocoptExit): + self += source.split() if hasattr(source, 'split') else source + self.error = error + + @staticmethod + def from_pattern(source): + source = re.sub(r'([\[\]\(\)\|]|\.\.\.)', r' \1 ', source) + source = [s for s in re.split('\s+|(\S*<.*?>)', source) if s] + return Tokens(source, error=DocoptLanguageError) + + def move(self): + return self.pop(0) if len(self) else None + + def current(self): + return self[0] if len(self) else None + + +def parse_long(tokens, options): + """long ::= '--' chars [ ( ' ' | '=' ) chars ] ;""" + long, eq, value = 
tokens.move().partition('=') + assert long.startswith('--') + value = None if eq == value == '' else value + similar = [o for o in options if o.long == long] + if tokens.error is DocoptExit and similar == []: # if no exact match + similar = [o for o in options if o.long and o.long.startswith(long)] + if len(similar) > 1: # might be simply specified ambiguously 2+ times? + raise tokens.error('%s is not a unique prefix: %s?' % + (long, ', '.join(o.long for o in similar))) + elif len(similar) < 1: + argcount = 1 if eq == '=' else 0 + o = Option(None, long, argcount) + options.append(o) + if tokens.error is DocoptExit: + o = Option(None, long, argcount, value if argcount else True) + else: + o = Option(similar[0].short, similar[0].long, + similar[0].argcount, similar[0].value) + if o.argcount == 0: + if value is not None: + raise tokens.error('%s must not have an argument' % o.long) + else: + if value is None: + if tokens.current() in [None, '--']: + raise tokens.error('%s requires argument' % o.long) + value = tokens.move() + if tokens.error is DocoptExit: + o.value = value if value is not None else True + return [o] + + +def parse_shorts(tokens, options): + """shorts ::= '-' ( chars )* [ [ ' ' ] chars ] ;""" + token = tokens.move() + assert token.startswith('-') and not token.startswith('--') + left = token.lstrip('-') + parsed = [] + while left != '': + short, left = '-' + left[0], left[1:] + similar = [o for o in options if o.short == short] + if len(similar) > 1: + raise tokens.error('%s is specified ambiguously %d times' % + (short, len(similar))) + elif len(similar) < 1: + o = Option(short, None, 0) + options.append(o) + if tokens.error is DocoptExit: + o = Option(short, None, 0, True) + else: # why copying is necessary here? 
+ o = Option(short, similar[0].long, + similar[0].argcount, similar[0].value) + value = None + if o.argcount != 0: + if left == '': + if tokens.current() in [None, '--']: + raise tokens.error('%s requires argument' % short) + value = tokens.move() + else: + value = left + left = '' + if tokens.error is DocoptExit: + o.value = value if value is not None else True + parsed.append(o) + return parsed + + +def parse_pattern(source, options): + tokens = Tokens.from_pattern(source) + result = parse_expr(tokens, options) + if tokens.current() is not None: + raise tokens.error('unexpected ending: %r' % ' '.join(tokens)) + return Required(*result) + + +def parse_expr(tokens, options): + """expr ::= seq ( '|' seq )* ;""" + seq = parse_seq(tokens, options) + if tokens.current() != '|': + return seq + result = [Required(*seq)] if len(seq) > 1 else seq + while tokens.current() == '|': + tokens.move() + seq = parse_seq(tokens, options) + result += [Required(*seq)] if len(seq) > 1 else seq + return [Either(*result)] if len(result) > 1 else result + + +def parse_seq(tokens, options): + """seq ::= ( atom [ '...' 
] )* ;""" + result = [] + while tokens.current() not in [None, ']', ')', '|']: + atom = parse_atom(tokens, options) + if tokens.current() == '...': + atom = [OneOrMore(*atom)] + tokens.move() + result += atom + return result + + +def parse_atom(tokens, options): + """atom ::= '(' expr ')' | '[' expr ']' | 'options' + | long | shorts | argument | command ; + """ + token = tokens.current() + result = [] + if token in '([': + tokens.move() + matching, pattern = {'(': [')', Required], '[': [']', Optional]}[token] + result = pattern(*parse_expr(tokens, options)) + if tokens.move() != matching: + raise tokens.error("unmatched '%s'" % token) + return [result] + elif token == 'options': + tokens.move() + return [OptionsShortcut()] + elif token.startswith('--') and token != '--': + return parse_long(tokens, options) + elif token.startswith('-') and token not in ('-', '--'): + return parse_shorts(tokens, options) + elif token.startswith('<') and token.endswith('>') or token.isupper(): + return [Argument(tokens.move())] + else: + return [Command(tokens.move())] + + +def parse_argv(tokens, options, options_first=False): + """Parse command-line argument vector. 
+ + If options_first: + argv ::= [ long | shorts ]* [ argument ]* [ '--' [ argument ]* ] ; + else: + argv ::= [ long | shorts | argument ]* [ '--' [ argument ]* ] ; + + """ + parsed = [] + while tokens.current() is not None: + if tokens.current() == '--': + return parsed + [Argument(None, v) for v in tokens] + elif tokens.current().startswith('--'): + parsed += parse_long(tokens, options) + elif tokens.current().startswith('-') and tokens.current() != '-': + parsed += parse_shorts(tokens, options) + elif options_first: + return parsed + [Argument(None, v) for v in tokens] + else: + parsed.append(Argument(None, tokens.move())) + return parsed + + +def parse_defaults(doc): + defaults = [] + for s in parse_section('options:', doc): + # FIXME corner case "bla: options: --foo" + _, _, s = s.partition(':') # get rid of "options:" + split = re.split('\n[ \t]*(-\S+?)', '\n' + s)[1:] + split = [s1 + s2 for s1, s2 in zip(split[::2], split[1::2])] + options = [Option.parse(s) for s in split if s.startswith('-')] + defaults += options + return defaults + + +def parse_section(name, source): + pattern = re.compile('^([^\n]*' + name + '[^\n]*\n?(?:[ \t].*?(?:\n|$))*)', + re.IGNORECASE | re.MULTILINE) + return [s.strip() for s in pattern.findall(source)] + + +def formal_usage(section): + _, _, section = section.partition(':') # drop "usage:" + pu = section.split() + return '( ' + ' '.join(') | (' if s == pu[0] else s for s in pu[1:]) + ' )' + + +def extras(help, version, options, doc): + if help and any((o.name in ('-h', '--help')) and o.value for o in options): + print(doc.strip("\n")) + sys.exit() + if version and any(o.name == '--version' and o.value for o in options): + print(version) + sys.exit() + + +class Dict(dict): + def __repr__(self): + return '{%s}' % ',\n '.join('%r: %r' % i for i in sorted(self.items())) + + +def docopt(doc, argv=None, help=True, version=None, options_first=False): + """Parse `argv` based on command-line interface described in `doc`. 
+ + `docopt` creates your command-line interface based on its + description that you pass as `doc`. Such description can contain + --options, , commands, which could be + [optional], (required), (mutually | exclusive) or repeated... + + Parameters + ---------- + doc : str + Description of your command-line interface. + argv : list of str, optional + Argument vector to be parsed. sys.argv[1:] is used if not + provided. + help : bool (default: True) + Set to False to disable automatic help on -h or --help + options. + version : any object + If passed, the object will be printed if --version is in + `argv`. + options_first : bool (default: False) + Set to True to require options precede positional arguments, + i.e. to forbid options and positional arguments intermix. + + Returns + ------- + args : dict + A dictionary, where keys are names of command-line elements + such as e.g. "--verbose" and "", and values are the + parsed values of those elements. + + Example + ------- + >>> from docopt import docopt + >>> doc = ''' + ... Usage: + ... my_program tcp [--timeout=] + ... my_program serial [--baud=] [--timeout=] + ... my_program (-h | --help | --version) + ... + ... Options: + ... -h, --help Show this screen and exit. + ... --baud= Baudrate [default: 9600] + ... 
''' + >>> argv = ['tcp', '127.0.0.1', '80', '--timeout', '30'] + >>> docopt(doc, argv) + {'--baud': '9600', + '--help': False, + '--timeout': '30', + '--version': False, + '': '127.0.0.1', + '': '80', + 'serial': False, + 'tcp': True} + + See also + -------- + * For video introduction see http://docopt.org + * Full documentation is available in README.rst as well as online + at https://github.com/docopt/docopt#readme + + """ + argv = sys.argv[1:] if argv is None else argv + + usage_sections = parse_section('usage:', doc) + if len(usage_sections) == 0: + raise DocoptLanguageError('"usage:" (case-insensitive) not found.') + if len(usage_sections) > 1: + raise DocoptLanguageError('More than one "usage:" (case-insensitive).') + DocoptExit.usage = usage_sections[0] + + options = parse_defaults(doc) + pattern = parse_pattern(formal_usage(DocoptExit.usage), options) + # [default] syntax for argument is disabled + #for a in pattern.flat(Argument): + # same_name = [d for d in arguments if d.name == a.name] + # if same_name: + # a.value = same_name[0].value + argv = parse_argv(Tokens(argv), list(options), options_first) + pattern_options = set(pattern.flat(Option)) + for options_shortcut in pattern.flat(OptionsShortcut): + doc_options = parse_defaults(doc) + options_shortcut.children = list(set(doc_options) - pattern_options) + #if any_options: + # options_shortcut.children += [Option(o.short, o.long, o.argcount) + # for o in argv if type(o) is Option] + extras(help, version, argv, doc) + matched, left, collected = pattern.fix().match(argv) + if matched and left == []: # better error message if left? + return Dict((a.name, a.value) for a in (pattern.flat() + collected)) + raise DocoptExit() diff --git a/movie_spy/exifread/__init__.py b/movie_spy/exifread/__init__.py new file mode 100644 index 0000000..1c033b7 --- /dev/null +++ b/movie_spy/exifread/__init__.py @@ -0,0 +1,262 @@ +""" +Read Exif metadata from tiff and jpeg files. 
+""" + +from .exif_log import get_logger +from .classes import * +from .tags import * +from .utils import ord_ + +__version__ = '2.1.2' + +logger = get_logger() + + +def increment_base(data, base): + return ord_(data[base + 2]) * 256 + ord_(data[base + 3]) + 2 + + +def process_file(f, stop_tag=DEFAULT_STOP_TAG, details=True, strict=False, debug=False, truncate_tags=True): + """ + Process an image file (expects an open file object). + + This is the function that has to deal with all the arbitrary nasty bits + of the EXIF standard. + """ + + # by default do not fake an EXIF beginning + fake_exif = 0 + + # determine whether it's a JPEG or TIFF + data = f.read(12) + if data[0:4] in [b'II*\x00', b'MM\x00*']: + # it's a TIFF file + logger.debug("TIFF format recognized in data[0:4]") + f.seek(0) + endian = f.read(1) + f.read(1) + offset = 0 + elif data[0:2] == b'\xFF\xD8': + # it's a JPEG file + logger.debug("JPEG format recognized data[0:2]=0x%X%X", ord_(data[0]), ord_(data[1])) + base = 2 + logger.debug("data[2]=0x%X data[3]=0x%X data[6:10]=%s", + ord_(data[2]), ord_(data[3]), data[6:10]) + while ord_(data[2]) == 0xFF and data[6:10] in (b'JFIF', b'JFXX', b'OLYM', b'Phot'): + length = ord_(data[4]) * 256 + ord_(data[5]) + logger.debug(" Length offset is %s", length) + f.read(length - 8) + # fake an EXIF beginning of file + # I don't think this is used. 
--gd + data = b'\xFF\x00' + f.read(10) + fake_exif = 1 + if base > 2: + logger.debug(" Added to base") + base = base + length + 4 - 2 + else: + logger.debug(" Added to zero") + base = length + 4 + logger.debug(" Set segment base to 0x%X", base) + + # Big ugly patch to deal with APP2 (or other) data coming before APP1 + f.seek(0) + # in theory, this could be insufficient since 64K is the maximum size--gd + data = f.read(base + 4000) + # base = 2 + while 1: + logger.debug(" Segment base 0x%X", base) + if data[base:base + 2] == b'\xFF\xE1': + # APP1 + logger.debug(" APP1 at base 0x%X", base) + logger.debug(" Length: 0x%X 0x%X", ord_(data[base + 2]), + ord_(data[base + 3])) + logger.debug(" Code: %s", data[base + 4:base + 8]) + if data[base + 4:base + 8] == b"Exif": + logger.debug(" Decrement base by 2 to get to pre-segment header (for compatibility with later code)") + base -= 2 + break + increment = increment_base(data, base) + logger.debug(" Increment base by %s", increment) + base += increment + elif data[base:base + 2] == b'\xFF\xE0': + # APP0 + logger.debug(" APP0 at base 0x%X", base) + logger.debug(" Length: 0x%X 0x%X", ord_(data[base + 2]), + ord_(data[base + 3])) + logger.debug(" Code: %s", data[base + 4:base + 8]) + increment = increment_base(data, base) + logger.debug(" Increment base by %s", increment) + base += increment + elif data[base:base + 2] == b'\xFF\xE2': + # APP2 + logger.debug(" APP2 at base 0x%X", base) + logger.debug(" Length: 0x%X 0x%X", ord_(data[base + 2]), + ord_(data[base + 3])) + logger.debug(" Code: %s", data[base + 4:base + 8]) + increment = increment_base(data, base) + logger.debug(" Increment base by %s", increment) + base += increment + elif data[base:base + 2] == b'\xFF\xEE': + # APP14 + logger.debug(" APP14 Adobe segment at base 0x%X", base) + logger.debug(" Length: 0x%X 0x%X", ord_(data[base + 2]), + ord_(data[base + 3])) + logger.debug(" Code: %s", data[base + 4:base + 8]) + increment = increment_base(data, base) + logger.debug(" 
Increment base by %s", increment) + base += increment + logger.debug(" There is useful EXIF-like data here, but we have no parser for it.") + elif data[base:base + 2] == b'\xFF\xDB': + logger.debug(" JPEG image data at base 0x%X No more segments are expected.", + base) + break + elif data[base:base + 2] == b'\xFF\xD8': + # APP12 + logger.debug(" FFD8 segment at base 0x%X", base) + logger.debug(" Got 0x%X 0x%X and %s instead", + ord_(data[base]), + ord_(data[base + 1]), + data[4 + base:10 + base]) + logger.debug(" Length: 0x%X 0x%X", ord_(data[base + 2]), + ord_(data[base + 3])) + logger.debug(" Code: %s", data[base + 4:base + 8]) + increment = increment_base(data, base) + logger.debug(" Increment base by %s", increment) + base += increment + elif data[base:base + 2] == b'\xFF\xEC': + # APP12 + logger.debug(" APP12 XMP (Ducky) or Pictureinfo segment at base 0x%X", + base) + logger.debug(" Got 0x%X and 0x%X instead", ord_(data[base]), + ord_(data[base + 1])) + logger.debug(" Length: 0x%X 0x%X", + ord_(data[base + 2]), + ord_(data[base + 3])) + logger.debug("Code: %s", data[base + 4:base + 8]) + increment = increment_base(data, base) + logger.debug(" Increment base by %s", increment) + base += increment + logger.debug( + " There is useful EXIF-like data here (quality, comment, copyright), but we have no parser for it.") + else: + try: + increment = increment_base(data, base) + logger.debug(" Got 0x%X and 0x%X instead", + ord_(data[base]), + ord_(data[base + 1])) + except IndexError: + logger.debug(" Unexpected/unhandled segment type or file content.") + return {} + else: + logger.debug(" Increment base by %s", increment) + base += increment + f.seek(base + 12) + if ord_(data[2 + base]) == 0xFF and data[6 + base:10 + base] == b'Exif': + # detected EXIF header + offset = f.tell() + endian = f.read(1) + #HACK TEST: endian = 'M' + elif ord_(data[2 + base]) == 0xFF and data[6 + base:10 + base + 1] == b'Ducky': + # detected Ducky header. 
+ logger.debug("EXIF-like header (normally 0xFF and code): 0x%X and %s", + ord_(data[2 + base]), data[6 + base:10 + base + 1]) + offset = f.tell() + endian = f.read(1) + elif ord_(data[2 + base]) == 0xFF and data[6 + base:10 + base + 1] == b'Adobe': + # detected APP14 (Adobe) + logger.debug("EXIF-like header (normally 0xFF and code): 0x%X and %s", + ord_(data[2 + base]), data[6 + base:10 + base + 1]) + offset = f.tell() + endian = f.read(1) + else: + # no EXIF information + logger.debug("No EXIF header expected data[2+base]==0xFF and data[6+base:10+base]===Exif (or Duck)") + logger.debug("Did get 0x%X and %s", + ord_(data[2 + base]), data[6 + base:10 + base + 1]) + return {} + else: + # file format not recognized + logger.debug("File format not recognized.") + return {} + + endian = chr(ord_(endian[0])) + # deal with the EXIF info we found + logger.debug("Endian format is %s (%s)", endian, { + 'I': 'Intel', + 'M': 'Motorola', + '\x01': 'Adobe Ducky', + 'd': 'XMP/Adobe unknown' + }[endian]) + + hdr = ExifHeader(f, endian, offset, fake_exif, strict, debug, details, truncate_tags) + ifd_list = hdr.list_ifd() + thumb_ifd = False + ctr = 0 + for ifd in ifd_list: + if ctr == 0: + ifd_name = 'Image' + elif ctr == 1: + ifd_name = 'Thumbnail' + thumb_ifd = ifd + else: + ifd_name = 'IFD %d' % ctr + logger.debug('IFD %d (%s) at offset %s:', ctr, ifd_name, ifd) + hdr.dump_ifd(ifd, ifd_name, stop_tag=stop_tag) + ctr += 1 + # EXIF IFD + exif_off = hdr.tags.get('Image ExifOffset') + if exif_off: + logger.debug('Exif SubIFD at offset %s:', exif_off.values[0]) + hdr.dump_ifd(exif_off.values[0], 'EXIF', stop_tag=stop_tag) + + # deal with MakerNote contained in EXIF IFD + # (Some apps use MakerNote tags but do not use a format for which we + # have a description, do not process these). 
+ if details and 'EXIF MakerNote' in hdr.tags and 'Image Make' in hdr.tags: + hdr.decode_maker_note() + + # extract thumbnails + if details and thumb_ifd: + hdr.extract_tiff_thumbnail(thumb_ifd) + hdr.extract_jpeg_thumbnail() + + # parse XMP tags (experimental) + if debug and details: + xmp_string = b'' + # Easy we already have them + if 'Image ApplicationNotes' in hdr.tags: + logger.debug('XMP present in Exif') + xmp_string = make_string(hdr.tags['Image ApplicationNotes'].values) + # We need to look in the entire file for the XML + else: + logger.debug('XMP not in Exif, searching file for XMP info...') + xml_started = False + xml_finished = False + for line in f: + open_tag = line.find(b'') + + if open_tag != -1: + xml_started = True + line = line[open_tag:] + logger.debug('XMP found opening tag at line position %s' % open_tag) + + if close_tag != -1: + logger.debug('XMP found closing tag at line position %s' % close_tag) + line_offset = 0 + if open_tag != -1: + line_offset = open_tag + line = line[:(close_tag - line_offset) + 12] + xml_finished = True + + if xml_started: + xmp_string += line + + if xml_finished: + break + + logger.debug('XMP Finished searching for info') + if xmp_string: + hdr.parse_xmp(xmp_string) + + return hdr.tags diff --git a/movie_spy/exifread/classes.py b/movie_spy/exifread/classes.py new file mode 100644 index 0000000..6fb7f9a --- /dev/null +++ b/movie_spy/exifread/classes.py @@ -0,0 +1,560 @@ +import struct +import re + +from .exif_log import get_logger +from .utils import s2n_motorola, s2n_intel, Ratio +from .tags import * + +logger = get_logger() + +try: + basestring +except NameError: + basestring = str + +class IfdTag: + """ + Eases dealing with tags. 
+ """ + + def __init__(self, printable, tag, field_type, values, field_offset, + field_length): + # printable version of data + self.printable = printable + # tag ID number + self.tag = tag + # field type as index into FIELD_TYPES + self.field_type = field_type + # offset of start of field in bytes from beginning of IFD + self.field_offset = field_offset + # length of data field in bytes + self.field_length = field_length + # either a string or array of data items + self.values = values + + def __str__(self): + return self.printable + + def __repr__(self): + try: + s = '(0x%04X) %s=%s @ %d' % (self.tag, + FIELD_TYPES[self.field_type][2], + self.printable, + self.field_offset) + except: + s = '(%s) %s=%s @ %s' % (str(self.tag), + FIELD_TYPES[self.field_type][2], + self.printable, + str(self.field_offset)) + return s + + +class ExifHeader: + """ + Handle an EXIF header. + """ + def __init__(self, file, endian, offset, fake_exif, strict, + debug=False, detailed=True, truncate_tags=True): + self.file = file + self.endian = endian + self.offset = offset + self.fake_exif = fake_exif + self.strict = strict + self.debug = debug + self.detailed = detailed + self.truncate_tags = truncate_tags + self.tags = {} + + def s2n(self, offset, length, signed=0): + """ + Convert slice to integer, based on sign and endian flags. + + Usually this offset is assumed to be relative to the beginning of the + start of the EXIF information. + For some cameras that use relative tags, this offset may be relative + to some other starting point. + """ + self.file.seek(self.offset + offset) + sliced = self.file.read(length) + if self.endian == 'I': + val = s2n_intel(sliced) + else: + val = s2n_motorola(sliced) + # Sign extension? 
+ if signed: + msb = 1 << (8 * length - 1) + if val & msb: + val -= (msb << 1) + return val + + def n2s(self, offset, length): + """Convert offset to string.""" + s = '' + for dummy in range(length): + if self.endian == 'I': + s += chr(offset & 0xFF) + else: + s = chr(offset & 0xFF) + s + offset = offset >> 8 + return s + + def _first_ifd(self): + """Return first IFD.""" + return self.s2n(4, 4) + + def _next_ifd(self, ifd): + """Return the pointer to next IFD.""" + entries = self.s2n(ifd, 2) + next_ifd = self.s2n(ifd + 2 + 12 * entries, 4) + if next_ifd == ifd: + return 0 + else: + return next_ifd + + def list_ifd(self): + """Return the list of IFDs in the header.""" + i = self._first_ifd() + ifds = [] + while i: + ifds.append(i) + i = self._next_ifd(i) + return ifds + + def dump_ifd(self, ifd, ifd_name, tag_dict=EXIF_TAGS, relative=0, stop_tag=DEFAULT_STOP_TAG): + """ + Return a list of entries in the given IFD. + """ + # make sure we can process the entries + try: + entries = self.s2n(ifd, 2) + except TypeError: + logger.warning("Possibly corrupted IFD: %s" % ifd) + return + + for i in range(entries): + # entry is index of start of this IFD in the file + entry = ifd + 2 + 12 * i + tag = self.s2n(entry, 2) + + # get tag name early to avoid errors, help debug + tag_entry = tag_dict.get(tag) + if tag_entry: + tag_name = tag_entry[0] + else: + tag_name = 'Tag 0x%04X' % tag + + # ignore certain tags for faster processing + if not (not self.detailed and tag in IGNORE_TAGS): + field_type = self.s2n(entry + 2, 2) + + # unknown field type + if not 0 < field_type < len(FIELD_TYPES): + if not self.strict: + continue + else: + raise ValueError('Unknown type %d in tag 0x%04X' % (field_type, tag)) + + type_length = FIELD_TYPES[field_type][0] + count = self.s2n(entry + 4, 4) + # Adjust for tag id/type/count (2+2+4 bytes) + # Now we point at either the data or the 2nd level offset + offset = entry + 8 + + # If the value fits in 4 bytes, it is inlined, else we + # need to jump 
ahead again. + if count * type_length > 4: + # offset is not the value; it's a pointer to the value + # if relative we set things up so s2n will seek to the right + # place when it adds self.offset. Note that this 'relative' + # is for the Nikon type 3 makernote. Other cameras may use + # other relative offsets, which would have to be computed here + # slightly differently. + if relative: + tmp_offset = self.s2n(offset, 4) + offset = tmp_offset + ifd - 8 + if self.fake_exif: + offset += 18 + else: + offset = self.s2n(offset, 4) + + field_offset = offset + values = None + if field_type == 2: + # special case: null-terminated ASCII string + # XXX investigate + # sometimes gets too big to fit in int value + if count != 0: # and count < (2**31): # 2E31 is hardware dependant. --gd + file_position = self.offset + offset + try: + self.file.seek(file_position) + values = self.file.read(count) + + # Drop any garbage after a null. + values = values.split(b'\x00', 1)[0] + if isinstance(values, bytes): + try: + values = values.decode("utf-8") + except UnicodeDecodeError: + logger.warning("Possibly corrupted field %s in %s IFD", tag_name, ifd_name) + except OverflowError: + logger.warn('OverflowError at position: %s, length: %s', file_position, count) + values = '' + except MemoryError: + logger.warn('MemoryError at position: %s, length: %s', file_position, count) + values = '' + else: + values = '' + else: + values = [] + signed = (field_type in [6, 8, 9, 10]) + + # XXX investigate + # some entries get too big to handle could be malformed + # file or problem with self.s2n + if count < 1000: + for dummy in range(count): + if field_type in (5, 10): + # a ratio + value = Ratio(self.s2n(offset, 4, signed), + self.s2n(offset + 4, 4, signed)) + else: + value = self.s2n(offset, type_length, signed) + values.append(value) + offset = offset + type_length + # The test above causes problems with tags that are + # supposed to have long values! Fix up one important case. 
+ elif tag_name in ('MakerNote', makernote.canon.CAMERA_INFO_TAG_NAME): + for dummy in range(count): + value = self.s2n(offset, type_length, signed) + values.append(value) + offset = offset + type_length + + # now 'values' is either a string or an array + if count == 1 and field_type != 2: + printable = str(values[0]) + elif count > 50 and len(values) > 20 and not isinstance(values, basestring) : + if self.truncate_tags : + printable = str(values[0:20])[0:-1] + ", ... ]" + else: + printable = str(values[0:-1]) + else: + try: + printable = str(values) + except UnicodeEncodeError: + printable = unicode(values) + # compute printable version of values + if tag_entry: + # optional 2nd tag element is present + if len(tag_entry) != 1: + if callable(tag_entry[1]): + # call mapping function + printable = tag_entry[1](values) + elif type(tag_entry[1]) is tuple: + ifd_info = tag_entry[1] + try: + logger.debug('%s SubIFD at offset %d:', ifd_info[0], values[0]) + self.dump_ifd(values[0], ifd_info[0], tag_dict=ifd_info[1], stop_tag=stop_tag) + except IndexError: + logger.warn('No values found for %s SubIFD', ifd_info[0]) + else: + printable = '' + for i in values: + # use lookup table for this tag + printable += tag_entry[1].get(i, repr(i)) + + self.tags[ifd_name + ' ' + tag_name] = IfdTag(printable, tag, + field_type, + values, field_offset, + count * type_length) + try: + tag_value = repr(self.tags[ifd_name + ' ' + tag_name]) + # fix for python2's handling of unicode values + except UnicodeEncodeError: + tag_value = unicode(self.tags[ifd_name + ' ' + tag_name]) + logger.debug(' %s: %s', tag_name, tag_value) + + if tag_name == stop_tag: + break + + def extract_tiff_thumbnail(self, thumb_ifd): + """ + Extract uncompressed TIFF thumbnail. 
+
+        Take advantage of the pre-existing layout in the thumbnail IFD as
+        much as possible
+        """
+        thumb = self.tags.get('Thumbnail Compression')
+        if not thumb or thumb.printable != 'Uncompressed TIFF':
+            return
+
+        entries = self.s2n(thumb_ifd, 2)
+        # this is header plus offset to IFD ...
+        # NOTE(review): 'tiff' is built as str but self.file.read() returns
+        # bytes on Python 3 -- this concatenation only works on Python 2;
+        # confirm the intended runtime.
+        if self.endian == 'M':
+            tiff = 'MM\x00*\x00\x00\x00\x08'
+        else:
+            tiff = 'II*\x00\x08\x00\x00\x00'
+        # ... plus thumbnail IFD data plus a null "next IFD" pointer
+        self.file.seek(self.offset + thumb_ifd)
+        tiff += self.file.read(entries * 12 + 2) + '\x00\x00\x00\x00'
+
+        # fix up large value offset pointers into data area
+        for i in range(entries):
+            entry = thumb_ifd + 2 + 12 * i
+            tag = self.s2n(entry, 2)
+            field_type = self.s2n(entry + 2, 2)
+            type_length = FIELD_TYPES[field_type][0]
+            count = self.s2n(entry + 4, 4)
+            old_offset = self.s2n(entry + 8, 4)
+            # start of the 4-byte pointer area in entry
+            ptr = i * 12 + 18
+            # remember strip offsets location
+            if tag == 0x0111:
+                strip_off = ptr
+                strip_len = count * type_length
+            # is it in the data area?
+            if count * type_length > 4:
+                # update offset pointer (nasty "strings are immutable" crap)
+                # should be able to say "tiff[ptr:ptr+4]=newoff"
+                newoff = len(tiff)
+                tiff = tiff[:ptr] + self.n2s(newoff, 4) + tiff[ptr + 4:]
+                # remember strip offsets location
+                if tag == 0x0111:
+                    strip_off = newoff
+                    strip_len = 4
+                # get original data and store it
+                self.file.seek(self.offset + old_offset)
+                tiff += self.file.read(count * type_length)
+
+        # add pixel strips and update strip offset info
+        # NOTE(review): strip_off/strip_len are only bound if a 0x0111
+        # (StripOffsets) entry was seen above -- presumably guaranteed for an
+        # uncompressed TIFF thumbnail IFD; verify against the TIFF 6.0 spec.
+        old_offsets = self.tags['Thumbnail StripOffsets'].values
+        old_counts = self.tags['Thumbnail StripByteCounts'].values
+        for i in range(len(old_offsets)):
+            # update offset pointer (more nasty "strings are immutable" crap)
+            offset = self.n2s(len(tiff), strip_len)
+            tiff = tiff[:strip_off] + offset + tiff[strip_off + strip_len:]
+            strip_off += strip_len
+            # add pixel strip to end
+            self.file.seek(self.offset + old_offsets[i])
+            tiff += self.file.read(old_counts[i])
+
+        self.tags['TIFFThumbnail'] = tiff
+
+    def extract_jpeg_thumbnail(self):
+        """
+        Extract JPEG thumbnail.
+
+        (Thankfully the JPEG data is stored as a unit.)
+        """
+        thumb_offset = self.tags.get('Thumbnail JPEGInterchangeFormat')
+        if thumb_offset:
+            self.file.seek(self.offset + thumb_offset.values[0])
+            size = self.tags['Thumbnail JPEGInterchangeFormatLength'].values[0]
+            self.tags['JPEGThumbnail'] = self.file.read(size)
+
+        # Sometimes in a TIFF file, a JPEG thumbnail is hidden in the MakerNote
+        # since it's not allowed in a uncompressed TIFF IFD
+        if 'JPEGThumbnail' not in self.tags:
+            thumb_offset = self.tags.get('MakerNote JPEGThumbnail')
+            if thumb_offset:
+                self.file.seek(self.offset + thumb_offset.values[0])
+                self.tags['JPEGThumbnail'] = self.file.read(thumb_offset.field_length)
+
+    def decode_maker_note(self):
+        """
+        Decode all the camera-specific MakerNote formats
+
+        Note is the data that comprises this MakerNote.
+        The MakerNote will likely have pointers in it that point to other
+        parts of the file. We'll use self.offset as the starting point for
+        most of those pointers, since they are relative to the beginning
+        of the file.
+        If the MakerNote is in a newer format, it may use relative addressing
+        within the MakerNote. In that case we'll use relative addresses for
+        the pointers.
+        As an aside: it's not just to be annoying that the manufacturers use
+        relative offsets. It's so that if the makernote has to be moved by the
+        picture software all of the offsets don't have to be adjusted. Overall,
+        this is probably the right strategy for makernotes, though the spec is
+        ambiguous.
+        The spec does not appear to imagine that makernotes would
+        follow EXIF format internally. Once they did, it's ambiguous whether
+        the offsets should be from the header at the start of all the EXIF info,
+        or from the header at the start of the makernote.
+        """
+        note = self.tags['EXIF MakerNote']
+
+        # Some apps use MakerNote tags but do not use a format for which we
+        # have a description, so just do a raw dump for these.
+        make = self.tags['Image Make'].printable
+
+        # Nikon
+        # The maker note usually starts with the word Nikon, followed by the
+        # type of the makernote (1 or 2, as a short).  If the word Nikon is
+        # not at the start of the makernote, it's probably type 2, since some
+        # cameras work that way.
+        if 'NIKON' in make:
+            # values [78, 105, 107, 111, 110] spell b'Nikon'; the trailing
+            # short selects the makernote revision.
+            if note.values[0:7] == [78, 105, 107, 111, 110, 0, 1]:
+                logger.debug("Looks like a type 1 Nikon MakerNote.")
+                self.dump_ifd(note.field_offset + 8, 'MakerNote',
+                              tag_dict=makernote.nikon.TAGS_OLD)
+            elif note.values[0:7] == [78, 105, 107, 111, 110, 0, 2]:
+                logger.debug("Looks like a labeled type 2 Nikon MakerNote")
+                if note.values[12:14] != [0, 42] and note.values[12:14] != [42, 0]:
+                    raise ValueError("Missing marker tag '42' in MakerNote.")
+                # skip the Makernote label and the TIFF header
+                self.dump_ifd(note.field_offset + 10 + 8, 'MakerNote',
+                              tag_dict=makernote.nikon.TAGS_NEW, relative=1)
+            else:
+                # E99x or D1
+                logger.debug("Looks like an unlabeled type 2 Nikon MakerNote")
+                self.dump_ifd(note.field_offset, 'MakerNote',
+                              tag_dict=makernote.nikon.TAGS_NEW)
+            return
+
+        # Olympus
+        if make.startswith('OLYMPUS'):
+            self.dump_ifd(note.field_offset + 8, 'MakerNote',
+                          tag_dict=makernote.olympus.TAGS)
+            # TODO
+            #for i in (('MakerNote Tag 0x2020', makernote.OLYMPUS_TAG_0x2020),):
+            #    self.decode_olympus_tag(self.tags[i[0]].values, i[1])
+            #return
+
+        # Casio
+        if 'CASIO' in make or 'Casio' in make:
+            self.dump_ifd(note.field_offset, 'MakerNote',
+                          tag_dict=makernote.casio.TAGS)
+            return
+
+        # Fujifilm
+        if make == 'FUJIFILM':
+            # bug: everything else is "Motorola" endian, but the MakerNote
+            # is "Intel" endian
+            endian = self.endian
+            self.endian = 'I'
+            # bug: IFD offsets are from beginning of MakerNote, not
+            # beginning of file header
+            offset = self.offset
+            self.offset += note.field_offset
+            # process note with bogus values (note is actually at offset 12)
+            self.dump_ifd(12, 'MakerNote', tag_dict=makernote.fujifilm.TAGS)
+            # reset to correct values
+            self.endian = endian
+            self.offset = offset
+            return
+
+        # Apple
+        # values [65, ...] spell b'Apple iOS\x00'
+        if make == 'Apple' and \
+                note.values[0:10] == [65, 112, 112, 108, 101, 32, 105, 79, 83, 0]:
+            t = self.offset
+            self.offset += note.field_offset+14
+            self.dump_ifd(0, 'MakerNote',
+                          tag_dict=makernote.apple.TAGS)
+            self.offset = t
+            return
+
+        # Canon
+        if make == 'Canon':
+            self.dump_ifd(note.field_offset, 'MakerNote',
+                          tag_dict=makernote.canon.TAGS)
+
+            for i in (('MakerNote Tag 0x0001', makernote.canon.CAMERA_SETTINGS),
+                      ('MakerNote Tag 0x0002', makernote.canon.FOCAL_LENGTH),
+                      ('MakerNote Tag 0x0004', makernote.canon.SHOT_INFO),
+                      ('MakerNote Tag 0x0026', makernote.canon.AF_INFO_2),
+                      ('MakerNote Tag 0x0093', makernote.canon.FILE_INFO)):
+                if i[0] in self.tags:
+                    logger.debug('Canon ' + i[0])
+                    self._canon_decode_tag(self.tags[i[0]].values, i[1])
+                    del self.tags[i[0]]
+            if makernote.canon.CAMERA_INFO_TAG_NAME in self.tags:
+                tag = self.tags[makernote.canon.CAMERA_INFO_TAG_NAME]
+                logger.debug('Canon CameraInfo')
+                self._canon_decode_camera_info(tag)
+                del self.tags[makernote.canon.CAMERA_INFO_TAG_NAME]
+            return
+
+    def _olympus_decode_tag(self, value, mn_tags):
+        """ TODO Decode Olympus MakerNote tag based on offset within tag."""
+        pass
+
+    def _canon_decode_tag(self, value, mn_tags):
+        """
+        Decode Canon MakerNote tag based on offset within tag.
+
+        See http://www.burren.cx/david/canon.html by David Burren
+        """
+        # mn_tags: index -> (name[, lookup-dict]) mapping for this note type
+        for i in range(1, len(value)):
+            tag = mn_tags.get(i, ('Unknown', ))
+            name = tag[0]
+            if len(tag) > 1:
+                val = tag[1].get(value[i], 'Unknown')
+            else:
+                val = value[i]
+            try:
+                logger.debug(" %s %s %s", i, name, hex(value[i]))
+            except TypeError:
+                logger.debug(" %s %s %s", i, name, value[i])
+
+            # it's not a real IFD Tag but we fake one to make everybody
+            # happy. this will have a "proprietary" type
+            self.tags['MakerNote ' + name] = IfdTag(str(val), None, 0, None,
+                                                    None, None)
+
+    def _canon_decode_camera_info(self, camera_info_tag):
+        """
+        Decode the variable length encoded camera info section.
+ """ + model = self.tags.get('Image Model', None) + if not model: + return + model = str(model.values) + + camera_info_tags = None + for (model_name_re, tag_desc) in makernote.canon.CAMERA_INFO_MODEL_MAP.items(): + if re.search(model_name_re, model): + camera_info_tags = tag_desc + break + else: + return + + # We are assuming here that these are all unsigned bytes (Byte or + # Unknown) + if camera_info_tag.field_type not in (1, 7): + return + camera_info = struct.pack('<%dB' % len(camera_info_tag.values), + *camera_info_tag.values) + + # Look for each data value and decode it appropriately. + for offset, tag in camera_info_tags.items(): + tag_format = tag[1] + tag_size = struct.calcsize(tag_format) + if len(camera_info) < offset + tag_size: + continue + packed_tag_value = camera_info[offset:offset + tag_size] + tag_value = struct.unpack(tag_format, packed_tag_value)[0] + + tag_name = tag[0] + if len(tag) > 2: + if callable(tag[2]): + tag_value = tag[2](tag_value) + else: + tag_value = tag[2].get(tag_value, tag_value) + logger.debug(" %s %s", tag_name, tag_value) + + self.tags['MakerNote ' + tag_name] = IfdTag(str(tag_value), None, + 0, None, None, None) + + def parse_xmp(self, xmp_string): + import xml.dom.minidom + + logger.debug('XMP cleaning data') + + xml = xml.dom.minidom.parseString(xmp_string) + pretty = xml.toprettyxml() + cleaned = [] + for line in pretty.splitlines(): + if line.strip(): + cleaned.append(line) + self.tags['Image ApplicationNotes'] = IfdTag('\n'.join(cleaned), None, + 1, None, None, None) diff --git a/movie_spy/exifread/exif_log.py b/movie_spy/exifread/exif_log.py new file mode 100644 index 0000000..82d5aea --- /dev/null +++ b/movie_spy/exifread/exif_log.py @@ -0,0 +1,76 @@ +""" +Custom log output +""" + +import sys +import logging + +TEXT_NORMAL = 0 +TEXT_BOLD = 1 +TEXT_RED = 31 +TEXT_GREEN = 32 +TEXT_YELLOW = 33 +TEXT_BLUE = 34 +TEXT_MAGENTA = 35 +TEXT_CYAN = 36 + + +def get_logger(): + return logging.getLogger('exifread') + + +def 
setup_logger(debug, color):
+    """Configure the logger."""
+    # NOTE(review): each call adds another Handler to the 'exifread' logger;
+    # calling this more than once duplicates output -- confirm single-call use.
+    if debug:
+        log_level = logging.DEBUG
+    else:
+        log_level = logging.INFO
+
+    logger = logging.getLogger('exifread')
+    stream = Handler(log_level, debug, color)
+    logger.addHandler(stream)
+    logger.setLevel(log_level)
+
+
+class Formatter(logging.Formatter):
+    # Formats records as '%(levelname)-6s %(message)s' in debug mode (plain
+    # '%(message)s' otherwise); with color enabled, wraps the level name in
+    # ANSI escape codes chosen by severity.
+
+    def __init__(self, debug=False, color=False):
+        self.color = color
+        self.debug = debug
+        if self.debug:
+            log_format = '%(levelname)-6s %(message)s'
+        else:
+            log_format = '%(message)s'
+        logging.Formatter.__init__(self, log_format)
+
+    def format(self, record):
+        # Colorize only when both debug and color are requested.
+        if self.debug and self.color:
+            if record.levelno >= logging.CRITICAL:
+                color = TEXT_RED
+            elif record.levelno >= logging.ERROR:
+                color = TEXT_RED
+            elif record.levelno >= logging.WARNING:
+                color = TEXT_YELLOW
+            elif record.levelno >= logging.INFO:
+                color = TEXT_GREEN
+            elif record.levelno >= logging.DEBUG:
+                color = TEXT_CYAN
+            else:
+                color = TEXT_NORMAL
+            record.levelname = "\x1b[%sm%s\x1b[%sm" % (color, record.levelname, TEXT_NORMAL)
+        return logging.Formatter.format(self, record)
+
+
+class Handler(logging.StreamHandler):
+    # StreamHandler bound to stdout with the custom Formatter attached.
+
+    def __init__(self, log_level, debug=False, color=False):
+        self.color = color
+        self.debug = debug
+        logging.StreamHandler.__init__(self, sys.stdout)
+        self.setFormatter(Formatter(debug, color))
+        self.setLevel(log_level)
+# The string below is disabled (commented-out) code kept as a no-op literal.
+"""
+    def emit(self, record):
+        record.msg = "\x1b[%sm%s\x1b[%sm" % (TEXT_BOLD, record.msg, TEXT_NORMAL)
+        logging.StreamHandler.emit(self, record)
+"""
\ No newline at end of file
diff --git a/movie_spy/exifread/utils.py b/movie_spy/exifread/utils.py
new file mode 100644
index 0000000..7f5da63
--- /dev/null
+++ b/movie_spy/exifread/utils.py
@@ -0,0 +1,116 @@
+"""
+Misc utilities.
+"""
+
+from fractions import Fraction
+
+
+def ord_(dta):
+    """Return the ordinal of a 1-char str, or the value itself if already an int."""
+    if isinstance(dta, str):
+        return ord(dta)
+    return dta
+
+
+def make_string(seq):
+    """
+    Don't throw an exception when given an out of range character.
+ """ + string = '' + for c in seq: + # Screen out non-printing characters + try: + if 32 <= c and c < 256: + string += chr(c) + except TypeError: + pass + # If no printing chars + if not string: + return str(seq) + return string + + +def make_string_uc(seq): + """ + Special version to deal with the code in the first 8 bytes of a user comment. + First 8 bytes gives coding system e.g. ASCII vs. JIS vs Unicode. + """ + seq = seq[8:] + # Of course, this is only correct if ASCII, and the standard explicitly + # allows JIS and Unicode. + return make_string(seq) + + +def s2n_motorola(string): + """Extract multi-byte integer in Motorola format (little endian).""" + x = 0 + for c in string: + x = (x << 8) | ord_(c) + return x + + +def s2n_intel(string): + """Extract multi-byte integer in Intel format (big endian).""" + x = 0 + y = 0 + for c in string: + x = x | (ord_(c) << y) + y += + 8 + return x + +def get_gps_coords(tags): + + lng_ref_tag_name = "GPS GPSLongitudeRef" + lng_tag_name = "GPS GPSLongitude" + lat_ref_tag_name = "GPS GPSLatitudeRef" + lat_tag_name = "GPS GPSLatitude" + + # Check if these tags are present + gps_tags = [lng_ref_tag_name,lng_tag_name,lat_tag_name,lat_tag_name] + for tag in gps_tags: + if not tag in tags.keys(): + return None + + lng_ref_val = tags[lng_ref_tag_name].values + lng_coord_val = [c.decimal() for c in tags[lng_tag_name].values] + + lat_ref_val = tags[lat_ref_tag_name].values + lat_coord_val = [c.decimal() for c in tags[lat_tag_name].values] + + lng_coord = sum([c/60**i for i,c in enumerate(lng_coord_val)]) + lng_coord *= (-1)**(lng_ref_val=="W") + + lat_coord = sum([c/60**i for i,c in enumerate(lat_coord_val)]) + lat_coord *= (-1)**(lat_ref_val=="S") + + return (lat_coord, lng_coord) + +class Ratio(Fraction): + """ + Ratio object that eventually will be able to reduce itself to lowest + common denominator for printing. 
+ """ + + # We're immutable, so use __new__ not __init__ + def __new__(cls, numerator=0, denominator=None): + try: + self = super(Ratio, cls).__new__(cls, numerator, denominator) + except ZeroDivisionError: + self = super(Ratio, cls).__new__(cls) + self._numerator = numerator + self._denominator = denominator + return self + __new__.doc = Fraction.__new__.__doc__ + + def __repr__(self): + return str(self) + + @property + def num(self): + return self.numerator + + @property + def den(self): + return self.denominator + + def decimal(self): + return float(self) diff --git a/movie_spy/jpgSorter.py b/movie_spy/jpgSorter.py new file mode 100644 index 0000000..679dd05 --- /dev/null +++ b/movie_spy/jpgSorter.py @@ -0,0 +1,126 @@ +import os.path +import ntpath +from time import localtime, strftime, strptime, mktime +import shutil +import exifread + +unknownDateFolderName = "date-unknown" + +def getMinimumCreationTime(exif_data): + creationTime = None + dateTime = exif_data.get('DateTime') + dateTimeOriginal = exif_data.get('EXIF DateTimeOriginal') + dateTimeDigitized = exif_data.get('EXIF DateTimeDigitized') + + # 3 differnt time fields that can be set independently result in 9 if-cases + if (dateTime is None): + if (dateTimeOriginal is None): + # case 1/9: dateTime, dateTimeOriginal, and dateTimeDigitized = None + # case 2/9: dateTime and dateTimeOriginal = None, then use dateTimeDigitized + creationTime = dateTimeDigitized + else: + # case 3/9: dateTime and dateTimeDigitized = None, then use dateTimeOriginal + # case 4/9: dateTime = None, prefere dateTimeOriginal over dateTimeDigitized + creationTime = dateTimeOriginal + else: + # case 5-9: when creationTime is set, prefere it over the others + creationTime = dateTime + + return creationTime + +def postprocessImage(images, imageDirectory, fileName): + imagePath = os.path.join(imageDirectory, fileName) + image = open(imagePath, 'rb') + creationTime = None + try: + exifTags = exifread.process_file(image, details=False) + 
creationTime = getMinimumCreationTime(exifTags) + except: + print("invalid exif tags for " + fileName) + + # distinct different time types + if creationTime is None: + creationTime = localtime(os.path.getctime(imagePath)) + else: + try: + creationTime = strptime(str(creationTime), "%Y:%m:%d %H:%M:%S") + except: + creationTime = localtime(os.path.getctime(imagePath)) + + images.append((mktime(creationTime), imagePath)) + image.close() + +# Creates the requested path recursively. +def createPath(newPath): + if not os.path.exists(newPath): + os.makedirs(newPath) + +# Pass None for month to create 'year/eventNumber' directories instead of 'year/month/eventNumber'. +def createNewFolder(destinationRoot, year, month, eventNumber): + if month is not None: + newPath = os.path.join(destinationRoot, year, month, str(eventNumber)) + else: + newPath = os.path.join(destinationRoot, year, str(eventNumber)) + + createPath(newPath) + +def createUnknownDateFolder(destinationRoot): + path = os.path.join(destinationRoot, unknownDateFolderName) + createPath(path) + +def writeImages(images, destinationRoot, minEventDeltaDays, splitByMonth=False): + minEventDelta = minEventDeltaDays * 60 * 60 * 24 # convert in seconds + sortedImages = sorted(images) + previousTime = None + eventNumber = 0 + previousDestination = None + today = strftime("%d/%m/%Y") + + for imageTuple in sortedImages: + destination = "" + destinationFilePath = "" + t = localtime(imageTuple[0]) + year = strftime("%Y", t) + month = splitByMonth and strftime("%m", t) or None + creationDate = strftime("%d/%m/%Y", t) + fileName = ntpath.basename(imageTuple[1]) + + if(creationDate == today): + createUnknownDateFolder(destinationRoot) + destination = os.path.join(destinationRoot, unknownDateFolderName) + destinationFilePath = os.path.join(destination, fileName) + + else: + if (previousTime == None) or ((previousTime + minEventDelta) < imageTuple[0]): + eventNumber = eventNumber + 1 + createNewFolder(destinationRoot, year, month, 
eventNumber) + + previousTime = imageTuple[0] + + destComponents = [destinationRoot, year, month, str(eventNumber)] + destComponents = [v for v in destComponents if v is not None] + destination = os.path.join(*destComponents) + + # it may be possible that an event covers 2 years. + # in such a case put all the images to the event in the old year + if not (os.path.exists(destination)): + destination = previousDestination + # destination = os.path.join(destinationRoot, str(int(year) - 1), str(eventNumber)) + + previousDestination = destination + destinationFilePath = os.path.join(destination, fileName) + + if not (os.path.exists(destinationFilePath)): + shutil.move(imageTuple[1], destination) + else: + if (os.path.exists(imageTuple[1])): + os.remove(imageTuple[1]) + + +def postprocessImages(imageDirectory, minEventDeltaDays, splitByMonth): + images = [] + for root, dirs, files in os.walk(imageDirectory): + for file in files: + postprocessImage(images, imageDirectory, file) + + writeImages(images, imageDirectory, minEventDeltaDays, splitByMonth) diff --git a/movie_spy/moviespy.py b/movie_spy/moviespy.py new file mode 100644 index 0000000..0edb79b --- /dev/null +++ b/movie_spy/moviespy.py @@ -0,0 +1,102 @@ +#!/usr/bin/python3 + + +""" +Movie Spy Is a programm to quickly collect all personal information +from a computer. Ideally run from a live USB stick. The name comes +from the fact that in movies spies can just quickly insert a USB key +and get all the important data. + +Usage: + moviespy + moviespy (-h | --help) + moviespy --version + +Options: + -h --help Show this screen. + --version Show version. 
+ +""" + +import os +import fnmatch +from shutil import copy2 + +from docopt import docopt +import jpgSorter +import numberOfFilesPerFolderLimiter as max_files_per_folder + + +file_types = { + 'movies': ['mp4', 'mkv'], + 'documents': ['docx', 'xlsx', 'txt', 'doc', 'xls', 'pdf', 'odt', 'ods'], + 'pictures': ['jpg', 'png', 'gif'], + 'keys': ['key', 'kdbx', 'kdb', 'gpg'] +} +maxNumberOfFilesPerFolder = 500 +splitMonths = True +minEventDeltaDays = 4 + + +def find(pattern, path): + result = {} + for root, dirs, files in os.walk(path): + for name in files: + if fnmatch.fnmatch(name, pattern): + result[os.path.join(root, name)] = name + return result + + +def find_extensions(extensions, path): + path_dictionary = {} + + for extension in extensions: + extension_wildcard = "*." + extension + files = find(extension_wildcard, path) + path_dictionary[extension] = files + return path_dictionary + + +def copy_files(extensions, search_path, dest): + path_dictionary = find_extensions(extensions, search_path) + + for file_type, files_by_type in path_dictionary.items(): + if files_by_type: + path = os.path.join(dest, file_type) + if not os.path.exists(path): + os.makedirs(path) + + for file in files_by_type.items(): + if not os.path.islink(file): + try: + copy2(file, path) + except Exception as e: + print(e) + continue + + +def sort_jpgs(location): + jpgSorter.postprocessImages(location, + minEventDeltaDays, + splitMonths) + + +if __name__ == '__main__': + arguments = docopt(__doc__, version='Movie Spy v1.0') + source = arguments[''] + destination = arguments[''] + + for file_type, extensions in file_types.items(): + type_destination = os.path.join(arguments[''], file_type) + if not os.path.exists(type_destination): + os.makedirs(type_destination) + copy_files(extensions, source, type_destination) + if 'jpg' in extensions: + sort_jpgs(os.path.join(destination, type_destination, "jpg")) + + max_files_per_folder.limitFilesPerFolder(destination, + maxNumberOfFilesPerFolder) + + + + 
diff --git a/movie_spy/numberOfFilesPerFolderLimiter.py b/movie_spy/numberOfFilesPerFolderLimiter.py
new file mode 100644
index 0000000..fc60d2a
--- /dev/null
+++ b/movie_spy/numberOfFilesPerFolderLimiter.py
@@ -0,0 +1,27 @@
+import sys
+import math
+import os
+import shutil
+
+
+def limitFilesPerFolder(folder, maxNumberOfFilesPerFolder):
+    """Split any directory below *folder* holding more than
+    maxNumberOfFilesPerFolder files into numbered subfolders 1..N."""
+    # topdown=False: visit leaves first, so freshly created numbered
+    # subfolders are not themselves re-walked.
+    for root, dirs, files in os.walk(folder, topdown=False):
+        for dir in dirs:
+            dirPath = os.path.join(root, dir)
+            filesInFolder = len(os.listdir(dirPath))
+            if(filesInFolder > maxNumberOfFilesPerFolder):
+                # ceil(filesInFolder / max) numbered subfolders
+                numberOfSubfolders = ((filesInFolder - 1) // maxNumberOfFilesPerFolder) + 1
+                for subFolderNumber in range(1, numberOfSubfolders+1):
+                    subFolderPath = os.path.join(dirPath, str(subFolderNumber))
+                    if not os.path.exists(subFolderPath):
+                        os.mkdir(subFolderPath)
+                fileCounter = 1
+                for file in os.listdir(dirPath):
+                    source = os.path.join(dirPath, file)
+                    # only plain files are distributed; the numbered
+                    # subfolders themselves are skipped
+                    if os.path.isfile(source):
+                        destDir = str(((fileCounter - 1) // maxNumberOfFilesPerFolder) + 1)
+                        destination = os.path.join(dirPath, destDir, file)
+                        shutil.move(source, destination)
+                        fileCounter += 1
+
+
diff --git a/movie_spy/recovery.py b/movie_spy/recovery.py
new file mode 100644
index 0000000..e4f1afc
--- /dev/null
+++ b/movie_spy/recovery.py
@@ -0,0 +1,126 @@
+#!/usr/bin/env python
+import os
+import os.path
+from time import localtime, strftime
+import shutil
+import jpgSorter
+import numberOfFilesPerFolderLimiter
+
+
+def getNumberOfFilesInFolderRecursively(start_path='.'):
+    """Count regular files below *start_path*, recursively."""
+    numberOfFiles = 0
+    for dirpath, dirnames, filenames in os.walk(start_path):
+        for f in filenames:
+            fp = os.path.join(dirpath, f)
+            if(os.path.isfile(fp)):
+                numberOfFiles += 1
+    return numberOfFiles
+
+
+def getNumberOfFilesInFolder(path):
+    """Count directory entries (files and folders) directly in *path*."""
+    return len(os.listdir(path))
+
+
+def log(logString):
+    """Print *logString* prefixed with the current HH:MM:SS time."""
+    print(strftime("%H:%M:%S", localtime()) + ": " + logString)
+
+
+def moveFile(file, destination):
+    # NOTE(review): depends on the module-level globals 'root' and
+    # 'fileCounter' that are only bound inside the main loop below; the main
+    # loop re-implements this logic inline, so this function looks vestigial
+    # -- confirm before relying on it.
+    extension = os.path.splitext(file)[1][1:].upper()
+    sourcePath = os.path.join(root, file)
+
+    destinationDirectory = os.path.join(destination, extension)
+
+    if not os.path.exists(destinationDirectory):
+        os.mkdir(destinationDirectory)
+
+    fileName = str(fileCounter) + "." + extension.lower()
+    destinationFile = os.path.join(destinationDirectory, fileName)
+    if not os.path.exists(destinationFile):
+        shutil.copy(sourcePath, destinationFile)
+
+
+def get_args():
+    """Parse and return the command-line arguments for the recovery script."""
+    import argparse
+
+    description = (
+        "Sort files recoverd by Photorec.\n"
+        "The input files are first copied to the destination, sorted by file type.\n"
+        "Then JPG files are sorted based on creation year (and optionally month).\n"
+        "Finally any directories containing more than a maximum number of files are accordingly split into separate directories."
+    )
+
+    parser = argparse.ArgumentParser(description=description, formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    parser.add_argument('source', metavar='src', type=str, help='source directory with files recovered by Photorec')
+    parser.add_argument('destination', metavar='dest', type=str, help='destination directory to write sorted files to')
+    parser.add_argument('-n', '--max-per-dir', type=int, default=500, required=False, help='maximum number of files per directory')
+    parser.add_argument('-m', '--split-months', action='store_true', required=False, help='split JPEG files not only by year but by month as well')
+    parser.add_argument('-k', '--keep_filename', action='store_true', required=False, help='keeps the original filenames when copying')
+    # NOTE(review): help text "between two days" presumably means
+    # "between two events" -- confirm before changing the string.
+    parser.add_argument('-d', '--min-event-delta', type=int, default=4, required=False, help='minimum delta in days between two days')
+
+    return parser.parse_args()
+
+
+
+# defaults, overwritten from the parsed arguments below
+maxNumberOfFilesPerFolder = 500
+splitMonths = False
+source = None
+destination = None
+keepFilename = False
+
+
+args = get_args()
+source = args.source
+destination = args.destination
+maxNumberOfFilesPerFolder = args.max_per_dir
+splitMonths = args.split_months
+keepFilename = args.keep_filename
+minEventDeltaDays = 
args.min_event_delta

print("Reading from source '%s', writing to destination '%s' (max %i files per directory, splitting by year %s)." %
      (source, destination, maxNumberOfFilesPerFolder, splitMonths and "and month" or "only"))
if keepFilename:
    # FIX: typo in user message ("keep you filenames")
    print("I will keep your filenames as they are")
else:
    print("I will rename your files like '1.jpg'")

# argparse makes both positionals mandatory; these loops only re-prompt when
# the given path does not exist.
while ((source is None) or (not os.path.exists(source))):
    source = input('Enter a valid source directory\n')
while ((destination is None) or (not os.path.exists(destination))):
    destination = input('Enter a valid destination directory\n')

fileNumber = getNumberOfFilesInFolderRecursively(source)
# FIX: onePercentFiles is used as a modulus below; with fewer than 100 files
# it is 0 and the original raised ZeroDivisionError on the first file.
onePercentFiles = int(fileNumber / 100)
totalAmountToCopy = str(fileNumber)
print("Files to copy: " + totalAmountToCopy)


# Copy every recovered file into destination/<EXTENSION>/, renaming to a
# running counter unless --keep_filename was given.
fileCounter = 0
for root, dirs, files in os.walk(source, topdown=False):

    for file in files:
        extension = os.path.splitext(file)[1][1:].upper()
        sourcePath = os.path.join(root, file)

        destinationDirectory = os.path.join(destination, extension)

        if not os.path.exists(destinationDirectory):
            os.mkdir(destinationDirectory)
        if keepFilename:
            fileName = file
        else:
            fileName = str(fileCounter) + "." + extension.lower()

        destinationFile = os.path.join(destinationDirectory, fileName)
        if not os.path.exists(destinationFile):
            shutil.copy2(sourcePath, destinationFile)

        fileCounter += 1
        # FIX: the original tested '(fileCounter % onePercentFiles) is 0' --
        # an identity comparison against a literal that only works by the
        # CPython small-int cache; use == and guard against a zero modulus.
        if onePercentFiles and (fileCounter % onePercentFiles) == 0:
            log(str(fileCounter) + " / " + totalAmountToCopy + " processed.")

log("start special file treatment")
jpgSorter.postprocessImages(os.path.join(destination, "JPG"), minEventDeltaDays, splitMonths)

log("assure max file per folder number")
numberOfFilesPerFolderLimiter.limitFilesPerFolder(destination, maxNumberOfFilesPerFolder)