diff --git a/ineptpdf.pyw b/ineptpdf.pyw new file mode 100644 index 0000000..1599ad7 --- /dev/null +++ b/ineptpdf.pyw @@ -0,0 +1,1569 @@ +#! /usr/bin/python + +# ineptpdf.pyw, version 1 + +# To run this program install Python 2.6 from http://www.python.org/download/ +# and PyCrypto from http://www.voidspace.org.uk/python/modules.shtml#pycrypto +# (make sure to install the version for Python 2.6). Save this script file as +# ineptpdf.pyw and double-click on it to run it. + +# Revision history: +# 1 - Initial release + +""" +Decrypt Adobe ADEPT-encrypted PDF files. +""" + +from __future__ import with_statement + +__license__ = 'GPL v3' + +import sys +import os +import re +import zlib +import struct +import hashlib +from itertools import chain +import xml.etree.ElementTree as etree +import Tkinter +import Tkconstants +import tkFileDialog +import tkMessageBox + +try: + from Crypto.Cipher import ARC4 + from Crypto.PublicKey import RSA +except ImportError: + ARC4 = None + RSA = None + + +class ADEPTError(Exception): + pass + + +### +### ASN.1 parsing code from tlslite + +def bytesToNumber(bytes): + total = 0L + for byte in bytes: + total = (total << 8) + byte + return total + +class ASN1Error(Exception): + pass + +class ASN1Parser(object): + class Parser(object): + def __init__(self, bytes): + self.bytes = bytes + self.index = 0 + + def get(self, length): + if self.index + length > len(self.bytes): + raise ASN1Error("Error decoding ASN.1") + x = 0 + for count in range(length): + x <<= 8 + x |= self.bytes[self.index] + self.index += 1 + return x + + def getFixBytes(self, lengthBytes): + bytes = self.bytes[self.index : self.index+lengthBytes] + self.index += lengthBytes + return bytes + + def getVarBytes(self, lengthLength): + lengthBytes = self.get(lengthLength) + return self.getFixBytes(lengthBytes) + + def getFixList(self, length, lengthList): + l = [0] * lengthList + for x in range(lengthList): + l[x] = self.get(length) + return l + + def getVarList(self, length, lengthLength): + lengthList = self.get(lengthLength) + if lengthList % length != 0: + raise ASN1Error("Error decoding ASN.1") + lengthList = int(lengthList/length) + l = [0] * lengthList + for x in range(lengthList): + l[x] = self.get(length) + return l + + def startLengthCheck(self, lengthLength): + self.lengthCheck = self.get(lengthLength) + self.indexCheck = self.index + + def setLengthCheck(self, length): + self.lengthCheck = length + self.indexCheck = self.index + + def stopLengthCheck(self): + if (self.index - self.indexCheck) != self.lengthCheck: + raise ASN1Error("Error decoding ASN.1") + + def atLengthCheck(self): + if (self.index - self.indexCheck) < self.lengthCheck: + return False + elif (self.index - self.indexCheck) == self.lengthCheck: + return True + else: + raise ASN1Error("Error decoding ASN.1") + + def __init__(self, bytes): + p = self.Parser(bytes) + p.get(1) + self.length = self._getASN1Length(p) + self.value = p.getFixBytes(self.length) + + def getChild(self, which): + p = self.Parser(self.value) + for x in range(which+1): + markIndex = p.index + p.get(1) + length = self._getASN1Length(p) + p.getFixBytes(length) + return ASN1Parser(p.bytes[markIndex:p.index]) + + def _getASN1Length(self, p): + firstLength = p.get(1) + if firstLength<=127: + return firstLength + else: + lengthLength = firstLength & 0x7F + return p.get(lengthLength) + +### +### PDF parsing routines from pdfminer, with changes for EBX_HANDLER + +## Utilities +## +def choplist(n, seq): + '''Groups every n elements of the list.''' + r = [] + for x in seq: + r.append(x) + if len(r) == n: + yield tuple(r) + r = [] + return + +def nunpack(s, default=0): + '''Unpacks up to 4 bytes big endian.''' + l = len(s) + if not l: + return default + elif l == 1: + return ord(s) + elif l == 2: + return unpack('>H', s)[0] + elif l == 3: + return unpack('>L', '\x00'+s)[0] + elif l == 4: + return unpack('>L', s)[0] + else: + return TypeError('invalid length: %d' % l) + + +STRICT = 0 + + +## PS Exceptions +## +class PSException(Exception): pass +class PSEOF(PSException): pass +class PSSyntaxError(PSException): pass +class PSTypeError(PSException): pass +class PSValueError(PSException): pass + + +## Basic PostScript Types +## + +# PSLiteral +class PSObject(object): pass + +class PSLiteral(PSObject): + ''' + PS literals (e.g. "/Name"). + Caution: Never create these objects directly. + Use PSLiteralTable.intern() instead. + ''' + def __init__(self, name): + self.name = name + return + + def __repr__(self): + name = [] + for char in self.name: + if not char.isalnum(): + char = '#%02x' % ord(char) + name.append(char) + return '/%s' % ''.join(name) + +# PSKeyword +class PSKeyword(PSObject): + ''' + PS keywords (e.g. "showpage"). + Caution: Never create these objects directly. + Use PSKeywordTable.intern() instead. + ''' + def __init__(self, name): + self.name = name + return + + def __repr__(self): + return self.name + +# PSSymbolTable +class PSSymbolTable(object): + + ''' + Symbol table that stores PSLiteral or PSKeyword. + ''' + + def __init__(self, classe): + self.dic = {} + self.classe = classe + return + + def intern(self, name): + if name in self.dic: + lit = self.dic[name] + else: + lit = self.classe(name) + self.dic[name] = lit + return lit + +PSLiteralTable = PSSymbolTable(PSLiteral) +PSKeywordTable = PSSymbolTable(PSKeyword) +LIT = PSLiteralTable.intern +KWD = PSKeywordTable.intern +KEYWORD_BRACE_BEGIN = KWD('{') +KEYWORD_BRACE_END = KWD('}') +KEYWORD_ARRAY_BEGIN = KWD('[') +KEYWORD_ARRAY_END = KWD(']') +KEYWORD_DICT_BEGIN = KWD('<<') +KEYWORD_DICT_END = KWD('>>') + + +def literal_name(x): + if not isinstance(x, PSLiteral): + if STRICT: + raise PSTypeError('Literal required: %r' % x) + else: + return str(x) + return x.name + +def keyword_name(x): + if not isinstance(x, PSKeyword): + if STRICT: + raise PSTypeError('Keyword required: %r' % x) + else: + return str(x) + return x.name + + +## PSBaseParser +## +EOL = re.compile(r'[\r\n]') +SPC = re.compile(r'\s') +NONSPC = re.compile(r'\S') +HEX = re.compile(r'[0-9a-fA-F]') +END_LITERAL = re.compile(r'[#/%\[\]()<>{}\s]') +END_HEX_STRING = re.compile(r'[^\s0-9a-fA-F]') +HEX_PAIR = re.compile(r'[0-9a-fA-F]{2}|.') +END_NUMBER = re.compile(r'[^0-9]') +END_KEYWORD = re.compile(r'[#/%\[\]()<>{}\s]') +END_STRING = re.compile(r'[()\134]') +OCT_STRING = re.compile(r'[0-7]') +ESC_STRING = { 'b':8, 't':9, 'n':10, 'f':12, 'r':13, '(':40, ')':41, '\\':92 } +class PSBaseParser(object): + + ''' + Most basic PostScript parser that performs only basic tokenization. + ''' + BUFSIZ = 4096 + + def __init__(self, fp): + self.fp = fp + self.seek(0) + return + + def __repr__(self): + return '' % (self.fp, self.bufpos) + + def flush(self): + return + + def close(self): + self.flush() + return + + def tell(self): + return self.bufpos+self.charpos + + def poll(self, pos=None, n=80): + pos0 = self.fp.tell() + if not pos: + pos = self.bufpos+self.charpos + self.fp.seek(pos) + print >>sys.stderr, 'poll(%d): %r' % (pos, self.fp.read(n)) + self.fp.seek(pos0) + return + + def seek(self, pos): + ''' + Seeks the parser to the given position. + ''' + self.fp.seek(pos) + # reset the status for nextline() + self.bufpos = pos + self.buf = '' + self.charpos = 0 + # reset the status for nexttoken() + self.parse1 = self.parse_main + self.tokens = [] + return + + def fillbuf(self): + if self.charpos < len(self.buf): return + # fetch next chunk. + self.bufpos = self.fp.tell() + self.buf = self.fp.read(self.BUFSIZ) + if not self.buf: + raise PSEOF('Unexpected EOF') + self.charpos = 0 + return + + def parse_main(self, s, i): + m = NONSPC.search(s, i) + if not m: + return (self.parse_main, len(s)) + j = m.start(0) + c = s[j] + self.tokenstart = self.bufpos+j + if c == '%': + self.token = '%' + return (self.parse_comment, j+1) + if c == '/': + self.token = '' + return (self.parse_literal, j+1) + if c in '-+' or c.isdigit(): + self.token = c + return (self.parse_number, j+1) + if c == '.': + self.token = c + return (self.parse_float, j+1) + if c.isalpha(): + self.token = c + return (self.parse_keyword, j+1) + if c == '(': + self.token = '' + self.paren = 1 + return (self.parse_string, j+1) + if c == '<': + self.token = '' + return (self.parse_wopen, j+1) + if c == '>': + self.token = '' + return (self.parse_wclose, j+1) + self.add_token(KWD(c)) + return (self.parse_main, j+1) + + def add_token(self, obj): + self.tokens.append((self.tokenstart, obj)) + return + + def parse_comment(self, s, i): + m = EOL.search(s, i) + if not m: + self.token += s[i:] + return (self.parse_comment, len(s)) + j = m.start(0) + self.token += s[i:j] + # We ignore comments. + #self.tokens.append(self.token) + return (self.parse_main, j) + + def parse_literal(self, s, i): + m = END_LITERAL.search(s, i) + if not m: + self.token += s[i:] + return (self.parse_literal, len(s)) + j = m.start(0) + self.token += s[i:j] + c = s[j] + if c == '#': + self.hex = '' + return (self.parse_literal_hex, j+1) + self.add_token(LIT(self.token)) + return (self.parse_main, j) + + def parse_literal_hex(self, s, i): + c = s[i] + if HEX.match(c) and len(self.hex) < 2: + self.hex += c + return (self.parse_literal_hex, i+1) + if self.hex: + self.token += chr(int(self.hex, 16)) + return (self.parse_literal, i) + + def parse_number(self, s, i): + m = END_NUMBER.search(s, i) + if not m: + self.token += s[i:] + return (self.parse_number, len(s)) + j = m.start(0) + self.token += s[i:j] + c = s[j] + if c == '.': + self.token += c + return (self.parse_float, j+1) + try: + self.add_token(int(self.token)) + except ValueError: + pass + return (self.parse_main, j) + def parse_float(self, s, i): + m = END_NUMBER.search(s, i) + if not m: + self.token += s[i:] + return (self.parse_float, len(s)) + j = m.start(0) + self.token += s[i:j] + self.add_token(float(self.token)) + return (self.parse_main, j) + + def parse_keyword(self, s, i): + m = END_KEYWORD.search(s, i) + if not m: + self.token += s[i:] + return (self.parse_keyword, len(s)) + j = m.start(0) + self.token += s[i:j] + if self.token == 'true': + token = True + elif self.token == 'false': + token = False + else: + token = KWD(self.token) + self.add_token(token) + return (self.parse_main, j) + + def parse_string(self, s, i): + m = END_STRING.search(s, i) + if not m: + self.token += s[i:] + return (self.parse_string, len(s)) + j = m.start(0) + self.token += s[i:j] + c = s[j] + if c == '\\': + self.oct = '' + return (self.parse_string_1, j+1) + if c == '(': + self.paren += 1 + self.token += c + return (self.parse_string, j+1) + if c == ')': + self.paren -= 1 + if self.paren: + self.token += c + return (self.parse_string, j+1) + self.add_token(self.token) + return (self.parse_main, j+1) + def parse_string_1(self, s, i): + c = s[i] + if OCT_STRING.match(c) and len(self.oct) < 3: + self.oct += c + return (self.parse_string_1, i+1) + if self.oct: + self.token += chr(int(self.oct, 8)) + return (self.parse_string, i) + if c in ESC_STRING: + self.token += chr(ESC_STRING[c]) + return (self.parse_string, i+1) + + def parse_wopen(self, s, i): + c = s[i] + if c.isspace() or HEX.match(c): + return (self.parse_hexstring, i) + if c == '<': + self.add_token(KEYWORD_DICT_BEGIN) + i += 1 + return (self.parse_main, i) + + def parse_wclose(self, s, i): + c = s[i] + if c == '>': + self.add_token(KEYWORD_DICT_END) + i += 1 + return (self.parse_main, i) + + def parse_hexstring(self, s, i): + m = END_HEX_STRING.search(s, i) + if not m: + self.token += s[i:] + return (self.parse_hexstring, len(s)) + j = m.start(0) + self.token += s[i:j] + token = HEX_PAIR.sub(lambda m: chr(int(m.group(0), 16)), + SPC.sub('', self.token)) + self.add_token(token) + return (self.parse_main, j) + + def nexttoken(self): + while not self.tokens: + self.fillbuf() + (self.parse1, self.charpos) = self.parse1(self.buf, self.charpos) + token = self.tokens.pop(0) + return token + + def nextline(self): + ''' + Fetches a next line that ends either with \\r or \\n. + ''' + linebuf = '' + linepos = self.bufpos + self.charpos + eol = False + while 1: + self.fillbuf() + if eol: + c = self.buf[self.charpos] + # handle '\r\n' + if c == '\n': + linebuf += c + self.charpos += 1 + break + m = EOL.search(self.buf, self.charpos) + if m: + linebuf += self.buf[self.charpos:m.end(0)] + self.charpos = m.end(0) + if linebuf[-1] == '\r': + eol = True + else: + break + else: + linebuf += self.buf[self.charpos:] + self.charpos = len(self.buf) + return (linepos, linebuf) + + def revreadlines(self): + ''' + Fetches a next line backword. This is used to locate + the trailers at the end of a file. + ''' + self.fp.seek(0, 2) + pos = self.fp.tell() + buf = '' + while 0 < pos: + pos = max(0, pos-self.BUFSIZ) + self.fp.seek(pos) + s = self.fp.read(self.BUFSIZ) + if not s: break + while 1: + n = max(s.rfind('\r'), s.rfind('\n')) + if n == -1: + buf = s + buf + break + yield buf+s[n:] + s = s[:n] + buf = '' + return + + +## PSStackParser +## +class PSStackParser(PSBaseParser): + + def __init__(self, fp): + PSBaseParser.__init__(self, fp) + self.reset() + return + + def reset(self): + self.context = [] + self.curtype = None + self.curstack = [] + self.results = [] + return + + def seek(self, pos): + PSBaseParser.seek(self, pos) + self.reset() + return + + def push(self, *objs): + self.curstack.extend(objs) + return + def pop(self, n): + objs = self.curstack[-n:] + self.curstack[-n:] = [] + return objs + def popall(self): + objs = self.curstack + self.curstack = [] + return objs + def add_results(self, *objs): + self.results.extend(objs) + return + + def start_type(self, pos, type): + self.context.append((pos, self.curtype, self.curstack)) + (self.curtype, self.curstack) = (type, []) + return + def end_type(self, type): + if self.curtype != type: + raise PSTypeError('Type mismatch: %r != %r' % (self.curtype, type)) + objs = [ obj for (_,obj) in self.curstack ] + (pos, self.curtype, self.curstack) = self.context.pop() + return (pos, objs) + + def do_keyword(self, pos, token): + return + + def nextobject(self): + ''' + Yields a list of objects: keywords, literals, strings, + numbers, arrays and dictionaries. Arrays and dictionaries + are represented as Python sequence and dictionaries. + ''' + while not self.results: + (pos, token) = self.nexttoken() + #print (pos,token), (self.curtype, self.curstack) + if (isinstance(token, int) or + isinstance(token, float) or + isinstance(token, bool) or + isinstance(token, str) or + isinstance(token, PSLiteral)): + # normal token + self.push((pos, token)) + elif token == KEYWORD_ARRAY_BEGIN: + # begin array + self.start_type(pos, 'a') + elif token == KEYWORD_ARRAY_END: + # end array + try: + self.push(self.end_type('a')) + except PSTypeError: + if STRICT: raise + elif token == KEYWORD_DICT_BEGIN: + # begin dictionary + self.start_type(pos, 'd') + elif token == KEYWORD_DICT_END: + # end dictionary + try: + (pos, objs) = self.end_type('d') + if len(objs) % 2 != 0: + raise PSSyntaxError( + 'Invalid dictionary construct: %r' % objs) + d = dict((literal_name(k), v) \ + for (k,v) in choplist(2, objs)) + self.push((pos, d)) + except PSTypeError: + if STRICT: raise + else: + self.do_keyword(pos, token) + if self.context: + continue + else: + self.flush() + obj = self.results.pop(0) + return obj + + +## PDF Objects +## +class PDFObject(PSObject): pass + +class PDFException(PSException): pass +class PDFTypeError(PDFException): pass +class PDFValueError(PDFException): pass +class PDFNotImplementedError(PSException): pass + + +## PDFObjRef +## +class PDFObjRef(PDFObject): + + def __init__(self, doc, objid, _): + if objid == 0: + if STRICT: + raise PDFValueError('PDF object id cannot be 0.') + self.doc = doc + self.objid = objid + #self.genno = genno # Never used. + return + + def __repr__(self): + return '' % (self.objid) + + def resolve(self): + return self.doc.getobj(self.objid) + + +# resolve +def resolve1(x): + ''' + Resolve an object. If this is an array or dictionary, + it may still contains some indirect objects inside. + ''' + while isinstance(x, PDFObjRef): + x = x.resolve() + return x + +def resolve_all(x): + ''' + Recursively resolve X and all the internals. + Make sure there is no indirect reference within the nested object. + This procedure might be slow. + ''' + while isinstance(x, PDFObjRef): + x = x.resolve() + if isinstance(x, list): + x = [ resolve_all(v) for v in x ] + elif isinstance(x, dict): + for (k,v) in x.iteritems(): + x[k] = resolve_all(v) + return x + +def decipher_all(decipher, objid, genno, x): + ''' + Recursively decipher X. + ''' + if isinstance(x, str): + return decipher(objid, genno, x) + if isinstance(x, list): + x = [ decipher_all(decipher, objid, genno, v) for v in x ] + elif isinstance(x, dict): + for (k,v) in x.iteritems(): + x[k] = decipher_all(decipher, objid, genno, v) + return x + +# Type cheking +def int_value(x): + x = resolve1(x) + if not isinstance(x, int): + if STRICT: + raise PDFTypeError('Integer required: %r' % x) + return 0 + return x + +def float_value(x): + x = resolve1(x) + if not isinstance(x, float): + if STRICT: + raise PDFTypeError('Float required: %r' % x) + return 0.0 + return x + +def num_value(x): + x = resolve1(x) + if not (isinstance(x, int) or isinstance(x, float)): + if STRICT: + raise PDFTypeError('Int or Float required: %r' % x) + return 0 + return x + +def str_value(x): + x = resolve1(x) + if not isinstance(x, str): + if STRICT: + raise PDFTypeError('String required: %r' % x) + return '' + return x + +def list_value(x): + x = resolve1(x) + if not (isinstance(x, list) or isinstance(x, tuple)): + if STRICT: + raise PDFTypeError('List required: %r' % x) + return [] + return x + +def dict_value(x): + x = resolve1(x) + if not isinstance(x, dict): + if STRICT: + raise PDFTypeError('Dict required: %r' % x) + return {} + return x + +def stream_value(x): + x = resolve1(x) + if not isinstance(x, PDFStream): + if STRICT: + raise PDFTypeError('PDFStream required: %r' % x) + return PDFStream({}, '') + return x + + +## PDFStream type +## +class PDFStream(PDFObject): + + def __init__(self, dic, rawdata, decipher=None): + length = int_value(dic.get('Length', 0)) + eol = rawdata[length:] + if eol in ('\r', '\n', '\r\n'): + rawdata = rawdata[:length] + if length != len(rawdata): + print >>sys.stderr, "[warning] data length mismatch" + self.dic = dic + self.rawdata = rawdata + self.decipher = decipher + self.data = None + self.objid = None + self.genno = None + return + + def set_objid(self, objid, genno): + self.objid = objid + self.genno = genno + return + + def __repr__(self): + return '' % \ + (self.objid, len(self.rawdata), self.dic) + + def get_rawdata(self): + return self.rawdata + + def get_decdata(self): + data = self.rawdata + if self.decipher: + # Handle encryption + data = self.decipher(self.objid, self.genno, data) + return data + + +## PDF Exceptions +## +class PDFSyntaxError(PDFException): pass +class PDFNoValidXRef(PDFSyntaxError): pass +class PDFEncryptionError(PDFException): pass +class PDFPasswordIncorrect(PDFEncryptionError): pass + +# some predefined literals and keywords. +LITERAL_OBJSTM = PSLiteralTable.intern('ObjStm') +LITERAL_XREF = PSLiteralTable.intern('XRef') +LITERAL_PAGE = PSLiteralTable.intern('Page') +LITERAL_PAGES = PSLiteralTable.intern('Pages') +LITERAL_CATALOG = PSLiteralTable.intern('Catalog') + + +## XRefs +## + +## PDFXRef +## +class PDFXRef(object): + + def __init__(self): + self.offsets = None + return + + def __repr__(self): + return '' % len(self.offsets) + + def objids(self): + return self.offsets.iterkeys() + + def load(self, parser): + self.offsets = {} + while 1: + try: + (pos, line) = parser.nextline() + except PSEOF: + raise PDFNoValidXRef('Unexpected EOF - file corrupted?') + if not line: + raise PDFNoValidXRef('Premature eof: %r' % parser) + if line.startswith('trailer'): + parser.seek(pos) + break + f = line.strip().split(' ') + if len(f) != 2: + raise PDFNoValidXRef('Trailer not found: %r: line=%r' % (parser, line)) + try: + (start, nobjs) = map(long, f) + except ValueError: + raise PDFNoValidXRef('Invalid line: %r: line=%r' % (parser, line)) + for objid in xrange(start, start+nobjs): + try: + (_, line) = parser.nextline() + except PSEOF: + raise PDFNoValidXRef('Unexpected EOF - file corrupted?') + f = line.strip().split(' ') + if len(f) != 3: + raise PDFNoValidXRef('Invalid XRef format: %r, line=%r' % (parser, line)) + (pos, genno, use) = f + if use != 'n': continue + self.offsets[objid] = (int(genno), long(pos)) + self.load_trailer(parser) + return + + KEYWORD_TRAILER = PSKeywordTable.intern('trailer') + def load_trailer(self, parser): + try: + (_,kwd) = parser.nexttoken() + assert kwd is self.KEYWORD_TRAILER + (_,dic) = parser.nextobject() + except PSEOF: + x = parser.pop(1) + if not x: + raise PDFNoValidXRef('Unexpected EOF - file corrupted') + (_,dic) = x[0] + self.trailer = dict_value(dic) + return + + def getpos(self, objid): + try: + (genno, pos) = self.offsets[objid] + except KeyError: + raise + return (None, pos) + + +## PDFDocument +## +## A PDFDocument object represents a PDF document. +## Since a PDF file is usually pretty big, normally it is not loaded +## at once. Rather it is parsed dynamically as processing goes. +## A PDF parser is associated with the document. +## +class PDFDocument(object): + + def __init__(self): + self.xrefs = [] + self.objs = {} + self.parsed_objs = {} + self.root = None + self.catalog = None + self.parser = None + self.encryption = None + self.decipher = None + self.ready = False + return + + # set_parser(parser) + # Associates the document with an (already initialized) parser object. + def set_parser(self, parser): + if self.parser: return + self.parser = parser + # The document is set to be temporarily ready during collecting + # all the basic information about the document, e.g. + # the header, the encryption information, and the access rights + # for the document. + self.ready = True + # Retrieve the information of each header that was appended + # (maybe multiple times) at the end of the document. + self.xrefs = parser.read_xref() + for xref in self.xrefs: + trailer = xref.trailer + if not trailer: continue + # If there's an encryption info, remember it. + if 'Encrypt' in trailer: + #assert not self.encryption + self.encryption = (list_value(trailer['ID']), + dict_value(trailer['Encrypt'])) + if 'Root' in trailer: + self.set_root(dict_value(trailer['Root'])) + break + else: + raise PDFSyntaxError('No /Root object! - Is this really a PDF?') + # The document is set to be non-ready again, until all the + # proper initialization (asking the password key and + # verifying the access permission, so on) is finished. + self.ready = False + return + + # set_root(root) + # Set the Root dictionary of the document. + # Each PDF file must have exactly one /Root dictionary. + def set_root(self, root): + self.root = root + self.catalog = dict_value(self.root) + if self.catalog.get('Type') is not LITERAL_CATALOG: + if STRICT: + raise PDFSyntaxError('Catalog not found!') + return + + # initialize(password='') + # Perform the initialization with a given password. + # This step is mandatory even if there's no password associated + # with the document. + def initialize(self, password=''): + if not self.encryption: + self.is_printable = self.is_modifiable = self.is_extractable = True + self.ready = True + return + (docid, param) = self.encryption + type = literal_name(param['Filter']) + if type == 'Standard': + return self.initialize_standard(password, docid, param) + if type == 'EBX_HANDLER': + return self.initialize_ebx(password, docid, param) + raise PDFEncryptionError('Unknown filter: param=%r' % param) + + def initialize_ebx(self, password, docid, param): + V = int_value(param.get('V', 0)) + self.is_printable = self.is_modifiable = self.is_extractable = True + with open(password, 'rb') as f: + keyder = f.read() + key = ASN1Parser([ord(x) for x in keyder]) + key = [bytesToNumber(key.getChild(x).value) for x in xrange(1, 4)] + rsa = RSA.construct(key) + length = int_value(param.get('Length')) / 8 + rights = str_value(param.get('ADEPT_LICENSE')).decode('base64') + rights = zlib.decompress(rights, -15) + rights = etree.fromstring(rights) + expr = './/{http://ns.adobe.com/adept}encryptedKey' + bookkey = ''.join(rights.findtext(expr)).decode('base64') + bookkey = rsa.decrypt(bookkey) + if bookkey[0] != '\x02': + raise ADEPTError('error decrypting book session key') + index = bookkey.index('\0') + 1 + if V == 3: + if bookkey[index] != '\x03': + raise ADEPTError('error decrypting book session key') + index += 1 + bookkey = bookkey[index:] + if len(bookkey) != length: + raise ADEPTError('error decrypting book session key') + self.decrypt_key = bookkey[-length:] + self.genkey = self.genkey_v3 if V == 3 else self.genkey_v2 + self.decipher = self.decrypt_rc4 + self.ready = True + return + + PASSWORD_PADDING = '(\xbfN^Nu\x8aAd\x00NV\xff\xfa\x01\x08..' \ + '\x00\xb6\xd0h>\x80/\x0c\xa9\xfedSiz' + def initialize_standard(self, password, docid, param): + V = int_value(param.get('V', 0)) + if not (V == 1 or V == 2): + raise PDFEncryptionError('Unknown algorithm: param=%r' % param) + length = int_value(param.get('Length', 40)) # Key length (bits) + O = str_value(param['O']) + R = int_value(param['R']) # Revision + if 5 <= R: + raise PDFEncryptionError('Unknown revision: %r' % R) + U = str_value(param['U']) + P = int_value(param['P']) + self.is_printable = bool(P & 4) + self.is_modifiable = bool(P & 8) + self.is_extractable = bool(P & 16) + # Algorithm 3.2 + password = (password+self.PASSWORD_PADDING)[:32] # 1 + hash = hashlib.md5(password) # 2 + hash.update(O) # 3 + hash.update(struct.pack('' + + KEYWORD_R = PSKeywordTable.intern('R') + KEYWORD_ENDOBJ = PSKeywordTable.intern('endobj') + KEYWORD_STREAM = PSKeywordTable.intern('stream') + KEYWORD_XREF = PSKeywordTable.intern('xref') + KEYWORD_STARTXREF = PSKeywordTable.intern('startxref') + def do_keyword(self, pos, token): + if token in (self.KEYWORD_XREF, self.KEYWORD_STARTXREF): + self.add_results(*self.pop(1)) + return + if token is self.KEYWORD_ENDOBJ: + self.add_results(*self.pop(4)) + return + + if token is self.KEYWORD_R: + # reference to indirect object + try: + ((_,objid), (_,genno)) = self.pop(2) + (objid, genno) = (int(objid), int(genno)) + obj = PDFObjRef(self.doc, objid, genno) + self.push((pos, obj)) + except PSSyntaxError: + pass + return + + if token is self.KEYWORD_STREAM: + # stream object + ((_,dic),) = self.pop(1) + dic = dict_value(dic) + try: + objlen = int_value(dic['Length']) + except KeyError: + if STRICT: + raise PDFSyntaxError('/Length is undefined: %r' % dic) + objlen = 0 + self.seek(pos) + try: + (_, line) = self.nextline() # 'stream' + except PSEOF: + if STRICT: + raise PDFSyntaxError('Unexpected EOF') + return + pos += len(line) + self.fp.seek(pos) + data = self.fp.read(objlen) + self.seek(pos+objlen) + while 1: + try: + (linepos, line) = self.nextline() + except PSEOF: + if STRICT: + raise PDFSyntaxError('Unexpected EOF') + break + if 'endstream' in line: + i = line.index('endstream') + objlen += i + data += line[:i] + break + objlen += len(line) + data += line + self.seek(pos+objlen) + obj = PDFStream(dic, data, self.doc.decipher) + self.push((pos, obj)) + return + + # others + self.push((pos, token)) + return + + def find_xref(self): + # search the last xref table by scanning the file backwards. + prev = None + for line in self.revreadlines(): + line = line.strip() + if line == 'startxref': break + if line: + prev = line + else: + raise PDFNoValidXRef('Unexpected EOF') + return long(prev) + + # read xref table + def read_xref_from(self, start, xrefs): + self.seek(start) + self.reset() + try: + (pos, token) = self.nexttoken() + except PSEOF: + raise PDFNoValidXRef('Unexpected EOF') + if token is not self.KEYWORD_XREF: + raise PDFNoValidXRef('xref not found: pos=%d, token=%r' % + (pos, token)) + self.nextline() + xref = PDFXRef() + xref.load(self) + xrefs.append(xref) + trailer = xref.trailer + if 'Prev' in trailer: + # find previous xref + pos = int_value(trailer['Prev']) + self.read_xref_from(pos, xrefs) + return + + # read xref tables and trailers + def read_xref(self): + xrefs = [] + try: + pos = self.find_xref() + self.read_xref_from(pos, xrefs) + except PDFNoValidXRef: + # fallback + self.seek(0) + pat = re.compile(r'^(\d+)\s+(\d+)\s+obj\b') + offsets = {} + xref = PDFXRef() + while 1: + try: + (pos, line) = self.nextline() + except PSEOF: + break + if line.startswith('trailer'): break + m = pat.match(line) + if not m: continue + (objid, genno) = m.groups() + offsets[int(objid)] = (0, pos) + if not offsets: raise + xref.offsets = offsets + self.seek(pos) + xref.load_trailer(self) + xrefs.append(xref) + return xrefs + + +### +### My own code, for which there is none else to blame + +class PDFSerializer(object): + def __init__(self, inf, keypath): + self.version = inf.read(8) + inf.seek(0) + self.doc = doc = PDFDocument() + parser = PDFParser(doc, inf) + doc.initialize(keypath) + self.objids = objids = set() + for xref in doc.xrefs: + trailer = xref.trailer + for objid in xref.objids(): + objids.add(objid) + if 'Encrypt' in trailer: + objids.remove(trailer.pop('Encrypt').objid) + self.trailer = trailer + + def dump(self, outf): + self.outf = outf + self.write(self.version) + self.write('\n%\xe2\xe3\xcf\xd3\n') + doc = self.doc + objids = self.objids + xrefs = {} + maxobj = max(objids) + for objid in objids: + xrefs[objid] = self.tell() + self.serialize_indirect(objid, doc.getobj(objid)) + startxref = self.tell() + self.write('xref\n') + self.write('0 %d\n' % (maxobj + 1,)) + for objid in xrange(0, maxobj + 1): + if objid in objids: + self.write("%010d %05d n \n" % (xrefs[objid], 0)) + else: + self.write("%010d %05d f \n" % (0, 65535)) + self.write('trailer\n') + self.serialize_object(self.trailer) + self.write('\nstartxref\n%d\n%%%%EOF' % startxref) + + def write(self, *data): + for datum in data: + self.outf.write(datum) + self.last = data[-1][-1:] + + def tell(self): + return self.outf.tell() + + def escape_string(self, string): + string = string.replace('\\', '\\\\') + string = string.replace('\n', r'\n') + string = string.replace('(', r'\(') + string = string.replace(')', r'\)') + return string + + def serialize_object(self, obj): + if isinstance(obj, dict): + self.write('<<') + for key, val in obj.items(): + self.write('/%s' % key) + self.serialize_object(val) + self.write('>>') + elif isinstance(obj, list): + self.write('[') + for val in obj: + self.serialize_object(val) + self.write(']') + elif isinstance(obj, str): + self.write('(%s)' % self.escape_string(obj)) + elif isinstance(obj, bool): + if self.last.isalnum(): + self.write(' ') + self.write(str(obj).lower()) + elif isinstance(obj, (int, long, float)): + if self.last.isalnum(): + self.write(' ') + self.write(str(obj)) + elif isinstance(obj, PDFObjRef): + if self.last.isalnum(): + self.write(' ') + self.write('%d %d R' % (obj.objid, 0)) + elif isinstance(obj, PDFStream): + data = obj.get_decdata() + self.serialize_object(obj.dic) + self.write('stream\n') + self.write(data) + self.write('\nendstream') + else: + data = str(obj) + if data[0].isalnum() and self.last.isalnum(): + self.write(' ') + self.write(data) + + def serialize_indirect(self, objid, obj): + self.write('%d 0 obj' % (objid,)) + self.serialize_object(obj) + if self.last.isalnum(): + self.write('\n') + self.write('endobj\n') + +def cli_main(argv=sys.argv): + progname = os.path.basename(argv[0]) + if RSA is None: + print "%s: This script requires PyCrypto, which must be installed " \ + "separately. Read the top-of-script comment for details." % \ + (progname,) + return 1 + if len(argv) != 4: + print "usage: %s KEYFILE INBOOK OUTBOOK" % (progname,) + return 1 + keypath, inpath, outpath = argv[1:] + with open(inpath, 'rb') as inf: + serializer = PDFSerializer(inf, keypath) + with open(outpath, 'wb') as outf: + serializer.dump(outf) + return 0 + + +class DecryptionDialog(Tkinter.Frame): + def __init__(self, root): + Tkinter.Frame.__init__(self, root, border=5) + self.status = Tkinter.Label(self, text='Select files for decryption') + self.status.pack(fill=Tkconstants.X, expand=1) + body = Tkinter.Frame(self) + body.pack(fill=Tkconstants.X, expand=1) + sticky = Tkconstants.E + Tkconstants.W + body.grid_columnconfigure(1, weight=2) + Tkinter.Label(body, text='Key file').grid(row=0) + self.keypath = Tkinter.Entry(body, width=30) + self.keypath.grid(row=0, column=1, sticky=sticky) + if os.path.exists('adeptkey.der'): + self.keypath.insert(0, 'adeptkey.der') + button = Tkinter.Button(body, text="...", command=self.get_keypath) + button.grid(row=0, column=2) + Tkinter.Label(body, text='Input file').grid(row=1) + self.inpath = Tkinter.Entry(body, width=30) + self.inpath.grid(row=1, column=1, sticky=sticky) + button = Tkinter.Button(body, text="...", command=self.get_inpath) + button.grid(row=1, column=2) + Tkinter.Label(body, text='Output file').grid(row=2) + self.outpath = Tkinter.Entry(body, width=30) + self.outpath.grid(row=2, column=1, sticky=sticky) + button = Tkinter.Button(body, text="...", command=self.get_outpath) + button.grid(row=2, column=2) + buttons = Tkinter.Frame(self) + buttons.pack() + botton = Tkinter.Button( + buttons, text="Decrypt", width=10, command=self.decrypt) + botton.pack(side=Tkconstants.LEFT) + Tkinter.Frame(buttons, width=10).pack(side=Tkconstants.LEFT) + button = Tkinter.Button( + buttons, text="Quit", width=10, command=self.quit) + button.pack(side=Tkconstants.RIGHT) + + def get_keypath(self): + keypath = tkFileDialog.askopenfilename( + parent=None, title='Select ADEPT key file', + defaultextension='.der', filetypes=[('DER-encoded files', '.der'), + ('All Files', '.*')]) + if keypath: + keypath = os.path.normpath(keypath) + self.keypath.delete(0, Tkconstants.END) + self.keypath.insert(0, keypath) + return + + def get_inpath(self): + inpath = tkFileDialog.askopenfilename( + parent=None, title='Select ADEPT-encrypted PDF file to decrypt', + defaultextension='.epub', filetypes=[('PDF files', '.pdf'), + ('All files', '.*')]) + if inpath: + inpath = os.path.normpath(inpath) + self.inpath.delete(0, Tkconstants.END) + self.inpath.insert(0, inpath) + return + + def get_outpath(self): + outpath = tkFileDialog.asksaveasfilename( + parent=None, title='Select unencrypted PDF file to produce', + defaultextension='.epub', filetypes=[('PDF files', '.pdf'), + ('All files', '.*')]) + if outpath: + outpath = os.path.normpath(outpath) + self.outpath.delete(0, Tkconstants.END) + self.outpath.insert(0, outpath) + return + + def decrypt(self): + keypath = self.keypath.get() + inpath = self.inpath.get() + outpath = self.outpath.get() + if not keypath or not os.path.exists(keypath): + self.status['text'] = 'Specified key file does not exist' + return + if not inpath or not os.path.exists(inpath): + self.status['text'] = 'Specified input file does not exist' + return + if not outpath: + self.status['text'] = 'Output file not specified' + return + if inpath == outpath: + self.status['text'] = 'Must have different input and output files' + return + argv = [sys.argv[0], keypath, inpath, outpath] + self.status['text'] = 'Decrypting...' + try: + cli_main(argv) + except Exception, e: + self.status['text'] = 'Error: ' + str(e) + return + self.status['text'] = 'File successfully decrypted' + +def gui_main(): + root = Tkinter.Tk() + if RSA is None: + root.withdraw() + tkMessageBox.showerror( + "INEPT PDF Decrypter", + "This script requires PyCrypto, which must be installed " + "separately. Read the top-of-script comment for details.") + return 1 + root.title('INEPT PDF Decrypter') + root.resizable(True, False) + root.minsize(300, 0) + DecryptionDialog(root).pack(fill=Tkconstants.X, expand=1) + root.mainloop() + return 0 + + +if __name__ == '__main__': + # sys.exit(cli_main()) + sys.exit(gui_main())