DeDRM_tools/DeDRM_plugin/erdr2pml.py

592 lines
24 KiB
Python
Executable File
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# erdr2pml.py
# Copyright © 2008-2020 The Dark Reverser, Apprentice Harper et al.
#
# Changelog
#
# Based on ereader2html version 0.08 plus some later small fixes
#
# 0.01 - Initial version
# 0.02 - Support more eReader files. Support bold text and links. Fix PML decoder parsing bug.
# 0.03 - Fix incorrect variable usage at one place.
# 0.03b - enhancement by DeBockle (version 259 support)
# Custom version 0.03 - no change to eReader support, only usability changes
# - start of pep-8 indentation (spaces not tab), fix trailing blanks
# - version variable, only one place to change
# - added main routine, now callable as a library/module,
# means tools can add optional support for ereader2html
# - outdir is no longer a mandatory parameter (defaults based on input name if missing)
# - time taken output to stdout
# - Psyco support - reduces runtime by a factor of (over) 3!
# E.g. (~600Kb file) 90 secs down to 24 secs
# - newstyle classes
# - changed map call to list comprehension
# may not work with python 2.3
# without Psyco this reduces runtime to 90%
# E.g. 90 secs down to 77 secs
# Psyco with map calls takes longer, do not run with map in Psyco JIT!
# - izip calls used instead of zip (if available), further reduction
# in run time (factor of 4.5).
# E.g. (~600Kb file) 90 secs down to 20 secs
# - Python 2.6+ support, avoid DeprecationWarning with sha/sha1
# 0.04 - Footnote support, PML output, correct charset in html, support more PML tags
# - Feature change, dump out PML file
# - Added supprt for footnote tags. NOTE footnote ids appear to be bad (not usable)
# in some pdb files :-( due to the same id being used multiple times
# - Added correct charset encoding (pml is based on cp1252)
# - Added logging support.
# 0.05 - Improved type 272 support for sidebars, links, chapters, metainfo, etc
# 0.06 - Merge of 0.04 and 0.05. Improved HTML output
# Placed images in subfolder, so that it's possible to just
# drop the book.pml file onto DropBook to make an unencrypted
# copy of the eReader file.
# Using that with Calibre works a lot better than the HTML
# conversion in this code.
# 0.07 - Further Improved type 272 support for sidebars with all earlier fixes
# 0.08 - fixed typos, removed extraneous things
# 0.09 - fixed typos in first_pages to first_page to again support older formats
# 0.10 - minor cleanups
# 0.11 - fixups for using correct xml for footnotes and sidebars for use with Dropbook
# 0.12 - Fix added to prevent lowercasing of image names when the pml code itself uses a different case in the link name.
# 0.13 - change to unbuffered stdout for use with gui front ends
# 0.14 - contributed enhancement to support --make-pmlz switch
# 0.15 - enabled high-ascii to pml character encoding. DropBook now works on Mac.
# 0.16 - convert to use openssl DES (very very fast) or pure python DES if openssl's libcrypto is not available
# 0.17 - added support for pycrypto's DES as well
# 0.18 - on Windows try PyCrypto first and OpenSSL next
# 0.19 - Modify the interface to allow use of import
# 0.20 - modify to allow use inside new interface for calibre plugins
# 0.21 - Support eReader (drm) version 11.
# - Don't reject dictionary format.
# - Ignore sidebars for dictionaries (different format?)
# 0.22 - Unicode and plugin support, different image folders for PMLZ and source
# 0.23 - moved unicode_argv call inside main for Windows DeDRM compatibility
# 1.00 - Added Python 3 compatibility for calibre 5.0
__version__='1.00'
import sys, re
import struct, binascii, getopt, zlib, os, os.path, urllib, tempfile, traceback
if 'calibre' in sys.modules:
inCalibre = True
else:
inCalibre = False
# Wrap a stream so that output gets flushed immediately
# and also make sure that any unicode strings get
# encoded using "replace" before writing them.
class SafeUnbuffered:
def __init__(self, stream):
self.stream = stream
self.encoding = stream.encoding
if self.encoding == None:
self.encoding = "utf-8"
def write(self, data):
if isinstance(data,str):
data = data.encode(self.encoding,"replace")
self.stream.buffer.write(data)
self.stream.buffer.flush()
def __getattr__(self, attr):
return getattr(self.stream, attr)
iswindows = sys.platform.startswith('win')
isosx = sys.platform.startswith('darwin')
def unicode_argv():
if iswindows:
# Uses shell32.GetCommandLineArgvW to get sys.argv as a list of Unicode
# strings.
# Versions 2.x of Python don't support Unicode in sys.argv on
# Windows, with the underlying Windows API instead replacing multi-byte
# characters with '?'.
from ctypes import POINTER, byref, cdll, c_int, windll
from ctypes.wintypes import LPCWSTR, LPWSTR
GetCommandLineW = cdll.kernel32.GetCommandLineW
GetCommandLineW.argtypes = []
GetCommandLineW.restype = LPCWSTR
CommandLineToArgvW = windll.shell32.CommandLineToArgvW
CommandLineToArgvW.argtypes = [LPCWSTR, POINTER(c_int)]
CommandLineToArgvW.restype = POINTER(LPWSTR)
cmd = GetCommandLineW()
argc = c_int(0)
argv = CommandLineToArgvW(cmd, byref(argc))
if argc.value > 0:
# Remove Python executable and commands if present
start = argc.value - len(sys.argv)
return [argv[i] for i in
range(start, argc.value)]
# if we don't have any arguments at all, just pass back script name
# this should never happen
return ["mobidedrm.py"]
else:
argvencoding = sys.stdin.encoding or "utf-8"
return [arg if isinstance(arg, str) else str(arg, argvencoding) for arg in sys.argv]
Des = None
if iswindows:
# first try with pycrypto
if inCalibre:
from calibre_plugins.dedrm import pycrypto_des
else:
import pycrypto_des
Des = pycrypto_des.load_pycrypto()
if Des == None:
# they try with openssl
if inCalibre:
from calibre_plugins.dedrm import openssl_des
else:
import openssl_des
Des = openssl_des.load_libcrypto()
else:
# first try with openssl
if inCalibre:
from calibre_plugins.dedrm import openssl_des
else:
import openssl_des
Des = openssl_des.load_libcrypto()
if Des == None:
# then try with pycrypto
if inCalibre:
from calibre_plugins.dedrm import pycrypto_des
else:
import pycrypto_des
Des = pycrypto_des.load_pycrypto()
# if that did not work then use pure python implementation
# of DES and try to speed it up with Psycho
if Des == None:
if inCalibre:
from calibre_plugins.dedrm import python_des
else:
import python_des
Des = python_des.Des
# Import Psyco if available
try:
# http://psyco.sourceforge.net
import psyco
psyco.full()
except ImportError:
pass
try:
from hashlib import sha1
except ImportError:
# older Python release
import sha
sha1 = lambda s: sha.new(s)
import cgi
import logging
logging.basicConfig()
#logging.basicConfig(level=logging.DEBUG)
class Sectionizer(object):
bkType = "Book"
def __init__(self, filename, ident):
self.contents = open(filename, 'rb').read()
self.header = self.contents[0:72]
self.num_sections, = struct.unpack('>H', self.contents[76:78])
# Dictionary or normal content (TODO: Not hard-coded)
if self.header[0x3C:0x3C+8] != ident:
if self.header[0x3C:0x3C+8] == b"PDctPPrs":
self.bkType = "Dict"
else:
raise ValueError('Invalid file format')
self.sections = []
for i in range(self.num_sections):
offset, a1,a2,a3,a4 = struct.unpack('>LBBBB', self.contents[78+i*8:78+i*8+8])
flags, val = a1, a2<<16|a3<<8|a4
self.sections.append( (offset, flags, val) )
def loadSection(self, section):
if section + 1 == self.num_sections:
end_off = len(self.contents)
else:
end_off = self.sections[section + 1][0]
off = self.sections[section][0]
return self.contents[off:end_off]
# cleanup unicode filenames
# borrowed from calibre from calibre/src/calibre/__init__.py
# added in removal of control (<32) chars
# and removal of . at start and end
# and with some (heavily edited) code from Paul Durrant's kindlenamer.py
def sanitizeFileName(name):
# substitute filename unfriendly characters
name = name.replace("<","[").replace(">","]").replace(" : "," ").replace(": "," ").replace(":","").replace("/","_").replace("\\","_").replace("|","_").replace("\"","\'")
# delete control characters
name = "".join(char for char in name if ord(char)>=32)
# white space to single space, delete leading and trailing while space
name = re.sub(r"\s", " ", name).strip()
# remove leading dots
while len(name)>0 and name[0] == ".":
name = name[1:]
# remove trailing dots (Windows doesn't like them)
if name.endswith("."):
name = name[:-1]
return name
def fixKey(key):
def fixByte(b):
return b ^ ((b ^ (b<<1) ^ (b<<2) ^ (b<<3) ^ (b<<4) ^ (b<<5) ^ (b<<6) ^ (b<<7) ^ 0x80) & 0x80)
return bytes([fixByte(a) for a in key])
def deXOR(text, sp, table):
r=''
j = sp
for i in range(len(text)):
r += chr(ord(table[j]) ^ ord(text[i]))
j = j + 1
if j == len(table):
j = 0
return r
class EreaderProcessor(object):
def __init__(self, sect, user_key):
self.section_reader = sect.loadSection
data = self.section_reader(0)
version, = struct.unpack('>H', data[0:2])
self.version = version
logging.info('eReader file format version %s', version)
if version != 272 and version != 260 and version != 259:
raise ValueError('incorrect eReader version %d (error 1)' % version)
data = self.section_reader(1)
self.data = data
des = Des(fixKey(data[0:8]))
cookie_shuf, cookie_size = struct.unpack('>LL', des.decrypt(data[-8:]))
if cookie_shuf < 3 or cookie_shuf > 0x14 or cookie_size < 0xf0 or cookie_size > 0x200:
raise ValueError('incorrect eReader version (error 2)')
input = des.decrypt(data[-cookie_size:])
def unshuff(data, shuf):
r = [0] * len(data)
j = 0
for i in range(len(data)):
j = (j + shuf) % len(data)
r[j] = data[i]
assert len(bytes(r)) == len(data)
return bytes(r)
r = unshuff(input[0:-8], cookie_shuf)
drm_sub_version = struct.unpack('>H', r[0:2])[0]
self.num_text_pages = struct.unpack('>H', r[2:4])[0] - 1
self.num_image_pages = struct.unpack('>H', r[26:26+2])[0]
self.first_image_page = struct.unpack('>H', r[24:24+2])[0]
# Default values
self.num_footnote_pages = 0
self.num_sidebar_pages = 0
self.first_footnote_page = -1
self.first_sidebar_page = -1
if self.version == 272:
self.num_footnote_pages = struct.unpack('>H', r[46:46+2])[0]
self.first_footnote_page = struct.unpack('>H', r[44:44+2])[0]
if (sect.bkType == "Book"):
self.num_sidebar_pages = struct.unpack('>H', r[38:38+2])[0]
self.first_sidebar_page = struct.unpack('>H', r[36:36+2])[0]
# self.num_bookinfo_pages = struct.unpack('>H', r[34:34+2])[0]
# self.first_bookinfo_page = struct.unpack('>H', r[32:32+2])[0]
# self.num_chapter_pages = struct.unpack('>H', r[22:22+2])[0]
# self.first_chapter_page = struct.unpack('>H', r[20:20+2])[0]
# self.num_link_pages = struct.unpack('>H', r[30:30+2])[0]
# self.first_link_page = struct.unpack('>H', r[28:28+2])[0]
# self.num_xtextsize_pages = struct.unpack('>H', r[54:54+2])[0]
# self.first_xtextsize_page = struct.unpack('>H', r[52:52+2])[0]
# **before** data record 1 was decrypted and unshuffled, it contained data
# to create an XOR table and which is used to fix footnote record 0, link records, chapter records, etc
self.xortable_offset = struct.unpack('>H', r[40:40+2])[0]
self.xortable_size = struct.unpack('>H', r[42:42+2])[0]
self.xortable = self.data[self.xortable_offset:self.xortable_offset + self.xortable_size]
else:
# Nothing needs to be done
pass
# self.num_bookinfo_pages = 0
# self.num_chapter_pages = 0
# self.num_link_pages = 0
# self.num_xtextsize_pages = 0
# self.first_bookinfo_page = -1
# self.first_chapter_page = -1
# self.first_link_page = -1
# self.first_xtextsize_page = -1
logging.debug('self.num_text_pages %d', self.num_text_pages)
logging.debug('self.num_footnote_pages %d, self.first_footnote_page %d', self.num_footnote_pages , self.first_footnote_page)
logging.debug('self.num_sidebar_pages %d, self.first_sidebar_page %d', self.num_sidebar_pages , self.first_sidebar_page)
self.flags = struct.unpack('>L', r[4:8])[0]
reqd_flags = (1<<9) | (1<<7) | (1<<10)
if (self.flags & reqd_flags) != reqd_flags:
print("Flags: 0x%X" % self.flags)
raise ValueError('incompatible eReader file')
des = Des(fixKey(user_key))
if version == 259:
if drm_sub_version != 7:
raise ValueError('incorrect eReader version %d (error 3)' % drm_sub_version)
encrypted_key_sha = r[44:44+20]
encrypted_key = r[64:64+8]
elif version == 260:
if drm_sub_version != 13 and drm_sub_version != 11:
raise ValueError('incorrect eReader version %d (error 3)' % drm_sub_version)
if drm_sub_version == 13:
encrypted_key = r[44:44+8]
encrypted_key_sha = r[52:52+20]
else:
encrypted_key = r[64:64+8]
encrypted_key_sha = r[44:44+20]
elif version == 272:
encrypted_key = r[172:172+8]
encrypted_key_sha = r[56:56+20]
self.content_key = des.decrypt(encrypted_key)
if sha1(self.content_key).digest() != encrypted_key_sha:
raise ValueError('Incorrect Name and/or Credit Card')
def getNumImages(self):
return self.num_image_pages
def getImage(self, i):
sect = self.section_reader(self.first_image_page + i)
name = sect[4:4+32].strip(b'\0')
data = sect[62:]
return sanitizeFileName(name.decode('windows-1252')), data
# def getChapterNamePMLOffsetData(self):
# cv = ''
# if self.num_chapter_pages > 0:
# for i in xrange(self.num_chapter_pages):
# chaps = self.section_reader(self.first_chapter_page + i)
# j = i % self.xortable_size
# offname = deXOR(chaps, j, self.xortable)
# offset = struct.unpack('>L', offname[0:4])[0]
# name = offname[4:].strip('\0')
# cv += '%d|%s\n' % (offset, name)
# return cv
# def getLinkNamePMLOffsetData(self):
# lv = ''
# if self.num_link_pages > 0:
# for i in xrange(self.num_link_pages):
# links = self.section_reader(self.first_link_page + i)
# j = i % self.xortable_size
# offname = deXOR(links, j, self.xortable)
# offset = struct.unpack('>L', offname[0:4])[0]
# name = offname[4:].strip('\0')
# lv += '%d|%s\n' % (offset, name)
# return lv
# def getExpandedTextSizesData(self):
# ts = ''
# if self.num_xtextsize_pages > 0:
# tsize = deXOR(self.section_reader(self.first_xtextsize_page), 0, self.xortable)
# for i in xrange(self.num_text_pages):
# xsize = struct.unpack('>H', tsize[0:2])[0]
# ts += "%d\n" % xsize
# tsize = tsize[2:]
# return ts
# def getBookInfo(self):
# bkinfo = ''
# if self.num_bookinfo_pages > 0:
# info = self.section_reader(self.first_bookinfo_page)
# bkinfo = deXOR(info, 0, self.xortable)
# bkinfo = bkinfo.replace('\0','|')
# bkinfo += '\n'
# return bkinfo
def getText(self):
des = Des(fixKey(self.content_key))
r = b''
for i in range(self.num_text_pages):
logging.debug('get page %d', i)
r += zlib.decompress(des.decrypt(self.section_reader(1 + i)))
# now handle footnotes pages
if self.num_footnote_pages > 0:
r += '\n'
# the record 0 of the footnote section must pass through the Xor Table to make it useful
sect = self.section_reader(self.first_footnote_page)
fnote_ids = deXOR(sect, 0, self.xortable)
# the remaining records of the footnote sections need to be decoded with the content_key and zlib inflated
des = Des(fixKey(self.content_key))
for i in range(1,self.num_footnote_pages):
logging.debug('get footnotepage %d', i)
id_len = ord(fnote_ids[2])
id = fnote_ids[3:3+id_len]
fmarker = '<footnote id="%s">\n' % id
fmarker += zlib.decompress(des.decrypt(self.section_reader(self.first_footnote_page + i)))
fmarker += '\n</footnote>\n'
r += fmarker
fnote_ids = fnote_ids[id_len+4:]
# TODO: Handle dictionary index (?) pages - which are also marked as
# sidebar_pages (?). For now dictionary sidebars are ignored
# For dictionaries - record 0 is null terminated strings, followed by
# blocks of around 62000 bytes and a final block. Not sure of the
# encoding
# now handle sidebar pages
if self.num_sidebar_pages > 0:
r += '\n'
# the record 0 of the sidebar section must pass through the Xor Table to make it useful
sect = self.section_reader(self.first_sidebar_page)
sbar_ids = deXOR(sect, 0, self.xortable)
# the remaining records of the sidebar sections need to be decoded with the content_key and zlib inflated
des = Des(fixKey(self.content_key))
for i in range(1,self.num_sidebar_pages):
id_len = ord(sbar_ids[2])
id = sbar_ids[3:3+id_len]
smarker = '<sidebar id="%s">\n' % id
smarker += zlib.decompress(des.decrypt(self.section_reader(self.first_sidebar_page + i)))
smarker += '\n</sidebar>\n'
r += smarker
sbar_ids = sbar_ids[id_len+4:]
return r
def cleanPML(pml):
# Convert special characters to proper PML code. High ASCII start at (\x80, \a128) and go up to (\xff, \a255)
pml2 = pml
for k in range(128,256):
pml2 = pml2.replace(bytes([k]), b'\\a%03d' % k)
return pml2
def decryptBook(infile, outpath, make_pmlz, user_key):
bookname = os.path.splitext(os.path.basename(infile))[0]
if make_pmlz:
# outpath is actually pmlz name
pmlzname = outpath
outdir = tempfile.mkdtemp()
imagedirpath = os.path.join(outdir,"images")
else:
pmlzname = None
outdir = outpath
imagedirpath = os.path.join(outdir,bookname + "_img")
try:
if not os.path.exists(outdir):
os.makedirs(outdir)
print("Decoding File")
sect =Sectionizer(infile, b'PNRdPPrs')
er = EreaderProcessor(sect, user_key)
if er.getNumImages() > 0:
print("Extracting images")
if not os.path.exists(imagedirpath):
os.makedirs(imagedirpath)
for i in range(er.getNumImages()):
name, contents = er.getImage(i)
open(os.path.join(imagedirpath, name), 'wb').write(contents)
print("Extracting pml")
pml_string = er.getText()
pmlfilename = bookname + ".pml"
open(os.path.join(outdir, pmlfilename),'wb').write(cleanPML(pml_string))
if pmlzname is not None:
import zipfile
import shutil
print("Creating PMLZ file {0}".format(os.path.basename(pmlzname)))
myZipFile = zipfile.ZipFile(pmlzname,'w',zipfile.ZIP_STORED, False)
list = os.listdir(outdir)
for filename in list:
localname = filename
filePath = os.path.join(outdir,filename)
if os.path.isfile(filePath):
myZipFile.write(filePath, localname)
elif os.path.isdir(filePath):
imageList = os.listdir(filePath)
localimgdir = os.path.basename(filePath)
for image in imageList:
localname = os.path.join(localimgdir,image)
imagePath = os.path.join(filePath,image)
if os.path.isfile(imagePath):
myZipFile.write(imagePath, localname)
myZipFile.close()
# remove temporary directory
shutil.rmtree(outdir, True)
print("Output is {0}".format(pmlzname))
else:
print("Output is in {0}".format(outdir))
print("done")
except ValueError as e:
print("Error: {0}".format(e))
traceback.print_exc()
return 1
return 0
def usage():
print("Converts DRMed eReader books to PML Source")
print("Usage:")
print(" erdr2pml [options] infile.pdb [outpath] \"your name\" credit_card_number")
print(" ")
print("Options: ")
print(" -h prints this message")
print(" -p create PMLZ instead of source folder")
print(" --make-pmlz create PMLZ instead of source folder")
print(" ")
print("Note:")
print(" if outpath is ommitted, creates source in 'infile_Source' folder")
print(" if outpath is ommitted and pmlz option, creates PMLZ 'infile.pmlz'")
print(" if source folder created, images are in infile_img folder")
print(" if pmlz file created, images are in images folder")
print(" It's enough to enter the last 8 digits of the credit card number")
return
def getuser_key(name,cc):
newname = "".join(c for c in name.lower() if c >= 'a' and c <= 'z' or c >= '0' and c <= '9')
cc = cc.replace(" ","")
return struct.pack('>LL', binascii.crc32(bytes(newname.encode('utf-8'))) & 0xffffffff, binascii.crc32(bytes(cc[-8:].encode('utf-8'))) & 0xffffffff)
def cli_main():
print("eRdr2Pml v{0}. Copyright © 20092020 The Dark Reverser et al.".format(__version__))
argv=unicode_argv()
try:
opts, args = getopt.getopt(argv[1:], "hp", ["make-pmlz"])
except getopt.GetoptError as err:
print(err.args[0])
usage()
return 1
make_pmlz = False
for o, a in opts:
if o == "-h":
usage()
return 0
elif o == "-p":
make_pmlz = True
elif o == "--make-pmlz":
make_pmlz = True
if len(args)!=3 and len(args)!=4:
usage()
return 1
if len(args)==3:
infile, name, cc = args
if make_pmlz:
outpath = os.path.splitext(infile)[0] + ".pmlz"
else:
outpath = os.path.splitext(infile)[0] + "_Source"
elif len(args)==4:
infile, outpath, name, cc = args
print(binascii.b2a_hex(getuser_key(name,cc)))
return decryptBook(infile, outpath, make_pmlz, getuser_key(name,cc))
if __name__ == "__main__":
sys.stdout=SafeUnbuffered(sys.stdout)
sys.stderr=SafeUnbuffered(sys.stderr)
sys.exit(cli_main())