From 1b391da815cff190a86da497fc9c0991022c55df Mon Sep 17 00:00:00 2001 From: NoDRM Date: Wed, 17 Nov 2021 16:17:30 +0100 Subject: [PATCH] Add some more watermark removal code --- DeDRM_plugin/__init__.py | 77 ++++------- DeDRM_plugin/config.py | 7 + DeDRM_plugin/epubwatermark.py | 244 ++++++++++++++++++++++++++++++++++ DeDRM_plugin/prefs.py | 1 + 4 files changed, 275 insertions(+), 54 deletions(-) create mode 100644 DeDRM_plugin/epubwatermark.py diff --git a/DeDRM_plugin/__init__.py b/DeDRM_plugin/__init__.py index 515c356..e2f3464 100644 --- a/DeDRM_plugin/__init__.py +++ b/DeDRM_plugin/__init__.py @@ -90,12 +90,9 @@ PLUGIN_VERSION = ".".join([str(x)for x in PLUGIN_VERSION_TUPLE]) RESOURCE_NAME = PLUGIN_NAME + '_Help.htm' import codecs -import sys, os, re +import sys, os import time -import zipfile import traceback -from zipfile import ZipInfo, ZipFile, ZIP_STORED, ZIP_DEFLATED -from contextlib import closing class DeDRMError(Exception): @@ -211,55 +208,31 @@ class DeDRM(FileTypePlugin): # This is called after the DRM is removed (or if no DRM was present) # It does stuff like de-obfuscating fonts (by calling checkFonts) # or removing watermarks. - path_to_ebook = self.checkFonts(path_to_ebook) - path_to_ebook = self.removeCDPwatermarkFromEPUB(path_to_ebook) - return path_to_ebook - - def removeCDPwatermarkFromEPUB(self, path_to_ebook): - # "META-INF/cdp.info" is a watermark file used by some Tolino vendors. - # We don't want that in our eBooks, so lets remove that file. 
try: - infile = ZipFile(open(path_to_ebook, 'rb')) - namelist = infile.namelist() - if 'META-INF/cdp.info' not in namelist: + import calibre_plugins.dedrm.prefs as prefs + dedrmprefs = prefs.DeDRM_Prefs() + + if dedrmprefs["deobfuscate_fonts"] is True: + # Deobfuscate fonts + path_to_ebook = self.checkFonts(path_to_ebook) or path_to_ebook + + if dedrmprefs["remove_watermarks"] is True: + import calibre_plugins.dedrm.epubwatermark as watermark + + # Remove Tolino's CDP watermark file + path_to_ebook = watermark.removeCDPwatermark(self, path_to_ebook) or path_to_ebook + + # Remove watermarks (currently just Amazon) from the OPF file + path_to_ebook = watermark.removeOPFwatermarks(self, path_to_ebook) or path_to_ebook + + # Remove watermarks (currently just Adobe's resource ID) from all HTML and XHTML files + path_to_ebook = watermark.removeHTMLwatermarks(self, path_to_ebook) or path_to_ebook + return path_to_ebook - namelist.remove("mimetype") - namelist.remove("META-INF/cdp.info") - - output = self.temporary_file(".epub").name - - kwds = dict(compression=ZIP_DEFLATED, allowZip64=False) - with closing(ZipFile(open(output, 'wb'), 'w', **kwds)) as outf: - for path in (["mimetype"] + namelist): - - data = infile.read(path) - - zi = ZipInfo(path) - oldzi = infile.getinfo(path) - try: - zi.compress_type = oldzi.compress_type - if path == "mimetype": - zi.compress_type = ZIP_STORED - zi.date_time = oldzi.date_time - zi.comment = oldzi.comment - zi.extra = oldzi.extra - zi.internal_attr = oldzi.internal_attr - zi.external_attr = oldzi.external_attr - zi.create_system = oldzi.create_system - if any(ord(c) >= 128 for c in path) or any(ord(c) >= 128 for c in zi.comment): - # If the file name or the comment contains any non-ASCII char, set the UTF8-flag - zi.flag_bits |= 0x800 - except: - pass - - outf.writestr(zi, data) - - print("{0} v{1}: Successfully removed cdp.info watermark".format(PLUGIN_NAME, PLUGIN_VERSION)) - return output - except: + print("Error while checking 
settings") return path_to_ebook def checkFonts(self, path_to_ebook): @@ -267,10 +240,6 @@ class DeDRM(FileTypePlugin): # It checks if there's fonts that need to be deobfuscated try: - import calibre_plugins.dedrm.prefs as prefs - dedrmprefs = prefs.DeDRM_Prefs() - - if dedrmprefs["deobfuscate_fonts"] is True: import calibre_plugins.dedrm.epubfontdecrypt as epubfontdecrypt output = self.temporary_file(".epub").name @@ -283,10 +252,10 @@ class DeDRM(FileTypePlugin): else: print("{0} v{1}: Error during font deobfuscation".format(PLUGIN_NAME, PLUGIN_VERSION)) raise DeDRMError("Font deobfuscation failed") - else: - return path_to_ebook + except: print("{0} v{1}: Error during font deobfuscation".format(PLUGIN_NAME, PLUGIN_VERSION)) + traceback.print_exc() return path_to_ebook def ePubDecrypt(self,path_to_ebook): diff --git a/DeDRM_plugin/config.py b/DeDRM_plugin/config.py index fe2e443..7429a2a 100755 --- a/DeDRM_plugin/config.py +++ b/DeDRM_plugin/config.py @@ -83,6 +83,7 @@ class ConfigWidget(QWidget): self.tempdedrmprefs['adobewineprefix'] = self.dedrmprefs['adobewineprefix'] self.tempdedrmprefs['kindlewineprefix'] = self.dedrmprefs['kindlewineprefix'] self.tempdedrmprefs['deobfuscate_fonts'] = self.dedrmprefs['deobfuscate_fonts'] + self.tempdedrmprefs['remove_watermarks'] = self.dedrmprefs['remove_watermarks'] # Start Qt Gui dialog layout layout = QVBoxLayout(self) @@ -146,6 +147,11 @@ class ConfigWidget(QWidget): self.chkFontObfuscation.setChecked(self.tempdedrmprefs["deobfuscate_fonts"]) button_layout.addWidget(self.chkFontObfuscation) + self.chkRemoveWatermarks = QtGui.QCheckBox(_("Remove watermarks")) + self.chkRemoveWatermarks.setToolTip("Tries to remove watermarks from files") + self.chkRemoveWatermarks.setChecked(self.tempdedrmprefs["remove_watermarks"]) + button_layout.addWidget(self.chkRemoveWatermarks) + self.resize(self.sizeHint()) def kindle_serials(self): @@ -209,6 +215,7 @@ class ConfigWidget(QWidget): self.dedrmprefs.set('kindlewineprefix', 
self.tempdedrmprefs['kindlewineprefix']) self.dedrmprefs.set('configured', True) self.dedrmprefs.set('deobfuscate_fonts', self.chkFontObfuscation.isChecked()) + self.dedrmprefs.set('remove_watermarks', self.chkRemoveWatermarks.isChecked()) self.dedrmprefs.writeprefs() def load_resource(self, name): diff --git a/DeDRM_plugin/epubwatermark.py b/DeDRM_plugin/epubwatermark.py new file mode 100644 index 0000000..d84d95f --- /dev/null +++ b/DeDRM_plugin/epubwatermark.py @@ -0,0 +1,244 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +# epubwatermark.py +# Copyright © 2021 NoDRM + +# Revision history: +# 1.0 - Initial version + +# Released under the terms of the GNU General Public Licence, version 3 +# + +""" +Removes various watermarks from EPUB files +""" + +import traceback +from zipfile import ZipInfo, ZipFile, ZIP_STORED, ZIP_DEFLATED +from contextlib import closing +from lxml import etree +import re + +# Runs a RegEx over all HTML/XHTML files to remove watermarks. +def removeHTMLwatermarks(object, path_to_ebook): + try: + inf = ZipFile(open(path_to_ebook, 'rb')) + namelist = inf.namelist() + + modded_names = [] + modded_contents = [] + + for file in namelist: + if not (file.endswith('.html') or file.endswith('.xhtml')): + continue + + try: + file_str = inf.read(file).decode("utf-8") + str_new = file_str + + # Remove Adobe ADEPT watermarks + # Match optional newline at the beginning, then a "meta" tag with name = "Adept.expected.resource" or "Adept.resource" + # and either a "value" or a "content" element with an Adobe UUID + str_new = re.sub(r'((\r\n|\r|\n)\s*)?\', '', str_new) + str_new = re.sub(r'((\r\n|\r|\n)\s*)?\', '', str_new) + except: + traceback.print_exc() + continue + + if (file_str == str_new): + continue + + modded_names.append(file) + modded_contents.append(str_new) + + if len(modded_names) == 0: + # No file modified, return original + return path_to_ebook + + if len(modded_names) != len(modded_contents): + # Something went terribly wrong, return 
original + print("Watermark: Error during ADEPT watermark removal") + return path_to_ebook + + # Re-package with modified files: + namelist.remove("mimetype") + + try: + output = object.temporary_file(".epub").name + kwds = dict(compression=ZIP_DEFLATED, allowZip64=False) + with closing(ZipFile(open(output, 'wb'), 'w', **kwds)) as outf: + for path in (["mimetype"] + namelist): + + data = inf.read(path) + + try: + modded_index = None + modded_index = modded_names.index(path) + except: + pass + + if modded_index is not None: + # Found modified file - replace contents + data = modded_contents[modded_index] + + zi = ZipInfo(path) + oldzi = inf.getinfo(path) + try: + zi.compress_type = oldzi.compress_type + if path == "mimetype": + zi.compress_type = ZIP_STORED + zi.date_time = oldzi.date_time + zi.comment = oldzi.comment + zi.extra = oldzi.extra + zi.internal_attr = oldzi.internal_attr + zi.external_attr = oldzi.external_attr + zi.create_system = oldzi.create_system + if any(ord(c) >= 128 for c in path) or any(ord(c) >= 128 for c in zi.comment): + # If the file name or the comment contains any non-ASCII char, set the UTF8-flag + zi.flag_bits |= 0x800 + except: + pass + + outf.writestr(zi, data) + except: + traceback.print_exc() + return path_to_ebook + + except: + traceback.print_exc() + return path_to_ebook + + print("Watermark: Successfully stripped {0} ADEPT watermark(s) from ebook.".format(len(modded_names))) + return output + + +# Finds the main OPF file, then uses RegEx to remove watermarks +def removeOPFwatermarks(object, path_to_ebook): + contNS = lambda tag: '{%s}%s' % ('urn:oasis:names:tc:opendocument:xmlns:container', tag) + opf_path = None + + try: + inf = ZipFile(open(path_to_ebook, 'rb')) + container = etree.fromstring(inf.read("META-INF/container.xml")) + rootfiles = container.find(contNS("rootfiles")).findall(contNS("rootfile")) + for rootfile in rootfiles: + opf_path = rootfile.get("full-path", None) + if (opf_path is not None): + break + except: + 
traceback.print_exc() + return path_to_ebook + + # If opf_path is None, we didn't find an OPF, so there is no OPF watermark to remove. + # If opf_path is set, it's the path to the main content OPF file. + + if (opf_path is None): + # No OPF found - no watermark + return path_to_ebook + else: + try: + container_str = inf.read(opf_path).decode("utf-8") + container_str_new = container_str + + # Remove Amazon hex watermarks + # Match optional newline at the beginning, then spaces, then a "meta" tag with name = "Watermark" or "Watermark_(hex)" and a "content" element. + container_str_new = re.sub(r'((\r\n|\r|\n)\s*)?\', '', container_str_new) + container_str_new = re.sub(r'((\r\n|\r|\n)\s*)?\', '', container_str_new) + except: + traceback.print_exc() + return path_to_ebook + + if (container_str == container_str_new): + # container didn't change - no watermark + return path_to_ebook + + # Re-package without watermark + namelist = inf.namelist() + namelist.remove("mimetype") + + try: + output = object.temporary_file(".epub").name + kwds = dict(compression=ZIP_DEFLATED, allowZip64=False) + with closing(ZipFile(open(output, 'wb'), 'w', **kwds)) as outf: + for path in (["mimetype"] + namelist): + + data = inf.read(path) + if path == opf_path: + # Found OPF, replacing ... 
+ data = container_str_new + + zi = ZipInfo(path) + oldzi = inf.getinfo(path) + try: + zi.compress_type = oldzi.compress_type + if path == "mimetype": + zi.compress_type = ZIP_STORED + zi.date_time = oldzi.date_time + zi.comment = oldzi.comment + zi.extra = oldzi.extra + zi.internal_attr = oldzi.internal_attr + zi.external_attr = oldzi.external_attr + zi.create_system = oldzi.create_system + if any(ord(c) >= 128 for c in path) or any(ord(c) >= 128 for c in zi.comment): + # If the file name or the comment contains any non-ASCII char, set the UTF8-flag + zi.flag_bits |= 0x800 + except: + pass + + outf.writestr(zi, data) + except: + traceback.print_exc() + return path_to_ebook + + print("Watermark: Successfully stripped Amazon watermark from OPF file.") + return output + + + +def removeCDPwatermark(object, path_to_ebook): + # "META-INF/cdp.info" is a watermark file used by some Tolino vendors. + # We don't want that in our eBooks, so lets remove that file. + try: + infile = ZipFile(open(path_to_ebook, 'rb')) + namelist = infile.namelist() + if 'META-INF/cdp.info' not in namelist: + return path_to_ebook + + namelist.remove("mimetype") + namelist.remove("META-INF/cdp.info") + + output = object.temporary_file(".epub").name + + kwds = dict(compression=ZIP_DEFLATED, allowZip64=False) + with closing(ZipFile(open(output, 'wb'), 'w', **kwds)) as outf: + for path in (["mimetype"] + namelist): + + data = infile.read(path) + + zi = ZipInfo(path) + oldzi = infile.getinfo(path) + try: + zi.compress_type = oldzi.compress_type + if path == "mimetype": + zi.compress_type = ZIP_STORED + zi.date_time = oldzi.date_time + zi.comment = oldzi.comment + zi.extra = oldzi.extra + zi.internal_attr = oldzi.internal_attr + zi.external_attr = oldzi.external_attr + zi.create_system = oldzi.create_system + if any(ord(c) >= 128 for c in path) or any(ord(c) >= 128 for c in zi.comment): + # If the file name or the comment contains any non-ASCII char, set the UTF8-flag + zi.flag_bits |= 0x800 + except: 
+ pass + + outf.writestr(zi, data) + + print("Watermark: Successfully removed cdp.info watermark") + return output + + except: + traceback.print_exc() + return path_to_ebook \ No newline at end of file diff --git a/DeDRM_plugin/prefs.py b/DeDRM_plugin/prefs.py index e1d8cc6..ebfb6b2 100755 --- a/DeDRM_plugin/prefs.py +++ b/DeDRM_plugin/prefs.py @@ -20,6 +20,7 @@ class DeDRM_Prefs(): self.dedrmprefs.defaults['configured'] = False self.dedrmprefs.defaults['deobfuscate_fonts'] = True + self.dedrmprefs.defaults['remove_watermarks'] = False self.dedrmprefs.defaults['bandnkeys'] = {} self.dedrmprefs.defaults['adeptkeys'] = {} self.dedrmprefs.defaults['ereaderkeys'] = {}