topazscripts 1.6

2010-01-21 12:14:31 +00:00 · 2010-01-21 12:14:31 +00:00 · 58e9c973ab
parent a1fec0b54d
commit 58e9c973ab
4 changed files with 598 additions and 90 deletions
--- a/Topaz_Tools/lib/changes.txt
+++ b/Topaz_Tools/lib/changes.txt
@ -1,3 +1,8 @@
+Changes in version 1.6
+	- support for books whose paragraphs have no styles
+	- support for running cmbtc_dump on Linux and Mac OSX, provided you know the PID of your iPod or standalone Kindle
+	 (contributed by DiapDealer)
+
 Changes in version 1.5
 	- completely reworked generation of styles to use actual page heights and widths
 	- added new script getpagedim.py to support the above
--- a/Topaz_Tools/lib/cmbtc_dump_mac_linux.py
+++ b/Topaz_Tools/lib/cmbtc_dump_mac_linux.py
@ -0,0 +1,504 @@
+#! /usr/bin/python
+
+from __future__ import with_statement
+
+import csv
+import sys
+import os
+import getopt
+import zlib
+from struct import pack
+from struct import unpack
+
+# NOTE(review): MAX_PATH is not referenced anywhere else in this script.
+MAX_PATH = 255
+
+# Put the first 8 characters of your Kindle PID here
+# or supply it with the -p option on the command line.
+# main() always tries this value first, before any PID given with -p.
+####################################################
+kindlePID = "12345678"
+####################################################
+
+# Module-level state shared by the functions below.  At module scope a
+# "global" statement does not create the name; these lines only list the
+# globals that are assigned later on:
+#   bookFile           - open book file object (set in main)
+#   bookPayloadOffset  - file offset where the payload starts (set in parseTopazHeader)
+#   bookHeaderRecords  - dict: record name -> header entries (set in parseTopazHeader)
+#   bookMetadata       - dict of metadata key -> value (set in parseMetadata)
+#   bookKey            - key used to decrypt encrypted records (set in main)
+#   command            - action selected on the command line (set in main/setCommand)
+global bookFile
+global bookPayloadOffset
+global bookHeaderRecords
+global bookMetadata
+global bookKey
+global command
+
+#
+# Exceptions for all the problems that might happen during the script
+#
+
+class CMBDTCError(Exception):
+    """Recoverable failure.
+
+    Raised by decryptDkeyRecord() when a dkey record does not validate;
+    decryptDkeyRecords() catches it and moves on to the next record.
+    """
+    
+class CMBDTCFatal(Exception):
+    """Unrecoverable failure (unreadable file, parse error, bad command line).
+
+    Nothing in this script catches it, so it terminates the run.
+    """
+    
+
+#
+# Open the book file at path
+#
+
+def openBook(path):
+    """Open the book file at *path* for binary reading.
+
+    Returns the open file object.
+    Raises CMBDTCFatal when the file cannot be opened.
+    """
+    try:
+        return open(path,'rb')
+    except (IOError, OSError):
+        # Only I/O failures are translated.  A bare "except:" would also
+        # swallow KeyboardInterrupt and hide programming errors.
+        raise CMBDTCFatal("Could not open book file: " + path)
+
+#
+# Get a 7 bit encoded number from the book file
+#
+
+def bookReadEncodedNumber():
+    """Read one variable-length integer from the global bookFile.
+
+    A leading 0xFF byte marks the value as negative.  Every byte with the
+    high bit set is followed by another byte; each byte contributes its
+    low 7 bits, most significant group first.
+    """
+    current = ord(bookFile.read(1))
+
+    negative = (current == 0xFF)
+    if negative:
+        current = ord(bookFile.read(1))
+
+    # For a single-byte value (< 0x80) the mask leaves the byte unchanged.
+    value = current & 0x7F
+    while current >= 0x80:
+        current = ord(bookFile.read(1))
+        value = (value << 7) + (current & 0x7F)
+
+    if negative:
+        return -value
+    return value
+    
+#
+# Encode a number in 7 bit format
+#
+
+def encodeNumber(number):
+    """Encode *number* as a 7-bit variable-length string.
+
+    The result is most significant group first; every byte except the last
+    has its high bit set.  Negative numbers get a leading 0xFF byte.
+
+    NOTE(review): for negative input the magnitude written is
+    (-number + 1), which bookReadEncodedNumber() would read back as
+    -(-number + 1), not as number -- confirm this is intended.
+    """
+    print("Using encodeNumber routine")
+
+    negative = number < 0
+    if negative:
+        number = -number + 1
+
+    # Groups are collected least significant first and reversed at the end.
+    groups = []
+    continuation = 0
+    while True:
+        groups.append(chr((number & 0x7F) + continuation))
+        number = number >> 7
+        continuation = 0x80
+        if number == 0:
+            break
+
+    if negative:
+        groups.append(chr(0xFF))
+
+    groups.reverse()
+    return "".join(groups)
+  
+#
+# Get a length prefixed string from the file 
+#
+
+def bookReadString():
+    """Read a length-prefixed string from the global bookFile.
+
+    The length is stored as a 7-bit encoded number in front of the data.
+    """
+    size = bookReadEncodedNumber()
+    layout = str(size) + "s"
+    (text,) = unpack(layout, bookFile.read(size))
+    return text
+    
+#
+# Returns a length prefixed string
+#
+
+def lengthPrefixString(data):
+    """Return *data* preceded by its length as a 7-bit encoded number."""
+    prefix = encodeNumber(len(data))
+    return prefix + data
+    
+
+#
+# Read and return the data of one header record at the current book file position [[offset,decompressedLength,compressedLength],...]
+#
+    
+def bookReadHeaderRecordData():
+    """Read the entries of one header record at the current file position.
+
+    A count is read first, then three encoded numbers per entry.  Returns
+    a list of [offset, decompressedLength, compressedLength] lists.
+    """
+    count = bookReadEncodedNumber()
+    return [
+        [bookReadEncodedNumber(), bookReadEncodedNumber(), bookReadEncodedNumber()]
+        for _ in range(count)
+    ]
+   
+#
+# Read and parse one header record at the current book file position and return the associated data [[offset,decompressedLength,compressedLength],...]
+#
+
+def parseTopazHeaderRecord():
+    """Parse one header record at the current file position.
+
+    The record must start with the byte 0x63, followed by its tag string
+    and its entries.  Returns [tag, entries].
+    Raises CMBDTCFatal when the 0x63 marker is missing.
+    """
+    marker = ord(bookFile.read(1))
+    if marker != 0x63:
+        raise CMBDTCFatal("Parse Error : Invalid Header")
+
+    tag = bookReadString()
+    return [tag, bookReadHeaderRecordData()]
+
+#
+# Parse the header of a Topaz file, get all the header records and the offset for the payload
+#
+ 
+def parseTopazHeader():
+    """Parse the Topaz file header from the global bookFile.
+
+    Fills the global bookHeaderRecords (tag -> entries) and stores the
+    file position right after the header in bookPayloadOffset.
+    Raises CMBDTCFatal on a wrong magic value or a missing 0x64 end marker.
+    """
+    global bookHeaderRecords
+    global bookPayloadOffset
+
+    (signature,) = unpack("4s", bookFile.read(4))
+    if signature != 'TPZ0':
+        raise CMBDTCFatal("Parse Error : Invalid Header, not a Topaz file")
+
+    recordCount = bookReadEncodedNumber()
+    bookHeaderRecords = {}
+
+    for _ in range(recordCount):
+        tag, entries = parseTopazHeaderRecord()
+        print tag, entries
+        bookHeaderRecords[tag] = entries
+
+    # The header is terminated by a single 0x64 byte.
+    terminator = ord(bookFile.read(1))
+    if terminator != 0x64:
+        raise CMBDTCFatal("Parse Error : Invalid Header")
+
+    bookPayloadOffset = bookFile.tell()
+   
+#
+# Get a record in the book payload, given its name and index.
+# If necessary the record is decrypted and decompressed before it is returned.
+#
+
+def getBookPayloadRecord(name, index):
+    """Return payload record *index* of type *name*, decrypted and decompressed.
+
+    The header entry for the record is [offset, decompressedLength,
+    compressedLength].  A negative index stored in the payload marks the
+    record as encrypted (real index is -stored - 1); a compressedLength
+    greater than zero marks it as zlib-compressed.
+
+    Raises CMBDTCFatal when the record is not listed in the header or when
+    the name or index stored in the payload does not match the request.
+    """
+    try:
+        entry = bookHeaderRecords[name][index]
+        recordOffset = entry[0]
+    except (KeyError, IndexError, TypeError):
+        # Only lookup failures mean "not found"; anything else is a real bug
+        # and must not be hidden behind a bare "except:".
+        raise CMBDTCFatal("Parse Error : Invalid Record, record not found")
+
+    bookFile.seek(bookPayloadOffset + recordOffset)
+
+    tag = bookReadString()
+    if tag != name :
+        raise CMBDTCFatal("Parse Error : Invalid Record, record name doesn't match")
+
+    encrypted = False
+    recordIndex = bookReadEncodedNumber()
+    if recordIndex < 0 :
+        encrypted = True
+        recordIndex = -recordIndex -1
+
+    if recordIndex != index :
+        raise CMBDTCFatal("Parse Error : Invalid Record, index doesn't match")
+
+    compressed = entry[2] > 0
+    if compressed:
+        record = bookFile.read(entry[2])
+    else:
+        record = bookFile.read(entry[1])
+
+    # Decrypt first: the compressed stream is what was encrypted.
+    if encrypted:
+        ctx = topazCryptoInit(bookKey)
+        record = topazCryptoDecrypt(record,ctx)
+
+    if compressed:
+        record = zlib.decompress(record)
+
+    return record
+
+#
+# Extract, decrypt and decompress a book record indicated by name and index and print it or save it in "filename"
+#
+
+def extractBookPayloadRecord(name, index, filename):
+    """Fetch one decoded payload record and print it or save it.
+
+    name, index -- identify the record (see getBookPayloadRecord)
+    filename    -- destination file; "" prints the record instead
+
+    When the record cannot be fetched, "Could not find record" is printed
+    and nothing else happens.  Raises CMBDTCFatal when the destination
+    file cannot be written.
+    """
+    try:
+        record = getBookPayloadRecord(name,index)
+    except Exception:
+        print("Could not find record")
+        # Stop here: there is no record to print or write.  Falling
+        # through would fail on the undefined name "record".
+        return
+
+    if filename == "":
+        print(record)
+        return
+
+    try:
+        out = open(filename,"wb")
+        try:
+            out.write(record)
+        finally:
+            # Close the handle even when the write fails.
+            out.close()
+    except (IOError, OSError):
+        raise CMBDTCFatal("Could not write to destination file")
+    
+#
+# return next record [key,value] from the book metadata from the current book position
+#  
+
+def readMetadataRecord():
+    """Read the next [key, value] string pair at the current file position."""
+    key = bookReadString()
+    value = bookReadString()
+    return [key, value]
+    
+#
+# Parse the metadata record from the book payload and return a list of [key,values]
+#
+
+def parseMetadata():
+    """Load the book's "metadata" record into the global bookMetadata dict.
+
+    Seeks to the first "metadata" entry listed in the header, checks the
+    record tag, skips one flags byte, then reads a one-byte count followed
+    by that many key/value string pairs.
+    Raises CMBDTCFatal when the record tag is not "metadata".
+    """
+    global bookMetadata
+    bookMetadata = {}
+
+    bookFile.seek(bookPayloadOffset + bookHeaderRecords["metadata"][0][0])
+    if bookReadString() != "metadata" :
+        raise CMBDTCFatal("Parse Error : Record Names Don't Match")
+
+    # The flags byte is not used, but it must still be consumed.
+    _flags = ord(bookFile.read(1))
+    pairCount = ord(bookFile.read(1))
+
+    for _ in range(pairCount):
+        key, value = readMetadataRecord()
+        bookMetadata[key] = value
+
+#
+# Context initialisation for the Topaz Crypto
+#
+
+def topazCryptoInit(key):
+    """Build the two-word cipher context for *key*.
+
+    key -- non-empty string; every character is mixed into the state.
+
+    Returns [ctx1, ctx2], where ctx2 is the value ctx1 had before the last
+    key character was mixed in.  Raises ValueError for an empty key, which
+    would otherwise leave ctx2 undefined.
+    """
+    if not key:
+        raise ValueError("Topaz crypto key must not be empty")
+
+    ctx1 = 0x0CAFFE19E
+
+    for keyChar in key:
+        keyByte = ord(keyChar)
+        ctx2 = ctx1
+        # Both halves are truncated to 32 bits before they are combined.
+        ctx1 = (((ctx1 >> 2) * (ctx1 >> 7)) & 0xFFFFFFFF) ^ ((keyByte * keyByte * 0x0F902007) & 0xFFFFFFFF)
+    return [ctx1,ctx2]
+    
+#
+# decrypt data with the context prepared by topazCryptoInit()
+#
+    
+def topazCryptoDecrypt(data, ctx):
+    """Decrypt *data* with a context prepared by topazCryptoInit().
+
+    ctx is [ctx1, ctx2]; it is read but not modified.  Each plaintext byte
+    is fed back into the running state, so bytes must be processed in order.
+    Returns the decrypted string.
+    """
+    state, previous = ctx[0], ctx[1]
+    decoded = []
+
+    for cipherChar in data:
+        plainByte = (ord(cipherChar) ^ ((state >> 3) & 0xFF) ^ ((previous << 3) & 0xFF)) & 0xFF
+        previous = state
+        state = (((state >> 2) * (state >> 7)) & 0xFFFFFFFF) ^ ((plainByte * plainByte * 0x0F902007) & 0xFFFFFFFF)
+        decoded.append(chr(plainByte))
+
+    return "".join(decoded)
+
+#
+# Decrypt a payload record with the PID
+#
+
+def decryptRecord(data,PID):
+    """Decrypt *data* using *PID* as the cipher key."""
+    return topazCryptoDecrypt(data, topazCryptoInit(PID))
+
+#
+# Try to decrypt a dkey record (contains the book PID)
+#
+
+def decryptDkeyRecord(data,PID):
+    """Decrypt one dkey record with *PID* and return its 8-byte payload.
+
+    The decrypted record is laid out as "PID", a length byte, 8 bytes that
+    must equal *PID*, a second length byte, the 8-byte payload, and "pid".
+    main() uses the returned payload as the book key.
+    Raises CMBDTCError when any of these checks fails.
+    """
+    plain = decryptRecord(data,PID)
+    head, pidLength, recordPID, keyLength, payload, tail = unpack("3sB8sB8s3s", plain)
+
+    if not (head == "PID" and tail == "pid"):
+        raise CMBDTCError("Didn't find PID magic numbers in record")
+    if not (pidLength == 8 and keyLength == 8):
+        raise CMBDTCError("Record didn't contain correct length fields")
+    if recordPID != PID:
+        raise CMBDTCError("Record didn't contain PID")
+
+    return payload
+    
+#
+# Decrypt all the book's dkey records (contain the book PID)
+#
+  
+def decryptDkeyRecords(data,PID):
+    """Try *PID* against every key record in the dkey payload *data*.
+
+    data starts with a one-byte record count; each record is a one-byte
+    length followed by that many bytes.  Returns the list of payloads from
+    the records that validated; records that fail validation are skipped.
+    """
+    recordCount = ord(data[0])
+    found = []
+    cursor = 1  # position of the next record's length byte
+    for _ in range(recordCount):
+        length = ord(data[cursor])
+        body = data[cursor + 1:cursor + 1 + length]
+        try:
+            found.append(decryptDkeyRecord(body,PID))
+        except CMBDTCError:
+            pass
+        cursor += 1 + length
+
+    return found
+    
+#
+# Create decrypted book payload
+#
+
+def createDecryptedPayload(payload):
+    """Write every payload record except "dkey" as a file below *payload*.
+
+    Files are named <name><4-digit index><ext>; 'img' records get '.jpg',
+    everything else '.dat'.  'img', 'page' and 'glyphs' records go into a
+    sub-directory of the same name, which must already exist.
+    """
+    for name in bookHeaderRecords:
+        if name == "dkey" :
+            continue
+
+        if name == 'img':
+            ext = '.jpg'
+        else:
+            ext = '.dat'
+
+        if name in ('img', 'page', 'glyphs'):
+            destdir = os.path.join(payload, name)
+        else:
+            destdir = payload
+
+        for index in range(len(bookHeaderRecords[name])):
+            outputFile = os.path.join(destdir, "%s%04d%s" % (name, index, ext))
+            # Read the record before creating the file, so that a failed
+            # read does not leave an empty file behind.
+            record = getBookPayloadRecord(name, index)
+            out = open(outputFile, 'wb')
+            try:
+                out.write(record)
+            finally:
+                # Close explicitly rather than relying on garbage collection.
+                out.close()
+                   
+
+# Create decrypted book
+#
+
+def createDecryptedBook(outdir):
+    """Create *outdir* with its img/page/glyphs sub-directories and dump the book into it."""
+    def ensureDirectory(directory):
+        # Create the directory (and any missing parents) unless it exists.
+        if not os.path.exists(directory):
+            os.makedirs(directory)
+
+    ensureDirectory(outdir)
+    for subdir in ('img', 'page', 'glyphs'):
+        ensureDirectory(os.path.join(outdir, subdir))
+
+    createDecryptedPayload(outdir)
+
+
+#
+# Set the command to be executed by the program according to the command line parameters
+#
+
+def setCommand(name) :
+    """Store *name* in the global command.
+
+    Raises CMBDTCFatal when a command has already been set.
+    """
+    global command
+    if command == "" :
+        command = name
+        return
+    raise CMBDTCFatal("Invalid command line parameters")
+
+# 
+# Program usage
+#
+   
+def usage():
+    """Print the command line help."""
+    print("\nUsage:")
+    # The script is shipped as cmbtc_dump_mac_linux.py.
+    print("\ncmbtc_dump_mac_linux.py [options] bookFileName\n")
+    print("-p Adds a PID to the list of PIDs that are tried to decrypt the book key (can be used several times)")
+    print("-d Dumps the unencrypted book as files to outdir")
+    print("-o Output directory to save book files to")
+    print("-v Verbose (can be used several times)")
+
+ 
+#
+# Main
+#   
+
+def main(argv=sys.argv):
+    """Command line entry point.
+
+    argv -- full argument vector including the program name.
+
+    Options: -v (verbose, repeatable), -o outdir, -p PID (repeatable),
+    -d (dump the decrypted book to outdir).  Returns 0; exits with status 2
+    on bad options or an empty command line.  Raises CMBDTCFatal when no
+    action was requested or the book cannot be opened or parsed.
+    """
+    global bookKey
+    global bookFile
+    global command
+
+    verbose = 0
+    outdir = ""
+    command = ""
+
+    # The PID configured at the top of the file is always tried first.
+    PIDs = [kindlePID]
+
+    try:
+        # Parse the argv that was passed in, not sys.argv, so that the
+        # parameter is honoured.
+        opts, args = getopt.getopt(argv[1:], "vo:p:d")
+    except getopt.GetoptError, err:
+        # print help information and exit:
+        print str(err) # will print something like "option -a not recognized"
+        usage()
+        sys.exit(2)
+
+    if len(opts) == 0 and len(args) == 0 :
+        usage()
+        sys.exit(2)
+
+    for o, a in opts:
+        if o == "-v":
+            verbose += 1
+        elif o == "-o":
+            if a is None :
+                raise CMBDTCFatal("Invalid parameter for -o")
+            outdir = a
+        elif o == "-p":
+            PIDs.append(a)
+        elif o == "-d":
+            setCommand("doit")
+
+    if command == "" :
+        raise CMBDTCFatal("No action supplied on command line")
+
+    # Nothing is done unless exactly one book file was named.
+    if len(args) != 1:
+        return 0
+
+    #
+    # Open book and parse metadata
+    #
+    bookFile = openBook(args[0])
+    try:
+        parseTopazHeader()
+        parseMetadata()
+
+        #
+        #  Decrypt book key
+        #
+        dkey = getBookPayloadRecord('dkey', 0)
+
+        bookKeys = []
+        for PID in PIDs :
+            bookKeys += decryptDkeyRecords(dkey,PID)
+
+        if len(bookKeys) == 0 :
+            if verbose > 0 :
+                print ("Book key could not be found. Maybe this book is not registered with this device.")
+            return 0
+
+        bookKey = bookKeys[0]
+        if verbose > 0:
+            print("Book key: " + bookKey.encode('hex'))
+
+        if command == "doit" :
+            if outdir != "" :
+                createDecryptedBook(outdir)
+                if verbose > 0 :
+                    print ("Decrypted book saved. Don't pirate!")
+            elif verbose > 0:
+                print("Output directory name was not supplied.")
+    finally:
+        # Release the book file on every path, including parse errors.
+        bookFile.close()
+
+    return 0
+
+if __name__ == '__main__':
+    sys.exit(main())
--- a/Topaz_Tools/lib/flatxml2html.py
+++ b/Topaz_Tools/lib/flatxml2html.py
@ -13,7 +13,8 @@ from struct import unpack
 class DocParser(object):
    def __init__(self, flatxml, classlst, fileid):
        self.id = os.path.basename(fileid).replace('.dat','')
-        self.flatdoc = flatxml.split('\n')
+        self.docList = flatxml.split('\n')
+        self.docSize = len(self.docList)
        self.classList = {}
        tmpList = classlst.split('\n')
        for pclass in tmpList:
@ -29,12 +30,10 @@ class DocParser(object):
        self.paracont_stemid = []
        self.parastems_stemid = []

-    # find tag if within pos to end inclusive
+    # return tag at line pos in document
    def lineinDoc(self, pos) :
-        docList = self.flatdoc
-        cnt = len(docList)
-        if (pos >= 0) and (pos < cnt) :
-            item = docList[pos]
+        if (pos >= 0) and (pos < self.docSize) :
+            item = self.docList[pos]
            if item.find('=') >= 0:
                (name, argres) = item.split('=',1)
            else : 
@ -43,20 +42,18 @@ class DocParser(object):
        return name, argres

        
-    # find tag if within pos to end inclusive
+    # find tag in doc if within pos to end inclusive
    def findinDoc(self, tagpath, pos, end) :
        result = None
-        docList = self.flatdoc
-        cnt = len(docList)
        if end == -1 :
-            end = cnt
+            end = self.docSize
        else:
-            end = min(cnt,end)
+            end = min(self.docSize, end)
        foundat = -1
        for j in xrange(pos, end):
-            item = docList[j]
+            item = self.docList[j]
            if item.find('=') >= 0:
-                (name, argres) = item.split('=')
+                (name, argres) = item.split('=',1)
            else : 
                name = item
                argres = ''
@ -85,7 +82,7 @@ class DocParser(object):

        result = []

-        # normal paragraph
+        # paragraph
        (pos, pclass) = self.findinDoc('paragraph.class',start,end) 

        # class names are an issue given topaz may start them with numerals (not allowed),
@ -94,19 +91,20 @@ class DocParser(object):
        # from a base class (but then not actually provide all of these _reclustereed 
        # classes in the stylesheet!

-        # so we clean this up by lowercasing, prepend 'cl_', and getting any baseclass
+        # so we clean this up by lowercasing, prepend 'cl-', and getting any baseclass
        # that exists in the stylesheet first, and then adding this specific class
        # after
-        classres = ''
-        pclass = pclass.lower()
-        pclass = 'cl-' + pclass
-        p = pclass.find('_')
-        if p > 0 :
-            baseclass = pclass[0:p]
-            if baseclass in self.classList:
-                classres += baseclass + ' '
-        classres += pclass
-        pclass = classres
+        if pclass != None :
+            classres = ''
+            pclass = pclass.lower()
+            pclass = 'cl-' + pclass
+            p = pclass.find('_')
+            if p > 0 :
+                baseclass = pclass[0:p]
+                if baseclass in self.classList:
+                    classres += baseclass + ' '
+            classres += pclass
+            pclass = classres

        # build up a description of the paragraph in result and return it
        # first check for the  basic - all words paragraph
@ -128,9 +126,7 @@ class DocParser(object):

        # if end is -1 then we must search to end of document
        if end == -1 :
-            docList = self.flatdoc
-            cnt = len(docList)
-            end = cnt
+            end = self.docSize

        while (line < end) :

@ -171,20 +167,20 @@ class DocParser(object):
        return pclass, result
                            

-    def buildParagraph(self, cname, pdesc, type, regtype) :
+    def buildParagraph(self, pclass, pdesc, type, regtype) :
        parares = ''
        sep =''

-        br_lb = False
-        if (regtype == 'fixed') or (regtype == 'chapterheading'):
-            br_lb = True
+        classres = ''
+        if pclass :
+            classres = ' class="' + pclass + '"'

-        handle_links = False
-        if len(self.link_id) > 0:
-            handle_links = True
+        br_lb = (regtype == 'fixed') or (regtype == 'chapterheading')

+        handle_links = len(self.link_id) > 0
+        
        if (type == 'full') or (type == 'begin') :
-            parares += '<p class="' + cname + '">'
+            parares += '<p' + classres + '>'

        if (type == 'end'):
            parares += ' '
@ -218,10 +214,7 @@ class DocParser(object):
                        if word == '_link_' : word = ''

                if word == '_lb_':
-                    if (num-1) in self.dehyphen_rootid :
-                        word = ''
-                        sep = ''
-                    elif handle_links :
+                    if ((num-1) in self.dehyphen_rootid ) or handle_links:
                        word = ''
                        sep = ''
                    elif br_lb :
@ -261,43 +254,51 @@ class DocParser(object):

        htmlpage = ''

-        # first collect information from the xml doc that describes this page
+        # get the ocr text
        (pos, argres) = self.findinDoc('info.word.ocrText',0,-1)
        if argres :  self.ocrtext = argres.split('|')

+        # get information to dehyphenate the text
        (pos, argres) = self.findinDoc('info.dehyphen.rootID',0,-1)
        if argres: 
            argList = argres.split('|')
            self.dehyphen_rootid = [ int(strval) for strval in argList]

+        # determine if first paragraph is continued from previous page
        (pos, self.parastems_stemid) = self.findinDoc('info.paraStems.stemID',0,-1)
-        if self.parastems_stemid == None : self.parastems_stemid = []
- 
+        first_para_continued = (self.parastems_stemid  != None) 
+        
+        # determine if last paragraph is continued onto the next page
        (pos, self.paracont_stemid) = self.findinDoc('info.paraCont.stemID',0,-1)
-        if self.paracont_stemid == None : self.paracont_stemid = []
-
+        last_para_continued = (self.paracont_stemid != None)

+        # collect link ids
        (pos, argres) = self.findinDoc('info.word.link_id',0,-1)
        if argres:
            argList = argres.split('|')
            self.link_id = [ int(strval) for strval in argList]

+        # collect link destination page numbers
        (pos, argres) = self.findinDoc('info.links.page',0,-1)
        if argres :
            argList = argres.split('|')
            self.link_page = [ int(strval) for strval in argList]

+        # collect link titles
        (pos, argres) = self.findinDoc('info.links.title',0,-1)
        if argres :
            self.link_title = argres.split('|')
        else:
            self.link_title.append('')

+
+        # get page type
        (pos, pagetype) = self.findinDoc('page.type',0,-1)


        # generate a list of each region starting point
        # each region has one paragraph,, or one image, or one chapterheading
+
        regionList= self.posinDoc('region')
        regcnt = len(regionList)
        regionList.append(-1)
@ -308,47 +309,48 @@ class DocParser(object):
        # process each region tag and convert what you can to html

        for j in xrange(regcnt):
+
            start = regionList[j]
            end = regionList[j+1]

            (pos, regtype) = self.findinDoc('region.type',start,end)

+            # set anchor for link target on this page
+            if not anchorSet and not first_para_continued:
+                htmlpage += '<div id="' + self.id + '" class="page_' + pagetype + '">&nbsp</div>\n'
+                anchorSet = True
+
            if regtype == 'graphic' :
-                if not anchorSet:
-                    htmlpage += '<div id="' + self.id + '" class="page_' + pagetype + '">&nbsp</div>\n'
-                    anchorSet = True
                (pos, simgsrc) = self.findinDoc('img.src',start,end)
                if simgsrc:
                    htmlpage += '<div class="graphic"><img src="img/img%04d.jpg" alt="" /></div>' % int(simgsrc)
+
            
            elif regtype == 'chapterheading' :
                (pclass, pdesc) = self.getParaDescription(start,end)
                if not breakSet:
                    htmlpage += '<div style="page-break-after: always;">&nbsp;</div>\n'
                    breakSet = True
-                if not anchorSet:
-                    htmlpage += '<div id="' + self.id + '" class="page_' + pagetype + '">&nbsp</div>\n'
-                    anchorSet = True
                tag = 'h1'
-                if pclass[3:7] == 'ch1-' : tag = 'h1'
-                if pclass[3:7] == 'ch2-' : tag = 'h2'
-                if pclass[3:7] == 'ch3-' : tag = 'h3'
-                htmlpage += '<' + tag + ' class="' + pclass + '">'
+                if pclass and (len(pclass) >= 7):
+                    if pclass[3:7] == 'ch1-' : tag = 'h1'
+                    if pclass[3:7] == 'ch2-' : tag = 'h2'
+                    if pclass[3:7] == 'ch3-' : tag = 'h3'
+                    htmlpage += '<' + tag + ' class="' + pclass + '">'
+                else:
+                    htmlpage += '<' + tag + '>'
                htmlpage += self.buildParagraph(pclass, pdesc, 'middle', regtype)
                htmlpage += '</' + tag + '>'

+
            elif (regtype == 'text') or (regtype == 'fixed') or (regtype == 'insert') or (regtype == 'listitem') :
                ptype = 'full'
                # check to see if this is a continution from the previous page
-                if (len(self.parastems_stemid) > 0):
+                if first_para_continued :
                    ptype = 'end'
-                    self.parastems_stemid=[]
-                else:
-                    if not anchorSet:
-                        htmlpage += '<div id="' + self.id + '" class="page_' + pagetype + '">&nbsp</div>\n'
-                        anchorSet = True
+                    first_para_continued = False
                (pclass, pdesc) = self.getParaDescription(start,end)
-                if ptype == 'full' :
+                if pclass and (len(pclass) >= 6) and (ptype == 'full'):
                    tag = 'p'
                    if pclass[3:6] == 'h1-' : tag = 'h4'
                    if pclass[3:6] == 'h2-' : tag = 'h5'
@ -359,28 +361,22 @@ class DocParser(object):
                else :
                    htmlpage += self.buildParagraph(pclass, pdesc, ptype, regtype)

+
            elif (regtype == 'tocentry') :
                ptype = 'full'
-                # check to see if this is a continution from the previous page
-                if (len(self.parastems_stemid) > 0) and (j == 0):
-                    # process the first paragraph as a continuation from the last page
+                if first_para_continued :
                    ptype = 'end'
-                    self.parastems_stemid = []
-                else:
-                    if not anchorSet:
-                        htmlpage += '<div id="' + self.id + '" class="page_' + pagetype + '">&nbsp</div>\n'
-                        anchorSet = True
+                    first_para_continued = False
                (pclass, pdesc) = self.getParaDescription(start,end)
                htmlpage += self.buildParagraph(pclass, pdesc, ptype, regtype)

+
            elif (regtype == 'synth_fcvr.center') or (regtype == 'synth_text.center'):
-                if not anchorSet:
-                    htmlpage += '<div id="' + self.id + '" class="page_' + pagetype + '">&nbsp</div>\n'
-                    anchorSet = True
                (pos, simgsrc) = self.findinDoc('img.src',start,end)
                if simgsrc:
                    htmlpage += '<div class="graphic"><img src="img/img%04d.jpg" alt="" /></div>' % int(simgsrc)

+
            else :
                print 'Warning: Unknown region type', regtype
                (pos, temp) = self.findinDoc('paragraph',start,end)
@ -389,15 +385,11 @@ class DocParser(object):
                    regtype = 'fixed'
                    ptype = 'full'
                    # check to see if this is a continution from the previous page
-                    if (len(self.parastems_stemid) > 0):
+                    if first_para_continued :
                        ptype = 'end'
-                        self.parastems_stemid=[]
-                    else:
-                        if not anchorSet:
-                            htmlpage += '<div id="' + self.id + '" class="page_' + pagetype + '">&nbsp</div>\n'
-                            anchorSet = True
+                        first_para_continued = False
                    (pclass, pdesc) = self.getParaDescription(start,end)
-                    if ptype == 'full' :
+                    if pclass and (ptype == 'full') and (len(pclass) >= 6):
                        tag = 'p'
                        if pclass[3:6] == 'h1-' : tag = 'h4'
                        if pclass[3:6] == 'h2-' : tag = 'h5'
@ -408,24 +400,20 @@ class DocParser(object):
                    else :
                        htmlpage += self.buildParagraph(pclass, pdesc, ptype, regtype)
                else :
-                    print 'Treating this like a "image" region'
-                    if not anchorSet:
-                        htmlpage += '<div id="' + self.id + '" class="page_' + pagetype + '">&nbsp</div>\n'
-                        anchorSet = True
+                    print 'Treating this like a "graphic" region'
                    (pos, simgsrc) = self.findinDoc('img.src',start,end)
                    if simgsrc:
                        htmlpage += '<div class="graphic"><img src="img/img%04d.jpg" alt="" /></div>' % int(simgsrc)

-        if len(self.paracont_stemid) > 0 :
+
+        if last_para_continued :
            if htmlpage[-4:] == '</p>':
-                htmlpage = htmlpage[0:-4]    
+                htmlpage = htmlpage[0:-4]
+            last_para_continued = False

        return htmlpage


-        return self.convert2HTML()
-
-

 def convert2HTML(flatxml, classlst, fileid):

--- a/Topaz_Tools/lib/readme.txt
+++ b/Topaz_Tools/lib/readme.txt
@ -3,7 +3,7 @@ Contributors:
     clarknova - for all of the svg and glyph generation and many other bug fixes and improvements
     skindle - for figuing out the general case for the mode loops
     some updates -  for conversion to xml, basic html
-     DiapDealer - for extensive testing and feedback
+     DiapDealer - for extensive testing and feedback, and standalone linux/macosx version of cmbtc_dump
     stewball - for extensive testing and feedback

 and others for posting, feedback and testing
@ -29,6 +29,17 @@ genxml.py - main program to convert everything to xml
 genhtml.py - main program to generate "book.html"
 gensvg.py - (author: clarknova) main program to create an svg grpahic of each page

+
+In addition there is now a new file:
+
+cmbtc_dump_mac_linux.py  
+
+If you know the PID of your iPod and/or your standalone Kindle, and your book
+was meant for that device, you can use this program to dump the proper sections
+on Mac OSX and Linux (and even on Windows if you do not have Kindle4PC installed).
+Thanks to DiapDealer for creating it!
+
+
 Please note, gensvg.py, genhtml.py, and genxml.py import and use
 decode_meta.py, convert2xml.py, flatxml2html.py, getpagedim.py and stylexml2css.py 
 so please keep all of these python scripts together in the same place.