tools v1.5

2010-03-02 12:46:56 +00:00 · 2010-03-02 12:46:56 +00:00 · 8e7d2657a4
parent 6fb13373cf
commit 8e7d2657a4
12 changed files with 98 additions and 43 deletions
--- a/Topaz_Tools/lib/cmbtc_dump.py
+++ b/Topaz_Tools/lib/cmbtc_dump.py
@@ -1,5 +1,5 @@
 #! /usr/bin/python
-# For use in Topaz Scripts version 2.3
+# For use in Topaz Scripts version 2.6

 """

--- a/Topaz_Tools/lib/cmbtc_dump_nonK4PC.py
+++ b/Topaz_Tools/lib/cmbtc_dump_nonK4PC.py
@@ -1,5 +1,5 @@
 #!/usr/bin/python
-# For use with Topaz Scripts Version 2.3
+# For use with Topaz Scripts Version 2.6

 class Unbuffered:
    def __init__(self, stream):
--- a/Topaz_Tools/lib/convert2xml.py
+++ b/Topaz_Tools/lib/convert2xml.py
@@ -1,6 +1,6 @@
 #! /usr/bin/python
 # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
-# For use with Topaz Scripts Version 2.4
+# For use with Topaz Scripts Version 2.6

 class Unbuffered:
    def __init__(self, stream):
@@ -315,6 +315,12 @@ class PageParser(object):
        'version.findlists'                : (1, 'scalar_text', 0, 0),
        'version.page_num'                 : (1, 'scalar_text', 0, 0),
        'version.page_type'                : (1, 'scalar_text', 0, 0),
+        'version.bad_text'                 : (1, 'scalar_text', 0, 0),
+        'version.glyph_mismatch'           : (1, 'scalar_text', 0, 0),
+        'version.margins'                  : (1, 'scalar_text', 0, 0),
+        'version.staggered_lines'          : (1, 'scalar_text', 0, 0),
+        'version.paragraph_continuation'   : (1, 'scalar_text', 0, 0),
+        'version.toc'                      : (1, 'scalar_text', 0, 0),

        'stylesheet'   : (1, 'snippets', 1, 0),
        'style'              : (1, 'snippets', 1, 0),
@@ -662,16 +668,19 @@ class PageParser(object):
    def process(self):

        # peek at the first bytes to see what type of file it is
-        magic = self.fo.read(11)
-        if (magic[0:1] == 'p') and (magic[2:10] == '__PAGE__'):
+        magic = self.fo.read(9)
+        if (magic[0:1] == 'p') and (magic[2:9] == 'marker_'):
            first_token = 'info'
-        elif (magic[0:1] == 'g') and (magic[2:11] == '__GLYPH__'):
-            skip = self.fo.read(1)
+        elif (magic[0:1] == 'p') and (magic[2:9] == '__PAGE_'):
+            skip = self.fo.read(2)
+            first_token = 'info'
+        elif (magic[0:1] == 'g') and (magic[2:9] == '__GLYPH'):
+            skip = self.fo.read(3)
            first_token = 'info'
        else :
            # other0.dat file
            first_token = None
-            self.fo.seek(-11,1)
+            self.fo.seek(-9,1)


        # main loop to read and build the document tree
--- a/Topaz_Tools/lib/decode_meta.py
+++ b/Topaz_Tools/lib/decode_meta.py
@@ -1,6 +1,6 @@
 #! /usr/bin/python
 # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
-# For use with Topaz Scripts Version 2.3
+# For use with Topaz Scripts Version 2.6

 import csv
 import sys
--- a/Topaz_Tools/lib/flatxml2html.py
+++ b/Topaz_Tools/lib/flatxml2html.py
@@ -1,6 +1,6 @@
 #! /usr/bin/python
 # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
-# For use with Topaz Scripts Version 2.3
+# For use with Topaz Scripts Version 2.6

 import sys
 import csv
@@ -32,6 +32,8 @@ class DocParser(object):
        self.link_id = []
        self.link_title = []
        self.link_page = []
+        self.link_href = []
+        self.link_type = []
        self.dehyphen_rootid = []
        self.paracont_stemid = []
        self.parastems_stemid = []
@@ -197,6 +199,7 @@ class DocParser(object):
    # get the class
    def getClass(self, pclass):
        nclass = pclass
+
        # class names are an issue given topaz may start them with numerals (not allowed),
        # use a mix of cases (which cause some browsers problems), and actually
        # attach numbers after "_reclustered*" to the end to deal classeses that inherit
@@ -206,7 +209,10 @@ class DocParser(object):
        # so we clean this up by lowercasing, prepend 'cl-', and getting any baseclass
        # that exists in the stylesheet first, and then adding this specific class
        # after
+        
+        # also some class names have spaces in them so need to convert to dashes
        if nclass != None :
+            nclass = nclass.replace(' ','-')
            classres = ''
            nclass = nclass.lower()
            nclass = 'cl-' + nclass
@@ -334,7 +340,7 @@ class DocParser(object):
            result.append(('svg', num))
            return pclass, result

-        # this type of paragrph may be made up of multiple spans, inline 
+        # this type of paragraph may be made up of multiple spans, inline 
        # word monograms (images), and words with semantic meaning, 
        # plus glyphs used to form starting letter of first word
        
@@ -391,6 +397,9 @@ class DocParser(object):
                result.append(('img' + word_class, int(argres)))
                word_class = ''

+            elif name.endswith('region.img.src'):
+                result.append(('img' + word_class, int(argres)))
+
            if (sp_first != -1) and (sp_last != -1):
                for wordnum in xrange(sp_first, sp_last):
                    result.append(('ocr', wordnum))
@@ -437,6 +446,8 @@ class DocParser(object):
        if (type == 'end'):
            parares += ' '

+        lstart = len(parares)
+
        cnt = len(pdesc)

        for j in xrange( 0, cnt) :
@@ -449,18 +460,24 @@ class DocParser(object):

                if handle_links:
                    link = self.link_id[num]
-                    if (link > 0): 
+                    if (link > 0):
+                        linktype = self.link_type[link-1]
                        title = self.link_title[link-1]
-                        if (title == "") or (parares.rfind(title) < 0): 
-                            title='_link_'
-                        ptarget = self.link_page[link-1] - 1
-                        linkhtml = '<a href="#page%04d">' % ptarget
+                        if (title == "") or (parares.rfind(title) < 0):
+                            title=parares[lstart:]
+                        if linktype == 'external' :
+                            linkhref = self.link_href[link-1]
+                            linkhtml = '<a href="%s">' % linkhref
+                        else :
+                            ptarget = self.link_page[link-1] - 1
+                            linkhtml = '<a href="#page%04d">' % ptarget
                        linkhtml += title + '</a>'
                        pos = parares.rfind(title)
                        if pos >= 0:
                            parares = parares[0:pos] + linkhtml + parares[pos+len(title):]
                        else :
                            parares += linkhtml
+                        lstart = len(parares)
                        if word == '_link_' : word = ''
                    elif (link < 0) :
                        if word == '_link_' : word = ''
@@ -532,6 +549,14 @@ class DocParser(object):
        # collect link destination page numbers
        self.link_page = self.getData('info.links.page',0,-1)

+        # collect link types (container versus external)
+        (pos, argres) = self.findinDoc('info.links.type',0,-1)
+        if argres :  self.link_type = argres.split('|')
+
+        # collect link destinations
+        (pos, argres) = self.findinDoc('info.links.href',0,-1)
+        if argres :  self.link_href = argres.split('|')
+
        # collect link titles
        (pos, argres) = self.findinDoc('info.links.title',0,-1)
        if argres :
@@ -641,16 +666,18 @@ class DocParser(object):
                    htmlpage += self.buildParagraph(pclass, pdesc, ptype, regtype)


-                elif (regtype == 'synth_fcvr.center') or (regtype == 'synth_text.center'):
+                elif (regtype == 'synth_fcvr.center'):
                    (pos, simgsrc) = self.findinDoc('img.src',start,end)
                    if simgsrc:
                        htmlpage += '<div class="graphic"><img src="img/img%04d.jpg" alt="" /></div>' % int(simgsrc)

                else :
-                    print 'Warning: region type', regtype
+                    print '          Making region type', regtype,
                    (pos, temp) = self.findinDoc('paragraph',start,end)
-                    if pos != -1:
-                        print '   is a "text" region'
+                    (pos2, temp) = self.findinDoc('span',start,end)
+                    if pos != -1 or pos2 != -1:
+                        print ' a "text" region'
+                        orig_regtype = regtype
                        regtype = 'fixed'
                        ptype = 'full'
                        # check to see if this is a continution from the previous page
@@ -658,6 +685,11 @@ class DocParser(object):
                            ptype = 'end'
                            first_para_continued = False
                        (pclass, pdesc) = self.getParaDescription(start,end, regtype)
+                        if not pclass:
+                            if orig_regtype.endswith('.right')     : pclass = 'cl-right'
+                            elif orig_regtype.endswith('.center')  : pclass = 'cl-center'
+                            elif orig_regtype.endswith('.left')    : pclass = 'cl-left'
+                            elif orig_regtype.endswith('.justify') : pclass = 'cl-justify'
                        if pclass and (ptype == 'full') and (len(pclass) >= 6):
                            tag = 'p'
                            if pclass[3:6] == 'h1-' : tag = 'h4'
@@ -669,7 +701,7 @@ class DocParser(object):
                        else :
                            htmlpage += self.buildParagraph(pclass, pdesc, ptype, regtype)
                    else :
-                        print '    is a "graphic" region'
+                        print ' a "graphic" region'
                        (pos, simgsrc) = self.findinDoc('img.src',start,end)
                        if simgsrc:
                            htmlpage += '<div class="graphic"><img src="img/img%04d.jpg" alt="" /></div>' % int(simgsrc)
--- a/Topaz_Tools/lib/genhtml.py
+++ b/Topaz_Tools/lib/genhtml.py
@@ -1,6 +1,6 @@
 #! /usr/bin/python
 # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
-# For use with Topaz Scripts Version 2.3
+# For use with Topaz Scripts Version 2.6

 class Unbuffered:
    def __init__(self, stream):
--- a/Topaz_Tools/lib/gensvg.py
+++ b/Topaz_Tools/lib/gensvg.py
@@ -1,6 +1,6 @@
 #! /usr/bin/python
 # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
-# For use with Topaz Scripts Version 2.3
+# For use with Topaz Scripts Version 2.6

 class Unbuffered:
    def __init__(self, stream):
--- a/Topaz_Tools/lib/genxml.py
+++ b/Topaz_Tools/lib/genxml.py
@@ -1,6 +1,6 @@
 #! /usr/bin/python
 # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
-# For use with Topaz Scripts Version 2.3
+# For use with Topaz Scripts Version 2.6

 class Unbuffered:
    def __init__(self, stream):
--- a/Topaz_Tools/lib/getpagedim.py
+++ b/Topaz_Tools/lib/getpagedim.py
@@ -1,6 +1,6 @@
 #! /usr/bin/python
 # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
-# For use with Topaz Scripts Version 2.3
+# For use with Topaz Scripts Version 2.6

 import csv
 import sys
--- a/Topaz_Tools/lib/stylexml2css.py
+++ b/Topaz_Tools/lib/stylexml2css.py
@@ -1,6 +1,6 @@
 #! /usr/bin/python
 # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
-# For use with Topaz Scripts Version 2.3
+# For use with Topaz Scripts Version 2.6

 import csv
 import sys
@@ -85,7 +85,10 @@ class DocParser(object):
    def process(self):

        classlst = ''
-        csspage = ''
+        csspage = '.cl-center { text-align: center; margin-left: auto; margin-right: auto; }\n'
+        csspage += '.cl-right { text-align: right; }\n'
+        csspage += '.cl-left { text-align: left; }\n'
+        csspage += '.cl-justify { text-align: justify; }\n'

        # generate a list of each <style> starting point in the stylesheet
        styleList= self.posinDoc('book.stylesheet.style')
@@ -108,6 +111,7 @@ class DocParser(object):
                # get the style class
                (pos, sclass) = self.findinDoc('style.class',start,end)
                if sclass != None:
+                    sclass = sclass.replace(' ','-')
                    sclass = '.cl-' + sclass.lower()
                else : 
                    sclass = ''
@@ -115,6 +119,7 @@ class DocParser(object):
                # check for any "after class" specifiers
                (pos, aftclass) = self.findinDoc('style._after_class',start,end)
                if aftclass != None:
+                    aftclass = aftclass.replace(' ','-')
                    aftclass = '.cl-' + aftclass.lower()
                else : 
                    aftclass = ''
@@ -216,7 +221,8 @@ class DocParser(object):
                        if ctype == 'h3_' :
                            csspage += 'h6' + cssline + '\n'

-                    csspage += self.stags[tag] + cssline + '\n'
+                    if cssline != ' { }':
+                        csspage += self.stags[tag] + cssline + '\n'

                
        return csspage, classlst
--- a/Topaz_Tools/lib/topaz-changes.txt
+++ b/Topaz_Tools/lib/topaz-changes.txt
@@ -1,4 +1,14 @@
-Canges in 2.3
+Changes in 2.6
+	- fix for many additional version tags
+	- fixes to generate better links
+	- fixes to handle external links
+	- now handles new "marker" page .dat files
+	- improved special region handling
+	- properly handle class names with spaces
+	- handle default alignment for synthetic regions
+
+
+Changes in 2.3
       - fix for use with non-latin1 based systems (thank you Tedd)
       - fixes for out of order tokens in xml

--- a/eReader_Tools/lib/erdr2pml.py
+++ b/eReader_Tools/lib/erdr2pml.py
@@ -53,8 +53,9 @@
 #  0.12 - Fix added to prevent lowercasing of image names when the pml code itself uses a different case in the link name.
 #  0.13 - change to unbuffered stdout for use with gui front ends
 #  0.14 - contributed enhancement to support --make-pmlz switch
+#  0.15 - enabled high-ascii to pml character encoding. DropBook now works on Mac.

-__version__='0.14'
+__version__='0.15'

 # Import Psyco if available
 try:
@@ -465,17 +466,6 @@ class EreaderProcessor(object):
        data = sect[62:]
        return sanitizeFileName(name), data

-    def cleanPML(self,pml):
-        # Update old \b font tag with correct \B bold font tag
-        pml2 = pml.replace('\\b', '\\B')
-        # Convert special characters to proper PML code.  High ASCII start at (\x82, \a130) and go up to (\xff, \a255)
-        for k in xrange(130,256):
-            # a2b_hex takes in a hexidecimal as a string and converts it 
-            # to a binary ascii code that we search and replace for
-            badChar=binascii.a2b_hex('%02x' % k)
-            pml2 = pml2.replace(badChar, '\\a%03d' % k)
-            #end for k
-        return pml2

    # def getChapterNamePMLOffsetData(self):
    #     cv = ''
@@ -564,6 +554,14 @@ class EreaderProcessor(object):

        return r

+def cleanPML(pml):
+	# Convert special characters to proper PML code.  High ASCII start at (\x80, \a128) and go up to (\xff, \a255)
+	pml2 = pml
+	for k in xrange(128,256):
+		badChar = chr(k)
+		pml2 = pml2.replace(badChar, '\\a%03d' % k)
+	return pml2
+
 def convertEreaderToPml(infile, name, cc, outdir):
    if not os.path.exists(outdir):
        os.makedirs(outdir)
@@ -585,7 +583,7 @@ def convertEreaderToPml(infile, name, cc, outdir):
    print "   Extracting pml"
    pml_string = er.getText()
    pmlfilename = bookname + ".pml"
-    file(os.path.join(outdir, pmlfilename),'wb').write(pml_string)
+    file(os.path.join(outdir, pmlfilename),'wb').write(cleanPML(pml_string))

    # bkinfo = er.getBookInfo()
    # if bkinfo != '':
@@ -677,7 +675,7 @@ def main(argv=None):
            search_time = end_time - start_time
            print 'elapsed time: %.2f seconds' % (search_time, ) 
            if make_pmlz :
-                print 'output in %s' % zipname
+                print 'output is %s' % zipname
            else :
                print 'output in %s' % outdir 
            print "done"