topazscripts 2.0

This commit is contained in:
some_updates 2009-01-27 12:20:37 +00:00 committed by Apprentice Alf
parent 24f001c61e
commit 3b4f1fe587
12 changed files with 282 additions and 172 deletions

View File

@ -1,17 +1,29 @@
Changes in version 1.8 Changes in version 2.0
- gensvg.py now accepts two options
-x : output browseable XHTML+SVG pages (default)
-r : output raw SVG images (useful for later conversion to pdf)
- flatxml2html.py now understands page.groups of type graphic
and handles vertical regions as svg images
- genhtml.py now accepts an option
--fixed-image : which will force the conversion
of all fixed regions to svg images
- minor bug fixes and html conversion improvements
Changes in version 1.8
- gensvg.py now builds wonderful xhtml pages with embedded svg - gensvg.py now builds wonderful xhtml pages with embedded svg
that can be easily paged through as if reading a book! that can be easily paged through as if reading a book!
(tested in Safari for Mac and Win and Firefox) (tested in Safari for Mac and Win and Firefox)
(requires javascript to be enabled) (requires javascript to be enabled)
- genhtml.py now REQUIRES that gensvg.py be run FIRST - genhtml.py now REQUIRES that gensvg.py be run FIRST
this allows creation of images on the fly from glyphs this allows creation of images on the fly from glyphs
- genhtml.py now automatically makes tables of words into svg - genhtml.py now automatically makes tables of words into svg
based images and will handle glyph based ornate first based images and will handle glyph based ornate first
letters of words letters of words
- cmbtc_dump_mac_linux.py has been renamed to be - cmbtc_dump_mac_linux.py has been renamed to be
cmbtc_dump_nonK4PC.py to make it clearer cmbtc_dump_nonK4PC.py to make it clearer
when it needs to be used when it needs to be used

View File

@ -1,5 +1,5 @@
#! /usr/bin/python #! /usr/bin/python
# For use in Topaz Scripts version 1.8 # For use in Topaz Scripts version 2.0
""" """

View File

@ -1,5 +1,5 @@
#! /usr/bin/python #! /usr/bin/python
# For use with Topaz Scripts Version 1.8 # For use with Topaz Scripts Version 2.0
from __future__ import with_statement from __future__ import with_statement

View File

@ -1,6 +1,6 @@
#! /usr/bin/python #! /usr/bin/python
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
# For use with Topaz Scripts Version 1.8 # For use with Topaz Scripts Version 2.0
from __future__ import with_statement from __future__ import with_statement
import csv import csv

View File

@ -1,6 +1,6 @@
#! /usr/bin/python #! /usr/bin/python
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
# For use with Topaz Scripts Version 1.8 # For use with Topaz Scripts Version 2.0
from __future__ import with_statement from __future__ import with_statement
import csv import csv

View File

@ -1,6 +1,6 @@
#! /usr/bin/python #! /usr/bin/python
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
# For use with Topaz Scripts Version 1.8 # For use with Topaz Scripts Version 2.0
from __future__ import with_statement from __future__ import with_statement
import csv import csv
@ -13,7 +13,7 @@ from struct import unpack
class DocParser(object): class DocParser(object):
def __init__(self, flatxml, classlst, fileid, bookDir): def __init__(self, flatxml, classlst, fileid, bookDir, fixedimage):
self.id = os.path.basename(fileid).replace('.dat','') self.id = os.path.basename(fileid).replace('.dat','')
self.svgcount = 0 self.svgcount = 0
self.docList = flatxml.split('\n') self.docList = flatxml.split('\n')
@ -28,6 +28,7 @@ class DocParser(object):
# remove the leading period from the css name # remove the leading period from the css name
cname = pclass[1:] cname = pclass[1:]
self.classList[cname] = True self.classList[cname] = True
self.fixedimage = fixedimage
self.ocrtext = [] self.ocrtext = []
self.link_id = [] self.link_id = []
self.link_title = [] self.link_title = []
@ -194,15 +195,9 @@ class DocParser(object):
return argres return argres
# get the class
# build a description of the paragraph def getClass(self, pclass):
def getParaDescription(self, start, end): nclass = pclass
result = []
# paragraph
(pos, pclass) = self.findinDoc('paragraph.class',start,end)
# class names are an issue given topaz may start them with numerals (not allowed), # class names are an issue given topaz may start them with numerals (not allowed),
# use a mix of cases (which cause some browsers problems), and actually # use a mix of cases (which cause some browsers problems), and actually
# attach numbers after "_reclustered*" to the end to deal classeses that inherit # attach numbers after "_reclustered*" to the end to deal classeses that inherit
@ -212,17 +207,85 @@ class DocParser(object):
# so we clean this up by lowercasing, prepend 'cl-', and getting any baseclass # so we clean this up by lowercasing, prepend 'cl-', and getting any baseclass
# that exists in the stylesheet first, and then adding this specific class # that exists in the stylesheet first, and then adding this specific class
# after # after
if pclass != None : if nclass != None :
classres = '' classres = ''
pclass = pclass.lower() nclass = nclass.lower()
pclass = 'cl-' + pclass nclass = 'cl-' + nclass
p = pclass.find('_') baseclass = ''
# graphic is the base class for captions
if nclass.find('cl-cap-') >=0 :
classres = 'graphic' + ' '
else :
# strip to find baseclass
p = nclass.find('_')
if p > 0 : if p > 0 :
baseclass = pclass[0:p] baseclass = nclass[0:p]
if baseclass in self.classList: if baseclass in self.classList:
classres += baseclass + ' ' classres += baseclass + ' '
classres += pclass classres += nclass
pclass = classres nclass = classres
return nclass
# develop a sorted description of the starting positions of
# groups and regions on the page, as well as the page type
def PageDescription(self):
    """Describe the layout of the page as a position-ordered list of markers.

    Reads the page type with findinDoc('page.type', 0, -1) and the
    positions reported by posinDoc() for 'page.group',
    'page.group.region' and 'page.region'.

    Returns a tuple (pagetype, result) where result is a list of
    (tag, position) tuples sorted by position.  Tags used:
        'grpbeg'  - start of a page.group
        'gregion' - start of a page.group.region
        'pregion' - start of a page.region
        'grpend'  - synthetic marker closing the currently open group
        'pageend' - synthetic marker terminating the list (position -1)
    """
    (pos, pagetype) = self.findinDoc('page.type',0,-1)

    # integrate the three kinds of starting positions into one list
    result = []
    for j in self.posinDoc('page.group'):
        result.append(('grpbeg',j))
    for j in self.posinDoc('page.group.region'):
        result.append(('gregion',j))
    for j in self.posinDoc('page.region'):
        result.append(('pregion',j))

    # order by position only; a key function is used instead of a cmp
    # function so this works on both python 2 and python 3, and because
    # the sort is stable, entries at the same position keep the order
    # in which they were appended above
    result.sort(key=lambda entry: entry[1])

    # insert group end and page end indicators
    inGroup = False
    j = 0
    while j < len(result):
        (rtype, rval) = result[j]
        if inGroup and (rtype in ('grpbeg', 'pregion')):
            # a new group or a page level region closes the open group;
            # the end marker takes the position of the entry that
            # closed it.  j is deliberately not advanced: the inserted
            # marker is stepped over on the next pass and the closing
            # entry is then examined again with inGroup cleared
            result.insert(j,('grpend',rval))
            inGroup = False
            continue
        if (not inGroup) and (rtype == 'grpbeg'):
            inGroup = True
        j = j + 1

    # a group still open at the end of the page is closed with
    # position -1, the same value used for the page end marker
    if inGroup:
        result.append(('grpend',-1))
    result.append(('pageend', -1))
    return pagetype, result
# build a description of the paragraph
def getParaDescription(self, start, end, regtype):
result = []
# paragraph
(pos, pclass) = self.findinDoc('paragraph.class',start,end)
pclass = self.getClass(pclass)
# build up a description of the paragraph in result and return it # build up a description of the paragraph in result and return it
# first check for the basic - all words paragraph # first check for the basic - all words paragraph
@ -231,13 +294,49 @@ class DocParser(object):
if (sfirst != None) and (slast != None) : if (sfirst != None) and (slast != None) :
first = int(sfirst) first = int(sfirst)
last = int(slast) last = int(slast)
makeImage = (regtype == 'vertical') or (regtype == 'table')
if self.fixedimage:
makeImage = makeImage or (regtype == 'fixed')
if (pclass != None):
makeImage = makeImage or (pclass.find('.inverted') >= 0)
if self.fixedimage :
makeImage = makeImage or (pclass.find('cl-f-') >= 0)
if not makeImage :
# standard all word paragraph
for wordnum in xrange(first, last): for wordnum in xrange(first, last):
result.append(('ocr', wordnum)) result.append(('ocr', wordnum))
return pclass, result return pclass, result
# this type of paragrph may be made up of multiple _spans, inline # convert paragraph to svg image
# word monograms (images) and words with semantic meaning # translate first and last word into first and last glyphs
# and now a new type "span" versus the old "_span" # and generate inline image and include it
glyphList = []
firstglyphList = self.getData('word.firstGlyph',0,-1)
gidList = self.getData('info.glyph.glyphID',0,-1)
firstGlyph = firstglyphList[first]
if last < len(firstglyphList):
lastGlyph = firstglyphList[last]
else :
lastGlyph = len(gidList)
for glyphnum in xrange(firstGlyph, lastGlyph):
glyphList.append(glyphnum)
# include any extratokens if they exist
(pos, sfg) = self.findinDoc('extratokens.firstGlyph',start,end)
(pos, slg) = self.findinDoc('extratokens.lastGlyph',start,end)
if (sfg != None) and (slg != None):
for glyphnum in xrange(int(sfg), int(slg)):
glyphList.append(glyphnum)
num = self.svgcount
self.glyphs_to_image(glyphList)
self.svgcount += 1
result.append(('svg', num))
return pclass, result
# this type of paragrph may be made up of multiple spans, inline
# word monograms (images), and words with semantic meaning,
# plus glyphs used to form starting letter of first word # plus glyphs used to form starting letter of first word
# need to parse this type line by line # need to parse this type line by line
@ -252,6 +351,7 @@ class DocParser(object):
(name, argres) = self.lineinDoc(line) (name, argres) = self.lineinDoc(line)
# handle both span and _span
if name.endswith('span.firstWord') : if name.endswith('span.firstWord') :
first = int(argres) first = int(argres)
(name, argres) = self.lineinDoc(line+1) (name, argres) = self.lineinDoc(line+1)
@ -422,43 +522,56 @@ class DocParser(object):
else: else:
self.link_title.append('') self.link_title.append('')
# get a descriptions of the starting points of the regions
# get page type # and groups on the page
(pos, pagetype) = self.findinDoc('page.type',0,-1) (pagetype, pageDesc) = self.PageDescription()
regcnt = len(pageDesc) - 1
# generate a list of each region starting point
# each region has one paragraph,, or one image, or one chapterheading
regionList= self.posinDoc('region')
regcnt = len(regionList)
regionList.append(-1)
anchorSet = False anchorSet = False
breakSet = False breakSet = False
inGroup = False
# process each region tag and convert what you can to html # process each region on the page and convert what you can to html
for j in xrange(regcnt): for j in xrange(regcnt):
start = regionList[j] (etype, start) = pageDesc[j]
end = regionList[j+1] (ntype, end) = pageDesc[j+1]
(pos, regtype) = self.findinDoc('region.type',start,end)
# set anchor for link target on this page # set anchor for link target on this page
if not anchorSet and not first_para_continued: if not anchorSet and not first_para_continued:
htmlpage += '<div style="visibility: hidden; height: 0; width: 0;" id="' + self.id + '" title="pagetype_' + pagetype + '"></div>\n' htmlpage += '<div style="visibility: hidden; height: 0; width: 0;" id="'
htmlpage += self.id + '" title="pagetype_' + pagetype + '"></div>\n'
anchorSet = True anchorSet = True
# handle groups of graphics with text captions
if (etype == 'grpbeg'):
(pos, grptype) = self.findinDoc('group.type', start, end)
if grptype != None:
if grptype == 'graphic':
gcstr = ' class="' + grptype + '"'
htmlpage += '<div' + gcstr + '>'
inGroup = True
elif (etype == 'grpend'):
if inGroup:
htmlpage += '</div>\n'
inGroup = False
else:
(pos, regtype) = self.findinDoc('region.type',start,end)
if regtype == 'graphic' : if regtype == 'graphic' :
(pos, simgsrc) = self.findinDoc('img.src',start,end) (pos, simgsrc) = self.findinDoc('img.src',start,end)
if simgsrc: if simgsrc:
if inGroup:
htmlpage += '<img src="img/img%04d.jpg" alt="" />' % int(simgsrc)
else:
htmlpage += '<div class="graphic"><img src="img/img%04d.jpg" alt="" /></div>' % int(simgsrc) htmlpage += '<div class="graphic"><img src="img/img%04d.jpg" alt="" /></div>' % int(simgsrc)
elif regtype == 'chapterheading' : elif regtype == 'chapterheading' :
(pclass, pdesc) = self.getParaDescription(start,end) (pclass, pdesc) = self.getParaDescription(start,end, regtype)
if not breakSet: if not breakSet:
htmlpage += '<div style="page-break-after: always;">&nbsp;</div>\n' htmlpage += '<div style="page-break-after: always;">&nbsp;</div>\n'
breakSet = True breakSet = True
@ -473,14 +586,13 @@ class DocParser(object):
htmlpage += self.buildParagraph(pclass, pdesc, 'middle', regtype) htmlpage += self.buildParagraph(pclass, pdesc, 'middle', regtype)
htmlpage += '</' + tag + '>' htmlpage += '</' + tag + '>'
elif (regtype == 'text') or (regtype == 'fixed') or (regtype == 'insert') or (regtype == 'listitem'): elif (regtype == 'text') or (regtype == 'fixed') or (regtype == 'insert') or (regtype == 'listitem'):
ptype = 'full' ptype = 'full'
# check to see if this is a continution from the previous page # check to see if this is a continution from the previous page
if first_para_continued : if first_para_continued :
ptype = 'end' ptype = 'end'
first_para_continued = False first_para_continued = False
(pclass, pdesc) = self.getParaDescription(start,end) (pclass, pdesc) = self.getParaDescription(start,end, regtype)
if pclass and (len(pclass) >= 6) and (ptype == 'full'): if pclass and (len(pclass) >= 6) and (ptype == 'full'):
tag = 'p' tag = 'p'
if pclass[3:6] == 'h1-' : tag = 'h4' if pclass[3:6] == 'h1-' : tag = 'h4'
@ -492,60 +604,26 @@ class DocParser(object):
else : else :
htmlpage += self.buildParagraph(pclass, pdesc, ptype, regtype) htmlpage += self.buildParagraph(pclass, pdesc, ptype, regtype)
elif (regtype == 'tocentry') : elif (regtype == 'tocentry') :
ptype = 'full' ptype = 'full'
if first_para_continued : if first_para_continued :
ptype = 'end' ptype = 'end'
first_para_continued = False first_para_continued = False
(pclass, pdesc) = self.getParaDescription(start,end) (pclass, pdesc) = self.getParaDescription(start,end, regtype)
htmlpage += self.buildParagraph(pclass, pdesc, ptype, regtype) htmlpage += self.buildParagraph(pclass, pdesc, ptype, regtype)
elif (regtype == 'vertical') : elif (regtype == 'vertical') or (regtype == 'table') :
ptype = 'full' ptype = 'full'
if inGroup:
ptype = 'middle'
if first_para_continued : if first_para_continued :
ptype = 'end' ptype = 'end'
first_para_continued = False first_para_continued = False
(pclass, pdesc) = self.getParaDescription(start,end) (pclass, pdesc) = self.getParaDescription(start, end, regtype)
htmlpage += self.buildParagraph(pclass, pdesc, ptype, regtype) htmlpage += self.buildParagraph(pclass, pdesc, ptype, regtype)
elif (regtype == 'table') :
# translate first and last word into first and last glyphs
# and generate table as an image and include a link to it
glyphList = []
(pos, sfirst) = self.findinDoc('paragraph.firstWord',start,end)
(pos, slast) = self.findinDoc('paragraph.lastWord',start,end)
firstglyphList = self.getData('word.firstGlyph',0,-1)
gidList = self.getData('info.glyph.glyphID',0,-1)
if (sfirst != None) and (slast != None) :
first = int(sfirst)
last = int(slast)
firstGlyph = firstglyphList[first]
if last < len(firstglyphList):
lastGlyph = firstglyphList[last]
else :
lastGlyph = len(gidList)
for glyphnum in xrange(firstGlyph, lastGlyph):
glyphList.append(glyphnum)
num = self.svgcount
self.glyphs_to_image(glyphList)
self.svgcount += 1
htmlpage += '<div class="graphic"><img src="img/' + self.id + '_%04d.svg" alt="" /></div>' % num
else :
ptype = 'full'
if first_para_continued :
ptype = 'end'
first_para_continued = False
(pclass, pdesc) = self.getParaDescription(start,end)
htmlpage += self.buildParagraph(pclass, pdesc, ptype, regtype)
print " "
print "Warning: - Table Conversions are notoriously poor"
print " Strongly recommend taking a screen capture image of the "
print " table in %s.svg and using it to replace this attempt at a table" % self.id
print " "
elif (regtype == 'synth_fcvr.center') or (regtype == 'synth_text.center'): elif (regtype == 'synth_fcvr.center') or (regtype == 'synth_text.center'):
(pos, simgsrc) = self.findinDoc('img.src',start,end) (pos, simgsrc) = self.findinDoc('img.src',start,end)
if simgsrc: if simgsrc:
@ -562,7 +640,7 @@ class DocParser(object):
if first_para_continued : if first_para_continued :
ptype = 'end' ptype = 'end'
first_para_continued = False first_para_continued = False
(pclass, pdesc) = self.getParaDescription(start,end) (pclass, pdesc) = self.getParaDescription(start,end, regtype)
if pclass and (ptype == 'full') and (len(pclass) >= 6): if pclass and (ptype == 'full') and (len(pclass) >= 6):
tag = 'p' tag = 'p'
if pclass[3:6] == 'h1-' : tag = 'h4' if pclass[3:6] == 'h1-' : tag = 'h4'
@ -589,10 +667,10 @@ class DocParser(object):
def convert2HTML(flatxml, classlst, fileid, bookDir): def convert2HTML(flatxml, classlst, fileid, bookDir, fixedimage):
# create a document parser # create a document parser
dp = DocParser(flatxml, classlst, fileid, bookDir) dp = DocParser(flatxml, classlst, fileid, bookDir, fixedimage)
htmlpage = dp.process() htmlpage = dp.process()

View File

@ -1,6 +1,6 @@
#! /usr/bin/python #! /usr/bin/python
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
# For use with Topaz Scripts Version 1.8 # For use with Topaz Scripts Version 2.0
import os, sys, getopt import os, sys, getopt
@ -14,13 +14,16 @@ import getpagedim
def usage(): def usage():
print 'Usage: ' print 'Usage: '
print ' ' print ' '
print ' genhtml.py unencryptedBookDir' print ' genhtml.py [--fixed-image] unencryptedBookDir'
print ' '
print ' Options: '
print ' --fixed-image : force translation of fixed regions into svg images '
print ' ' print ' '
def main(argv): def main(argv):
bookDir = '' bookDir = ''
fixedimage = False
if len(argv) == 0: if len(argv) == 0:
argv = sys.argv argv = sys.argv
@ -28,7 +31,7 @@ def main(argv):
argv = argv.split() argv = argv.split()
try: try:
opts, args = getopt.getopt(argv[1:], "h:") opts, args = getopt.getopt(argv[1:], "h:",["fixed-image"])
except getopt.GetoptError, err: except getopt.GetoptError, err:
print str(err) print str(err)
@ -43,6 +46,8 @@ def main(argv):
if o =="-h": if o =="-h":
usage() usage()
sys.exit(0) sys.exit(0)
if o =="--fixed-image":
fixedimage = True
bookDir = args[0] bookDir = args[0]
@ -139,7 +144,7 @@ def main(argv):
print ' ', filename print ' ', filename
fname = os.path.join(pageDir,filename) fname = os.path.join(pageDir,filename)
flat_xml = convert2xml.main('convert2xml.py --flat-xml ' + dictFile + ' ' + fname) flat_xml = convert2xml.main('convert2xml.py --flat-xml ' + dictFile + ' ' + fname)
htmlstr += flatxml2html.convert2HTML(flat_xml, classlst, fname, bookDir) htmlstr += flatxml2html.convert2HTML(flat_xml, classlst, fname, bookDir, fixedimage)
htmlstr += '</body>\n</html>\n' htmlstr += '</body>\n</html>\n'

View File

@ -1,6 +1,6 @@
#! /usr/bin/python #! /usr/bin/python
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
# For use with Topaz Scripts Version 1.8 # For use with Topaz Scripts Version 2.0
import os, sys, getopt import os, sys, getopt

View File

@ -1,6 +1,6 @@
#! /usr/bin/python #! /usr/bin/python
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
# For use with Topaz Scripts Version 1.8 # For use with Topaz Scripts Version 2.0
import os, sys, getopt import os, sys, getopt

View File

@ -1,6 +1,6 @@
#! /usr/bin/python #! /usr/bin/python
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
# For use with Topaz Scripts Version 1.8 # For use with Topaz Scripts Version 2.0
from __future__ import with_statement from __future__ import with_statement
import csv import csv

View File

@ -31,9 +31,8 @@ genhtml.py - main program to generate "book.html"
gensvg.py - (author: clarknova) main program to create an xhtml page with embedded svg graphics gensvg.py - (author: clarknova) main program to create an xhtml page with embedded svg graphics
Please note, gensvg.py, genhtml.py, and genxml.py import and use Please note, these scripts all import code from each other so please
decode_meta.py, convert2xml.py, flatxml2html.py, getpagedim.py and stylexml2css.py keep all of these python scripts together in the same place.
so please keep all of these python scripts together in the same place.
@ -78,6 +77,12 @@ The step must NOW be done BEFORE attempting conversion to html
When complete, use a web-browser to open the page*.xhtml files When complete, use a web-browser to open the page*.xhtml files
in TARGETDIR/svg/ to see what the book really looks like. in TARGETDIR/svg/ to see what the book really looks like.
If you would prefer pure svg pages, then use the -r option
as follows:
gensvg.py -r TARGETDIR
All thanks go to CLARKNOVA for this program. This program is All thanks go to CLARKNOVA for this program. This program is
needed to actually see the true image of each page and so that needed to actually see the true image of each page and so that
the next step can properly create images from glyphs for the next step can properly create images from glyphs for
@ -97,6 +102,16 @@ properly set bold and/or italics, handle font size changes,
and to fix the sometimes horrific mistakes in the ocrText and to fix the sometimes horrific mistakes in the ocrText
used to create the html. used to create the html.
If there are critical pages that need a fixed layout in your book,
you might want to consider forcing these fixed regions to
become svg images by using this command instead:
genhtml.py --fixed-image TARGETDIR
This will convert all fixed regions into svg images at the
expense of increased book size, slower loading speed, and
a loss of the ability to search for words in those regions
FYI: Sigil is a wonderful, free cross- FYI: Sigil is a wonderful, free cross-
platform program that can be used to edit the html and platform program that can be used to edit the html and
create an epub if you so desire. create an epub if you so desire.

View File

@ -1,6 +1,6 @@
#! /usr/bin/python #! /usr/bin/python
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
# For use with Topaz Scripts Version 1.8 # For use with Topaz Scripts Version 2.0
from __future__ import with_statement from __future__ import with_statement
import csv import csv