topazscripts 1.8

This commit is contained in:
some_updates 2010-01-24 12:19:20 +00:00 committed by Apprentice Alf
parent c93f8e1edd
commit 24f001c61e
12 changed files with 332 additions and 72 deletions

View File

@ -1,3 +1,22 @@
Changes in version 1.8
- gensvg.py now builds wonderful xhtml pages with embedded svg
that can be easily paged through as if reading a book!
(tested in Safari for Mac and Win and Firefox)
(requires javascript to be enabled)
- genhtml.py now REQUIRES that gensvg.py be run FIRST
this allows creation of images on the fly from glyphs
- genhtml.py now automatically makes tables of words into svg
based images and will handle glyph based ornate first
letters of words
- cmbtc_dump_mac_linux.py has been renamed to be
cmbtc_dump_nonK4PC.py to make it clearer
when it needs to be used
Changes in version 1.7
- gensvg.py has been improved so that the glyphs render exactly (ClarkNova)
- gensvg.py has fixed a render order "bug" that allowed some images to cover or hide text. (ClarkNova)
@ -5,7 +24,6 @@ Changes in version 1.7
- add missing <title> tag
- make xhtml compliant doctype and minor changes to write correct xhtml
- make divs that act as anchors be hidden visually and to take up 0 height and 0 width to prevent any impact on layout
- added support for new version of the <_span> tag called <span>
Changes in version 1.6
- support for books whose paragraphs have no styles

View File

@ -1,4 +1,5 @@
#! /usr/bin/python
# For use in Topaz Scripts version 1.8
"""

View File

@ -1,4 +1,5 @@
#! /usr/bin/python
# For use with Topaz Scripts Version 1.8
from __future__ import with_statement

View File

@ -1,5 +1,6 @@
#! /usr/bin/python
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
# For use with Topaz Scripts Version 1.8
from __future__ import with_statement
import csv

View File

@ -1,5 +1,6 @@
#! /usr/bin/python
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
# For use with Topaz Scripts Version 1.8
from __future__ import with_statement
import csv

View File

@ -1,21 +1,27 @@
#! /usr/bin/python
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
# For use with Topaz Scripts Version 1.8
from __future__ import with_statement
import csv
import sys
import os
import math
import getopt
from struct import pack
from struct import unpack
class DocParser(object):
def __init__(self, flatxml, classlst, fileid):
def __init__(self, flatxml, classlst, fileid, bookDir):
self.id = os.path.basename(fileid).replace('.dat','')
self.svgcount = 0
self.docList = flatxml.split('\n')
self.docSize = len(self.docList)
self.classList = {}
self.bookDir = bookDir
self.glyphPaths = { }
self.numPaths = 0
tmpList = classlst.split('\n')
for pclass in tmpList:
if pclass != '':
@ -30,6 +36,107 @@ class DocParser(object):
self.paracont_stemid = []
self.parastems_stemid = []
def getGlyph(self, gid):
result = ''
id='gl%d' % gid
return self.glyphPaths[id]
def glyphs_to_image(self, glyphList):
def extract(path, key):
b = path.find(key) + len(key)
e = path.find(' ',b)
return int(path[b:e])
def extractID(path, key):
b = path.find(key) + len(key)
e = path.find('"',b)
return path[b:e]
svgDir = os.path.join(self.bookDir,'svg')
glyfile = os.path.join(svgDir,'glyphs.svg')
imgDir = os.path.join(self.bookDir,'img')
imgname = self.id + '_%04d.svg' % self.svgcount
imgfile = os.path.join(imgDir,imgname)
# build hash table of glyph paths keyed by glyph id
if self.numPaths == 0:
gfile = open(glyfile, 'r')
while True:
path = gfile.readline()
if (path == ''): break
glyphid = extractID(path,'id="')
self.glyphPaths[glyphid] = path
self.numPaths += 1
gfile.close()
# get glyph information
gxList = self.getData('info.glyph.x',0,-1)
gyList = self.getData('info.glyph.y',0,-1)
gidList = self.getData('info.glyph.glyphID',0,-1)
gids = []
maxws = []
maxhs = []
xs = []
ys = []
gdefs = []
# get path defintions, positions, dimensions for ecah glyph
# that makes up the image, and find min x and min y to reposition origin
minx = -1
miny = -1
for j in glyphList:
gid = gidList[j]
gids.append(gid)
xs.append(gxList[j])
if minx == -1: minx = gxList[j]
else : minx = min(minx, gxList[j])
ys.append(gyList[j])
if miny == -1: miny = gyList[j]
else : miny = min(miny, gyList[j])
path = self.getGlyph(gid)
gdefs.append(path)
maxws.append(extract(path,'width='))
maxhs.append(extract(path,'height='))
# change the origin to minx, miny and calc max height and width
maxw = maxws[0] + xs[0] - minx
maxh = maxhs[0] + ys[0] - miny
for j in xrange(0, len(xs)):
xs[j] = xs[j] - minx
ys[j] = ys[j] - miny
maxw = max( maxw, (maxws[j] + xs[j]) )
maxh = max( maxh, (maxhs[j] + ys[j]) )
# open the image file for output
ifile = open(imgfile,'w')
ifile.write('<?xml version="1.0" standalone="no"?>\n')
ifile.write('<!DOCTYPE svg PUBLIC "-//W3C/DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">\n')
ifile.write('<svg width="%dpx" height="%dpx" viewBox="0 0 %d %d" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" version="1.1">\n' % (math.floor(maxw/10), math.floor(maxh/10), maxw, maxh))
ifile.write('<defs>\n')
for j in xrange(0,len(gdefs)):
ifile.write(gdefs[j])
ifile.write('</defs>\n')
for j in xrange(0,len(gids)):
ifile.write('<use xlink:href="#gl%d" x="%d" y="%d" />\n' % (gids[j], xs[j], ys[j]))
ifile.write('</svg>')
ifile.close()
return 0
# return tag at line pos in document
def lineinDoc(self, pos) :
if (pos >= 0) and (pos < self.docSize) :
@ -77,6 +184,17 @@ class DocParser(object):
return startpos
# returns a vector of integers for the tagpath
def getData(self, tagpath, pos, end):
argres=[]
(foundat, argt) = self.findinDoc(tagpath, pos, end)
if (argt != None) and (len(argt) > 0) :
argList = argt.split('|')
argres = [ int(strval) for strval in argList]
return argres
# build a description of the paragraph
def getParaDescription(self, start, end):
@ -120,6 +238,7 @@ class DocParser(object):
# this type of paragraph may be made up of multiple _spans, inline
# word monograms (images) and words with semantic meaning
# and now a new type "span" versus the old "_span"
# plus glyphs used to form starting letter of first word
# need to parse this type line by line
line = start + 1
@ -143,6 +262,21 @@ class DocParser(object):
result.append(('ocr', wordnum))
line += 1
elif name.endswith('word.firstGlyph') :
first = int(argres)
(name, argres) = self.lineinDoc(line+1)
if not name.endswith('word.lastGlyph'):
print 'Error: - incorrect glyph ordering inside word in paragraph'
last = int(argres)
glyphList = []
for glyphnum in xrange(first, last):
glyphList.append(glyphnum)
num = self.svgcount
self.glyphs_to_image(glyphList)
self.svgcount += 1
result.append(('svg', num))
line += 1
elif name.endswith('word.class'):
(cname, space) = argres.split('-',1)
if space == '' : space = '0'
@ -241,6 +375,11 @@ class DocParser(object):
parares += '<img src="img/img%04d.jpg" alt="" />' % num
parares += sep
elif wtype == 'svg' :
sep = ''
parares += '<img src="img/' + self.id + '_%04d.svg" alt="" />' % num
parares += sep
if len(sep) > 0 : parares = parares[0:-1]
if (type == 'full') or (type == 'end') :
parares += '</p>'
@ -260,10 +399,7 @@ class DocParser(object):
if argres : self.ocrtext = argres.split('|')
# get information to dehyphenate the text
(pos, argres) = self.findinDoc('info.dehyphen.rootID',0,-1)
if argres:
argList = argres.split('|')
self.dehyphen_rootid = [ int(strval) for strval in argList]
self.dehyphen_rootid = self.getData('info.dehyphen.rootID',0,-1)
# determine if first paragraph is continued from previous page
(pos, self.parastems_stemid) = self.findinDoc('info.paraStems.stemID',0,-1)
@ -274,16 +410,10 @@ class DocParser(object):
last_para_continued = (self.paracont_stemid != None)
# collect link ids
(pos, argres) = self.findinDoc('info.word.link_id',0,-1)
if argres:
argList = argres.split('|')
self.link_id = [ int(strval) for strval in argList]
self.link_id = self.getData('info.word.link_id',0,-1)
# collect link destination page numbers
(pos, argres) = self.findinDoc('info.links.page',0,-1)
if argres :
argList = argres.split('|')
self.link_page = [ int(strval) for strval in argList]
self.link_page = self.getData('info.links.page',0,-1)
# collect link titles
(pos, argres) = self.findinDoc('info.links.title',0,-1)
@ -382,23 +512,45 @@ class DocParser(object):
elif (regtype == 'table') :
# translate first and last word into first and last glyphs
# and generate table as an image and include a link to it
glyphList = []
(pos, sfirst) = self.findinDoc('paragraph.firstWord',start,end)
(pos, slast) = self.findinDoc('paragraph.lastWord',start,end)
firstglyphList = self.getData('word.firstGlyph',0,-1)
gidList = self.getData('info.glyph.glyphID',0,-1)
if (sfirst != None) and (slast != None) :
first = int(sfirst)
last = int(slast)
firstGlyph = firstglyphList[first]
if last < len(firstglyphList):
lastGlyph = firstglyphList[last]
else :
lastGlyph = len(gidList)
for glyphnum in xrange(firstGlyph, lastGlyph):
glyphList.append(glyphnum)
num = self.svgcount
self.glyphs_to_image(glyphList)
self.svgcount += 1
htmlpage += '<div class="graphic"><img src="img/' + self.id + '_%04d.svg" alt="" /></div>' % num
else :
ptype = 'full'
if first_para_continued :
ptype = 'end'
first_para_continued = False
(pclass, pdesc) = self.getParaDescription(start,end)
htmlpage += self.buildParagraph(pclass, pdesc, ptype, regtype)
print "Warnings - Table Conversions are notoriously poor"
print "Strongly recommend taking a screen capture image of the "
print "table in %s.svg and using it to replace this attempt at a table" % self.id
print " "
print "Warning: - Table Conversions are notoriously poor"
print " Strongly recommend taking a screen capture image of the "
print " table in %s.svg and using it to replace this attempt at a table" % self.id
print " "
elif (regtype == 'synth_fcvr.center') or (regtype == 'synth_text.center'):
(pos, simgsrc) = self.findinDoc('img.src',start,end)
if simgsrc:
htmlpage += '<div class="graphic"><img src="img/img%04d.jpg" alt="" /></div>' % int(simgsrc)
else :
print 'Warning: region type', regtype
(pos, temp) = self.findinDoc('paragraph',start,end)
@ -437,10 +589,10 @@ class DocParser(object):
def convert2HTML(flatxml, classlst, fileid):
def convert2HTML(flatxml, classlst, fileid, bookDir):
# create a document parser
dp = DocParser(flatxml, classlst, fileid)
dp = DocParser(flatxml, classlst, fileid, bookDir)
htmlpage = dp.process()

View File

@ -1,5 +1,6 @@
#! /usr/bin/python
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
# For use with Topaz Scripts Version 1.8
import os, sys, getopt
@ -65,6 +66,12 @@ def main(argv):
print "Can not find image directory in unencrypted book"
sys.exit(-1)
svgDir = os.path.join(bookDir,'svg')
if not os.path.exists(svgDir) :
print "Can not find svg directory in unencrypted book"
print "please run gensvg.py before running genhtml.py"
sys.exit(-1)
otherFile = os.path.join(bookDir,'other0000.dat')
if not os.path.exists(otherFile) :
print "Can not find other0000.dat in unencrypted book"
@ -75,7 +82,6 @@ def main(argv):
print "Can not find metadata0000.dat in unencrypted book"
sys.exit(-1)
htmlFileName = "book.html"
htmlstr = '<!DOCTYPE HTML PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">\n'
htmlstr += '<html>\n'
@ -133,7 +139,7 @@ def main(argv):
print ' ', filename
fname = os.path.join(pageDir,filename)
flat_xml = convert2xml.main('convert2xml.py --flat-xml ' + dictFile + ' ' + fname)
htmlstr += flatxml2html.convert2HTML(flat_xml, classlst, fname)
htmlstr += flatxml2html.convert2HTML(flat_xml, classlst, fname, bookDir)
htmlstr += '</body>\n</html>\n'

View File

@ -1,11 +1,11 @@
#! /usr/bin/python
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
# For use with Topaz Scripts Version 1.8
import os, sys, getopt
# local routines
import convert2xml
import flatxml2html
import decode_meta
@ -45,6 +45,13 @@ class GParser(object):
argres[j] = int(argres[j])
return result
def getGlyphDim(self, gly):
maxh = (self.gh[gly] * self.dpi) / self.gdpi[gly]
maxw = (self.gw[gly] * self.dpi) / self.gdpi[gly]
return maxh, maxw
def getPath(self, gly):
path = ''
if (gly < 0) or (gly >= self.count):
@ -172,8 +179,10 @@ class PParser(object):
def usage():
print 'Usage: '
print ' '
print ' gensvg.py unencryptedBookDir'
print ' gensvg.py [options] unencryptedBookDir'
print ' '
print ' -x : output browseable XHTML+SVG pages (default)'
print ' -r : output raw SVG images'
def main(argv):
@ -185,7 +194,7 @@ def main(argv):
argv = argv.split()
try:
opts, args = getopt.getopt(argv[1:], "h:")
opts, args = getopt.getopt(argv[1:], "xrh")
except getopt.GetoptError, err:
print str(err)
@ -196,10 +205,15 @@ def main(argv):
usage()
sys.exit(2)
raw = 0
for o, a in opts:
if o =="-h":
usage()
sys.exit(0)
if o =="-x":
raw = 0
if o =="-r":
raw = 1
bookDir = args[0]
@ -264,7 +278,9 @@ def main(argv):
gp = GParser(flat_xml)
for i in xrange(0, gp.count):
path = gp.getPath(i)
glyfile.write('<path id="gl%d" d="%s" fill="black" />\n' % (counter * 256 + i, path))
maxh, maxw = gp.getGlyphDim(i)
# glyfile.write('<path id="gl%d" d="%s" fill="black" />\n' % (counter * 256 + i, path))
glyfile.write('<path id="gl%d" d="%s" fill="black" /><!-- width=%d height=%d -->\n' % (counter * 256 + i, path, maxw, maxh ))
counter += 1
glyfile.write('</defs>\n')
glyfile.write('</svg>\n')
@ -274,7 +290,7 @@ def main(argv):
# Books are at 1440 DPI. This is rendering at twice that size for
# readability when rendering to the screen.
scaledpi = 720
scaledpi = 1440
filenames = os.listdir(pageDir)
filenames = sorted(filenames)
counter = 0
@ -283,11 +299,45 @@ def main(argv):
fname = os.path.join(pageDir,filename)
flat_xml = convert2xml.main('convert2xml.py --flat-xml ' + dictFile + ' ' + fname)
pp = PParser(flat_xml)
if (raw) :
pfile = open(os.path.join(svgDir,filename.replace('.dat','.svg')), 'w')
else :
pfile = open(os.path.join(svgDir,'page%04d.xhtml' % counter), 'w')
pfile.write('<?xml version="1.0" standalone="no"?>\n')
if (raw):
pfile.write('<!DOCTYPE svg PUBLIC "-//W3C/DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">\n')
pfile.write('<svg width="%fin" height="%fin" viewBox="0 0 %d %d" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" version="1.1">\n' % (pp.pw / scaledpi, pp.ph / scaledpi, pp.pw -1, pp.ph -1))
pfile.write('<title>Page %d - %s by %s</title>\n' % (counter, metadata['Title'],metadata['Authors']))
else:
pfile.write('<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">\n');
pfile.write('<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" ><head>\n');
pfile.write('<title>Page %d - %s by %s</title>\n' % (counter, metadata['Title'],metadata['Authors']))
pfile.write('<script><![CDATA[\n');
pfile.write('function gd(){var p=window.location.href.replace(/^.*\?dpi=(\d+).*$/i,"$1");return p;}\n');
pfile.write('var dpi=%d;\n' % scaledpi);
if (counter) :
pfile.write('var prevpage="page%04d.xhtml";\n' % (counter - 1))
if (counter < len(filenames)-1) :
pfile.write('var nextpage="page%04d.xhtml";\n' % (counter + 1))
pfile.write('var pw=%d;var ph=%d;' % (pp.pw, pp.ph))
pfile.write('function zoomin(){dpi=dpi*(2/3);setsize();}\n')
pfile.write('function zoomout(){dpi=dpi*1.5;setsize();}\n')
pfile.write('function setsize(){var svg=document.getElementById("svgimg");var prev=document.getElementById("prevsvg");var next=document.getElementById("nextsvg");var width=(pw/dpi)+"in";var height=(ph/dpi)+"in";svg.setAttribute("width",width);svg.setAttribute("height",height);prev.setAttribute("height",height);prev.setAttribute("width","50px");next.setAttribute("height",height);next.setAttribute("width","50px");}\n')
pfile.write('function ppage(){window.location.href=prevpage+"?dpi="+Math.round(dpi);}\n')
pfile.write('function npage(){window.location.href=nextpage+"?dpi="+Math.round(dpi);}\n')
pfile.write('var gt=gd();if(gt>0){dpi=gt;}\n')
pfile.write('window.onload=setsize;\n')
pfile.write(']]></script>\n')
pfile.write('</head>\n')
pfile.write('<body onLoad="setsize();" style="background-color:#777;text-align:center;">\n')
pfile.write('<div style="white-space:nowrap;">\n')
if (counter == 0) :
pfile.write('<a href="javascript:ppage();"><svg id="prevsvg" viewBox="0 0 100 300" xmlns="http://www.w3.org/2000/svg" version="1.1" style="background-color:#777"></svg></a>\n')
else:
pfile.write('<a href="javascript:ppage();"><svg id="prevsvg" viewBox="0 0 100 300" xmlns="http://www.w3.org/2000/svg" version="1.1" style="background-color:#777"><polygon points="5,150,95,5,95,295" fill="#AAAAAA" /></svg></a>\n')
pfile.write('<a href="javascript:npage();"><svg id="svgimg" viewBox="0 0 %d %d" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" version="1.1" style="background-color:#FFF;border:1px solid black;">' % (pp.pw, pp.ph))
if (pp.gid != None):
pfile.write('<defs>\n')
gdefs = pp.getGlyphs(glyfname)
@ -303,7 +353,18 @@ def main(argv):
pfile.write('<use xlink:href="#gl%d" x="%d" y="%d" />\n' % (pp.gid[j], pp.gx[j], pp.gy[j]))
if (img == None or len(img) == 0) and (pp.gid == None or len(pp.gid) == 0):
pfile.write('<text x="10" y="10" font-family="Helvetica" font-size="100" stroke="black">This page intentionally left blank.</text>\n<text x="10" y="110" font-family="Helvetica" font-size="50" stroke="black">Until this notice unintentionally gave it content. (gensvg.py)</text>\n');
if (raw) :
pfile.write('</svg>')
else :
pfile.write('</svg></a>\n')
if (counter == len(filenames) - 1) :
pfile.write('<a href="javascript:npage();"><svg id="nextsvg" viewBox="0 0 100 300" xmlns="http://www.w3.org/2000/svg" version="1.1" style="background-color:#777"></svg></a>\n')
else :
pfile.write('<a href="javascript:npage();"><svg id="nextsvg" viewBox="0 0 100 300" xmlns="http://www.w3.org/2000/svg" version="1.1" style="background-color:#777"><polygon points="5,5,5,295,95,150" fill="#AAAAAA" /></svg></a>\n')
pfile.write('</div>\n')
pfile.write('<div><a href="javascript:zoomin();">zoom in</a> - <a href="javascript:zoomout();">zoom out</a></div>\n')
pfile.write('</body>\n')
pfile.write('</html>\n')
pfile.close()
counter += 1

View File

@ -1,5 +1,6 @@
#! /usr/bin/python
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
# For use with Topaz Scripts Version 1.8
import os, sys, getopt

View File

@ -1,5 +1,6 @@
#! /usr/bin/python
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
# For use with Topaz Scripts Version 1.8
from __future__ import with_statement
import csv

View File

@ -19,25 +19,16 @@ Here are the steps:
1. Unzip the topazscripts.zip file to get the full set of python scripts.
The files you should have after unzipping are:
cmbtc_dump.py - (author: cmbtc) unencrypts and dumps sections into separate files
decode_meta.py - converts metadata0000.dat to human readable text (for the most part)
cmbtc_dump.py - (author: cmbtc) unencrypts and dumps sections into separate files for Kindle for PC
cmbtc_dump_nonK4PC.py - (author - DiapDealer) for use with standalone Kindle and ipod/iphone topaz books
decode_meta.py - converts metadata0000.dat to make it available
convert2xml.py - converts page*.dat, other*.dat, and glyphs*.dat files to pseudo xml descriptions
flatxml2html.py - converts a "flattened" xml description to html using the ocrtext
stylexml2css.py - converts stylesheet "flattened" xml into css (as best it can)
getpagedim.py - reads page0000.dat to get the book height and width parameters
genxml.py - main program to convert everything to xml
genhtml.py - main program to generate "book.html"
gensvg.py - (author: clarknova) main program to create an svg graphic of each page
In addition there is now a new file:
cmbtc_dump_mac_linux.py
If you know the pid of your ipod and/or your standalone Kindle and your book
was meant for that device, you can use this program to dump the proper sections
on Mac OSX and Linux (and even Windows if you do not have Kindle4PC installed).
Thank DiapDealer for creating it!
gensvg.py - (author: clarknova) main program to create an xhtml page with embedded svg graphics
Please note, gensvg.py, genhtml.py, and genxml.py import and use
@ -52,8 +43,20 @@ of its contents as files
All Thanks go to CMBTC who broke the DRM for Topaz - without it nothing else
would be possible
If you purchased the book for Kindle For PC, you must do the following:
cmbtc_dump.py -d -o TARGETDIR [-p pid] YOURTOPAZBOOKNAMEHERE
However, if you purchased the book for a standalone Kindle or ipod/iphone
and you know your pid (at least the first 8 characters) then you should
instead do the following
cmbtc_dump_nonK4PC.py -d -o TARGETDIR -p 12345678 YOURTOPAZBOOKNAMEHERE
where 12345678 should be replaced by the first 8 characters of your PID
This should create a directory called "TARGETDIR" in your current directory.
It should have the following files in it:
@ -64,35 +67,48 @@ page - directory filled with page*.dat files
glyphs - directory filled with glyphs*.dat files
3. REQUIRED: Create xhtml page descriptions with embedded svg
that show the exact representation of each page as an image
with proper glyphs and positioning.
3. Convert the files in "TARGETDIR" to their xml descriptions
which can be found in TARGETDIR/xml/ upon completion.
This step must NOW be done BEFORE attempting conversion to html
genxml.py TARGETDIR
gensvg.py TARGETDIR
When complete, use a web-browser to open the page*.xhtml files
in TARGETDIR/svg/ to see what the book really looks like.
All thanks go to CLARKNOVA for this program. This program is
needed to actually see the true image of each page and so that
the next step can properly create images from glyphs for
monograms, dropcaps and tables.
4. Create book.html which can be found in "TARGETDIR" after
completion. This html conversion can not fully capture
all of the layouts actually used in the book and needs to
be edited to include special font handling such as bold
or italics that can not be determined from the ocrText
information or the style information. If you want to
see things exactly as they were, see step 5 below.
4. Create "book.html" which can be found in "TARGETDIR" after
completion.
genhtml.py TARGETDIR
***IMPORTANT NOTE*** This html conversion can not fully capture
all of the layouts and styles actually used in the book
and the resulting html will need to be edited by hand to
properly set bold and/or italics, handle font size changes,
and to fix the sometimes horrific mistakes in the ocrText
used to create the html.
5. Create an svg description of each page which can
be found in TARGETDIR/svg/ upon completion.
FYI: Sigil is a wonderful, free cross-
platform program that can be used to edit the html and
create an epub if you so desire.
All thanks go to CLARKNOVA for this program. This program is
needed to actually see the true image of each page so that hand
editing of the html created by step 4 can be done.
Or use the resulting svg files to read each page of the book
exactly as it has been laid out originally.
5. Optional Step: Convert the files in "TARGETDIR" to their
xml descriptions which can be found in TARGETDIR/xml/
upon completion.
gensvg.py TARGETDIR
genxml.py TARGETDIR
These conversions are important for allowing future (and better)
conversions to come later.

View File

@ -1,5 +1,6 @@
#! /usr/bin/python
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
# For use with Topaz Scripts Version 1.8
from __future__ import with_statement
import csv