From a1fec0b54d5d5db2e2681d2dc333eb2d34919caf Mon Sep 17 00:00:00 2001 From: some_updates Date: Wed, 20 Jan 2010 12:13:31 +0000 Subject: [PATCH] topazscripts 1.5 --- Topaz_Tools/lib/changes.txt | 20 +++++++ Topaz_Tools/lib/convert2xml.py | 5 +- Topaz_Tools/lib/flatxml2html.py | 96 +++++++++++++++++++-------------- Topaz_Tools/lib/genhtml.py | 20 ++++++- Topaz_Tools/lib/getpagedim.py | 53 ++++++++++++++++++ Topaz_Tools/lib/readme.txt | 6 ++- Topaz_Tools/lib/stylexml2css.py | 46 ++++++++++------ 7 files changed, 186 insertions(+), 60 deletions(-) create mode 100644 Topaz_Tools/lib/changes.txt create mode 100644 Topaz_Tools/lib/getpagedim.py diff --git a/Topaz_Tools/lib/changes.txt b/Topaz_Tools/lib/changes.txt new file mode 100644 index 0000000..cc2f00a --- /dev/null +++ b/Topaz_Tools/lib/changes.txt @@ -0,0 +1,20 @@ +Changes in version 1.5 + - completely reworked generation of styles to use actual page heights and widths + - added new script getpagedim.py to support the above + - style names with underscores in them are now properly paired with their base class + - fixed hanging indents that never set a left margin + - added support for a number of previously unknown region types + - added support for a previously unknown snippet - 'empty' + - corrected a bug that caused unknown regions to abort the program + - added code to make the handling of unknown regions better in general + - corrected a bug that caused the last link on a page to be missing (if it was the last thing on the page) + +Changes in version 1.3 + - font generation by gensvg.py is now greatly improved with support for contour points added + - support for more region types + - support for inline images in paragraphs or text fields (i.e. 
initial graphics for the first letter of a word) + - greatly improved dtd information used for the xml to prevent parsing mistakes + +Version 1.0 + - initial release + diff --git a/Topaz_Tools/lib/convert2xml.py b/Topaz_Tools/lib/convert2xml.py index 4bec36f..07741a7 100644 --- a/Topaz_Tools/lib/convert2xml.py +++ b/Topaz_Tools/lib/convert2xml.py @@ -93,7 +93,7 @@ def convert(i): for j in xrange(len(val)): c = ord(val[j:j+1]) result += '%02x' % c - return result + return result @@ -209,6 +209,8 @@ class PageParser(object): 'wordStems' : (0, 'number', 1, 1), 'wordStems.stemID' : (1, 'number', 0, 0), + 'empty' : (1, 'snippets', 1, 0), + 'page' : (1, 'snippets', 1, 0), 'page.pageid' : (1, 'scalar_text', 0, 0), 'page.pagelabel' : (1, 'scalar_text', 0, 0), @@ -750,6 +752,7 @@ def main(argv): # read in the string table dictionary dict = Dictionary(dictFile) + # dict.dumpDict() # create a page parser pp = PageParser(pageFile, dict, debug, flat_xml) diff --git a/Topaz_Tools/lib/flatxml2html.py b/Topaz_Tools/lib/flatxml2html.py index f93318f..f2dd244 100644 --- a/Topaz_Tools/lib/flatxml2html.py +++ b/Topaz_Tools/lib/flatxml2html.py @@ -90,20 +90,23 @@ class DocParser(object): # class names are an issue given topaz may start them with numerals (not allowed), # use a mix of cases (which cause some browsers problems), and actually - # attach numbers after "_reclustered*" to the end to deal with reflow issues - # but then not actually provide all of these _reclustereed classes in the stylesheet! + # attach numbers after "_reclustered*" to the end to deal with classes that inherit + # from a base class (but then not actually provide all of these _reclustered + # classes in the stylesheet!) 
- # so we clean this up by lowercasing, prepend 'cl_', and if not in the class - # list from the stylesheet, trying once more with "_reclustered*" removed - # if still not in stylesheet, let it pass as is + # so we clean this up by lowercasing, prepend 'cl_', and getting any baseclass + # that exists in the stylesheet first, and then adding this specific class + # after + classres = '' pclass = pclass.lower() - pclass = 'cl_' + pclass - if pclass not in self.classList: - p = pclass.find('_reclustered') - if p > 0 : - baseclass = pclass[0:p] - if baseclass in self.classList: - pclass = baseclass + pclass = 'cl-' + pclass + p = pclass.find('_') + if p > 0 : + baseclass = pclass[0:p] + if baseclass in self.classList: + classres += baseclass + ' ' + classres += pclass + pclass = classres # build up a description of the paragraph in result and return it # first check for the basic - all words paragraph @@ -123,6 +126,12 @@ class DocParser(object): line = start + 1 word_class = '' + # if end is -1 then we must search to end of document + if end == -1 : + docList = self.flatdoc + cnt = len(docList) + end = cnt + while (line < end) : (name, argres) = self.lineinDoc(line) @@ -139,7 +148,8 @@ class DocParser(object): elif name.endswith('word.class'): (cname, space) = argres.split('-',1) - if cname == 'spaceafter': + if space == '' : space = '0' + if (cname == 'spaceafter') and (int(space) > 0) : word_class = 'sa' elif name.endswith('word.img.src'): @@ -166,7 +176,7 @@ class DocParser(object): sep ='' br_lb = False - if (regtype == 'fixed') or (regtype == 'chapterheading') : + if (regtype == 'fixed') or (regtype == 'chapterheading'): br_lb = True handle_links = False @@ -193,7 +203,8 @@ class DocParser(object): link = self.link_id[num] if (link > 0): title = self.link_title[link-1] - if title == "": title='_link_' + if (title == "") or (parares.rfind(title) < 0): + title='_link_' ptarget = self.link_page[link-1] - 1 linkhtml = '' % ptarget linkhtml += title + '' @@ -326,7 
+337,7 @@ class DocParser(object): htmlpage += self.buildParagraph(pclass, pdesc, 'middle', regtype) htmlpage += '' - elif (regtype == 'text') or (regtype == 'fixed') or (regtype == 'insert') : + elif (regtype == 'text') or (regtype == 'fixed') or (regtype == 'insert') or (regtype == 'listitem') : ptype = 'full' # check to see if this is a continution from the previous page if (len(self.parastems_stemid) > 0): @@ -348,7 +359,6 @@ class DocParser(object): else : htmlpage += self.buildParagraph(pclass, pdesc, ptype, regtype) - elif (regtype == 'tocentry') : ptype = 'full' # check to see if this is a continution from the previous page @@ -363,7 +373,7 @@ class DocParser(object): (pclass, pdesc) = self.getParaDescription(start,end) htmlpage += self.buildParagraph(pclass, pdesc, ptype, regtype) - elif regtype == 'synth_fcvr.center' : + elif (regtype == 'synth_fcvr.center') or (regtype == 'synth_text.center'): if not anchorSet: htmlpage += '
 
\n' anchorSet = True @@ -373,30 +383,38 @@ class DocParser(object): else : print 'Warning: Unknown region type', regtype - print 'Treating this like a "fixed" region' - regtype = 'fixed' - ptype = 'full' - # check to see if this is a continution from the previous page - if (len(self.parastems_stemid) > 0): - ptype = 'end' - self.parastems_stemid=[] - else: + (pos, temp) = self.findinDoc('paragraph',start,end) + if temp: + print 'Treating this like a "text" region' + regtype = 'fixed' + ptype = 'full' + # check to see if this is a continution from the previous page + if (len(self.parastems_stemid) > 0): + ptype = 'end' + self.parastems_stemid=[] + else: + if not anchorSet: + htmlpage += '
 
\n' + anchorSet = True + (pclass, pdesc) = self.getParaDescription(start,end) + if ptype == 'full' : + tag = 'p' + if pclass[3:6] == 'h1-' : tag = 'h4' + if pclass[3:6] == 'h2-' : tag = 'h5' + if pclass[3:6] == 'h3-' : tag = 'h6' + htmlpage += '<' + tag + ' class="' + pclass + '">' + htmlpage += self.buildParagraph(pclass, pdesc, 'middle', regtype) + htmlpage += '' + else : + htmlpage += self.buildParagraph(pclass, pdesc, ptype, regtype) + else : + print 'Treating this like a "image" region' if not anchorSet: htmlpage += '
 
\n' anchorSet = True - (pclass, desc) = self.getParaDescription(start,end) - if ptype == 'full' : - tag = 'p' - if pclass[3:6] == 'h1-' : tag = 'h4' - if pclass[3:6] == 'h2-' : tag = 'h5' - if pclass[3:6] == 'h3-' : tag = 'h6' - htmlpage += '<' + tag + ' class="' + pclass + '">' - htmlpage += self.buildParagraph(pclass, pdesc, 'middle', regtype) - htmlpage += '' - else : - htmlpage += self.buildParagraph(pclass, pdesc, ptype, regtype) - - + (pos, simgsrc) = self.findinDoc('img.src',start,end) + if simgsrc: + htmlpage += '
' % int(simgsrc) if len(self.paracont_stemid) > 0 : if htmlpage[-4:] == '

': diff --git a/Topaz_Tools/lib/genhtml.py b/Topaz_Tools/lib/genhtml.py index 05261c9..df39539 100644 --- a/Topaz_Tools/lib/genhtml.py +++ b/Topaz_Tools/lib/genhtml.py @@ -8,7 +8,7 @@ import convert2xml import flatxml2html import decode_meta import stylexml2css - +import getpagedim def usage(): print 'Usage: ' @@ -86,6 +86,7 @@ def main(argv): htmlstr += '\n' + # process metadata and retrieve fontSize info print ' ', 'metadata0000.dat' fname = os.path.join(bookDir,'metadata0000.dat') xname = os.path.join(bookDir, 'metadata.txt') @@ -100,12 +101,27 @@ def main(argv): if 'fontSize' in meta_array: fontsize = meta_array['fontSize'] + # also get the size of a normal text page + spage = '1' + if 'firstTextPage' in meta_array: + spage = meta_array['firstTextPage'] + pnum = int(spage) + + # get page height and width from first text page for use in stylesheet scaling + pname = 'page%04d.dat' % pnum + fname = os.path.join(pageDir,pname) + flat_xml = convert2xml.main('convert2xml.py --flat-xml ' + dictFile + ' ' + fname) + (ph, pw) = getpagedim.getPageDim(flat_xml) + if (ph == '-1') : ph = 11000 + if (pw == '-1') : pw = 8500 + + # now build up the style sheet print ' ', 'other0000.dat' fname = os.path.join(bookDir,'other0000.dat') xname = os.path.join(bookDir, 'style.css') xmlstr = convert2xml.main('convert2xml.py --flat-xml ' + dictFile + ' ' + fname) htmlstr += '\n' diff --git a/Topaz_Tools/lib/getpagedim.py b/Topaz_Tools/lib/getpagedim.py new file mode 100644 index 0000000..dd1071c --- /dev/null +++ b/Topaz_Tools/lib/getpagedim.py @@ -0,0 +1,53 @@ +#! 
/usr/bin/python +# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab + +from __future__ import with_statement +import csv +import sys +import os +import getopt +from struct import pack +from struct import unpack + + +class DocParser(object): + def __init__(self, flatxml): + self.flatdoc = flatxml.split('\n') + + + # find tag if within pos to end inclusive + def findinDoc(self, tagpath, pos, end) : + result = None + docList = self.flatdoc + cnt = len(docList) + if end == -1 : + end = cnt + else: + end = min(cnt,end) + foundat = -1 + for j in xrange(pos, end): + item = docList[j] + if item.find('=') >= 0: + (name, argres) = item.split('=') + else : + name = item + argres = '' + if name.endswith(tagpath) : + result = argres + foundat = j + break + return foundat, result + + def process(self): + (pos, sph) = self.findinDoc('page.h',0,-1) + (pos, spw) = self.findinDoc('page.w',0,-1) + if (sph == None): sph = '-1' + if (spw == None): spw = '-1' + return sph, spw + + +def getPageDim(flatxml): + # create a document parser + dp = DocParser(flatxml) + (ph, pw) = dp.process() + return ph, pw diff --git a/Topaz_Tools/lib/readme.txt b/Topaz_Tools/lib/readme.txt index afe4a5a..c9fcb61 100644 --- a/Topaz_Tools/lib/readme.txt +++ b/Topaz_Tools/lib/readme.txt @@ -3,7 +3,8 @@ Contributors: clarknova - for all of the svg and glyph generation and many other bug fixes and improvements skindle - for figuing out the general case for the mode loops some updates - for conversion to xml, basic html - DiapDealer - for extensive testing and feeback + DiapDealer - for extensive testing and feedback + stewball - for extensive testing and feedback and others for posting, feedback and testing @@ -23,12 +24,13 @@ decode_meta.py - converts metadata0000.dat to human readable text (for the most convert2xml.py - converts page*.dat, other*.dat, and glyphs*.dat files to pseudo xml descriptions flatxml2html.py - converts a "flattened" xml description to html using the ocrtext stylexml2css.py - converts 
stylesheet "flattened" xml into css (as best it can) +getpagedim.py - extracts the page height and width from the "flattened" xml of a page*.dat file (genhtml.py uses the first text page) genxml.py - main program to convert everything to xml genhtml.py - main program to generate "book.html" gensvg.py - (author: clarknova) main program to create an svg grpahic of each page Please note, gensvg.py, genhtml.py, and genxml.py import and use -decode_meta.py, convert2xml.py, flatxml2html.py, and stylexml2css.py +decode_meta.py, convert2xml.py, flatxml2html.py, getpagedim.py, and stylexml2css.py so please keep all of these python scripts together in the same place. diff --git a/Topaz_Tools/lib/stylexml2css.py b/Topaz_Tools/lib/stylexml2css.py index ede6767..0d2739b 100644 --- a/Topaz_Tools/lib/stylexml2css.py +++ b/Topaz_Tools/lib/stylexml2css.py @@ -11,9 +11,11 @@ from struct import unpack class DocParser(object): - def __init__(self, flatxml, fontsize): + def __init__(self, flatxml, fontsize, ph, pw): self.flatdoc = flatxml.split('\n') self.fontsize = int(fontsize) + self.ph = int(ph) * 1.0 + self.pw = int(pw) * 1.0 stags = { 'paragraph' : 'p', @@ -106,14 +108,14 @@ class DocParser(object): # get the style class (pos, sclass) = self.findinDoc('style.class',start,end) if sclass != None: - sclass = '.cl_' + sclass.lower() + sclass = '.cl-' + sclass.lower() else : sclass = '' # check for any "after class" specifiers (pos, aftclass) = self.findinDoc('style._after_class',start,end) if aftclass != None: - aftclass = '.cl_' + aftclass.lower() + aftclass = '.cl-' + aftclass.lower() else : aftclass = '' @@ -121,8 +123,8 @@ class DocParser(object): while True : - (pos, attr) = self.findinDoc('style.rule.attr', start, end) - (pos, val) = self.findinDoc('style.rule.value', start, end) + (pos1, attr) = self.findinDoc('style.rule.attr', start, end) + (pos2, val) = self.findinDoc('style.rule.value', start, end) if attr == None : break @@ -135,28 +137,34 @@ class DocParser(object): # handle value based attributes if attr in self.attr_val_map 
: name = self.attr_val_map[attr] - scale = self.fontsize - if attr == 'line-space': scale = scale * 1.41 + if attr in ('margin-bottom', 'margin-top', 'space-after') : + scale = self.ph + elif attr in ('margin-right', 'indent', 'margin-left', 'hang') : + scale = self.pw + elif attr == 'line-space': + scale = self.fontsize * 2.0 + if not ((attr == 'hang') and (int(val) == 0)) : - ems = int(val)/scale - cssargs[attr] = (self.attr_val_map[attr], ems) + pv = float(val)/scale + cssargs[attr] = (self.attr_val_map[attr], pv) keep = True - start = pos + 1 + start = max(pos1, pos2) + 1 # disable all of the after class tags until I figure out how to handle them if aftclass != "" : keep = False if keep : - # make sure line-space does not go below 1em + # make sure line-space does not go below 100% or above 300% since + # it can be wacky in some styles if 'line-space' in cssargs: seg = cssargs['line-space'][0] val = cssargs['line-space'][1] if val < 1.0: val = 1.0 + if val > 3.0: val = 3.0 del cssargs['line-space'] cssargs['line-space'] = (self.attr_val_map['line-space'], val) - # handle modifications for css style hanging indents if 'hang' in cssargs: @@ -166,11 +174,13 @@ class DocParser(object): cssargs['hang'] = (self.attr_val_map['hang'], -hval) mval = 0 mseg = 'margin-left: ' + mval = hval if 'margin-left' in cssargs: mseg = cssargs['margin-left'][0] mval = cssargs['margin-left'][1] + if mval < 0: mval = 0 mval = hval + mval - cssargs['margin-left'] = (mseg, mval) + cssargs['margin-left'] = (mseg, mval) if 'indent' in cssargs: del cssargs['indent'] @@ -181,7 +191,7 @@ class DocParser(object): if mval == '': cssline += mseg + ' ' else : - aseg = mseg + '%.1fem;' % mval + aseg = mseg + '%.1f%%;' % (mval * 100.0) cssline += aseg + ' ' cssline += '}' @@ -213,10 +223,14 @@ class DocParser(object): -def convert2CSS(flatxml, fontsize): +def convert2CSS(flatxml, fontsize, ph, pw): + + print ' ', 'Using font size:',fontsize + print ' ', 'Using page height:', ph + print ' ', 'Using 
page width:', pw # create a document parser - dp = DocParser(flatxml, fontsize) + dp = DocParser(flatxml, fontsize, ph, pw) csspage = dp.process()