diff --git a/Topaz_Tools/lib/convert2xml.py b/Topaz_Tools/lib/convert2xml.py index 86d08d4..4bec36f 100644 --- a/Topaz_Tools/lib/convert2xml.py +++ b/Topaz_Tools/lib/convert2xml.py @@ -160,101 +160,159 @@ class PageParser(object): # tag : (number of arguments, argument type, subtags present, special case of subtags presents when escaped) token_tags = { - 'book' : (1, 'snippets', 1, 0), - 'version' : (1, 'snippets', 1, 0), - 'stylesheet' : (1, 'snippets', 1, 0), - 'links' : (0, 'number', 0, 1), - 'pages' : (0, 'number', 0, 1), - 'page' : (1, 'snippets', 1, 0), - 'group' : (1, 'snippets', 1, 0), - 'region' : (1, 'snippets', 1, 0), - 'reflow' : (1, 'number', 1, 0), - 'img' : (1, 'snippets', 1, 0), - 'paragraph' : (1, 'snippets', 1, 0), - 'extratokens' : (1, 'snippets', 1, 0), - 'style' : (1, 'snippets', 1, 0), - 'rule' : (1, 'snippets', 1, 0), - '_span' : (1, 'snippets', 1, 0), - 'word_semantic': (1, 'snippets', 1, 1), - 'value' : (1, 'scalar_text', 0, 0), + 'x' : (1, 'scalar_number', 0, 0), + 'y' : (1, 'scalar_number', 0, 0), 'h' : (1, 'scalar_number', 0, 0), 'w' : (1, 'scalar_number', 0, 0), 'firstWord' : (1, 'scalar_number', 0, 0), 'lastWord' : (1, 'scalar_number', 0, 0), - 'x' : (1, 'number', 0, 0), - 'y' : (1, 'number', 0, 0), + 'rootID' : (1, 'scalar_number', 0, 0), + 'stemID' : (1, 'scalar_number', 0, 0), + 'type' : (1, 'scalar_text', 0, 0), + + 'info' : (0, 'number', 1, 0), + + 'info.word' : (0, 'number', 1, 1), + 'info.word.ocrText' : (1, 'text', 0, 0), + 'info.word.firstGlyph' : (1, 'raw', 0, 0), + 'info.word.lastGlyph' : (1, 'raw', 0, 0), + 'info.word.bl' : (1, 'raw', 0, 0), + 'info.word.link_id' : (1, 'number', 0, 0), + + 'glyph' : (0, 'number', 1, 1), + 'glyph.x' : (1, 'number', 0, 0), + 'glyph.y' : (1, 'number', 0, 0), + 'glyph.glyphID' : (1, 'number', 0, 0), + + 'dehyphen' : (0, 'number', 1, 1), + 'dehyphen.rootID' : (1, 'number', 0, 0), + 'dehyphen.stemID' : (1, 'number', 0, 0), + 'dehyphen.stemPage' : (1, 'number', 0, 0), + 'dehyphen.sh' : (1, 
'number', 0, 0), + + 'links' : (0, 'number', 1, 1), 'links.page' : (1, 'number', 0, 0), - 'link_id' : (1, 'number', 0, 0), - 'glyph' : (0, 'number', 1, 1), + 'links.rel' : (1, 'number', 0, 0), + 'links.row' : (1, 'number', 0, 0), + 'links.title' : (1, 'text', 0, 0), + 'links.href' : (1, 'text', 0, 0), + 'links.type' : (1, 'text', 0, 0), + + 'paraCont' : (0, 'number', 1, 1), + 'paraCont.rootID' : (1, 'number', 0, 0), + 'paraCont.stemID' : (1, 'number', 0, 0), + 'paraCont.stemPage' : (1, 'number', 0, 0), + + 'paraStems' : (0, 'number', 1, 1), + 'paraStems.stemID' : (1, 'number', 0, 0), + + 'wordStems' : (0, 'number', 1, 1), + 'wordStems.stemID' : (1, 'number', 0, 0), + + 'page' : (1, 'snippets', 1, 0), + 'page.pageid' : (1, 'scalar_text', 0, 0), + 'page.pagelabel' : (1, 'scalar_text', 0, 0), + 'page.type' : (1, 'scalar_text', 0, 0), + 'page.h' : (1, 'scalar_number', 0, 0), + 'page.w' : (1, 'scalar_number', 0, 0), + 'page.startID' : (1, 'scalar_number', 0, 0), + + 'group' : (1, 'snippets', 1, 0), + 'group.type' : (1, 'scalar_text', 0, 0), + + 'region' : (1, 'snippets', 1, 0), + 'region.type' : (1, 'scalar_text', 0, 0), + 'region.x' : (1, 'scalar_number', 0, 0), + 'region.y' : (1, 'scalar_number', 0, 0), + 'region.h' : (1, 'scalar_number', 0, 0), + 'region.w' : (1, 'scalar_number', 0, 0), + + 'img' : (1, 'snippets', 1, 0), + 'img.x' : (1, 'scalar_number', 0, 0), + 'img.y' : (1, 'scalar_number', 0, 0), + 'img.h' : (1, 'scalar_number', 0, 0), + 'img.w' : (1, 'scalar_number', 0, 0), + 'img.src' : (1, 'scalar_number', 0, 0), + + 'paragraph' : (1, 'snippets', 1, 0), + 'paragraph.class' : (1, 'scalar_text', 0, 0), + 'paragraph.firstWord' : (1, 'scalar_number', 0, 0), + 'paragraph.lastWord' : (1, 'scalar_number', 0, 0), + + 'word_semantic' : (1, 'snippets', 1, 1), + 'word_semantic.type' : (1, 'scalar_text', 0, 0), + 'word_semantic.firstWord' : (1, 'scalar_number', 0, 0), + 'word_semantic.lastWord' : (1, 'scalar_number', 0, 0), + + 'word' : (1, 'snippets', 1, 0), + 'word.type' 
: (1, 'scalar_text', 0, 0), + 'word.class' : (1, 'scalar_text', 0, 0), + + '_span' : (1, 'snippets', 1, 0), + '_span.firstWord' : (1, 'scalar_number', 0, 0), + '-span.lastWord' : (1, 'scalar_number', 0, 0), + + 'extratokens' : (1, 'snippets', 1, 0), + 'extratokens.type' : (1, 'scalar_text', 0, 0), + 'extratokens.firstGlyph' : (1, 'scalar_number', 0, 0), + 'extratokens.lastGlyph' : (1, 'scalar_number', 0, 0), + 'glyph.h' : (1, 'number', 0, 0), 'glyph.w' : (1, 'number', 0, 0), - 'sh' : (1, 'number', 0, 0), - 'word' : (0, 'number', 1, 1), - 'src' : (1, 'scalar_number', 0, 0), - 'rel' : (1, 'number', 0, 0), - 'row' : (1, 'number', 0, 0), - 'startID' : (1, 'number', 0, 1), + 'glyph.use' : (1, 'number', 0, 0), + 'glyph.vtx' : (1, 'number', 0, 1), + 'glyph.len' : (1, 'number', 0, 1), + 'glyph.dpi' : (1, 'number', 0, 0), + 'vtx' : (0, 'number', 1, 1), + 'vtx.x' : (1, 'number', 0, 0), + 'vtx.y' : (1, 'number', 0, 0), + 'len' : (0, 'number', 1, 1), + 'len.n' : (1, 'number', 0, 0), + + 'book' : (1, 'snippets', 1, 0), + 'version' : (1, 'snippets', 1, 0), + 'version.FlowEdit_1_id' : (1, 'scalar_text', 0, 0), + 'version.FlowEdit_1_version' : (1, 'scalar_text', 0, 0), + 'version.Schema_id' : (1, 'scalar_text', 0, 0), + 'version.Schema_version' : (1, 'scalar_text', 0, 0), + 'version.Topaz_version' : (1, 'scalar_text', 0, 0), + 'version.WordDetailEdit_1_id' : (1, 'scalar_text', 0, 0), + 'version.WordDetailEdit_1_version' : (1, 'scalar_text', 0, 0), + 'version.ZoneEdit_1_id' : (1, 'scalar_text', 0, 0), + 'version.ZoneEdit_1_version' : (1, 'scalar_text', 0, 0), + 'version.chapterheaders' : (1, 'scalar_text', 0, 0), + 'version.creation_date' : (1, 'scalar_text', 0, 0), + 'version.header_footer' : (1, 'scalar_text', 0, 0), + 'version.init_from_ocr' : (1, 'scalar_text', 0, 0), + 'version.letter_insertion' : (1, 'scalar_text', 0, 0), + 'version.xmlinj_convert' : (1, 'scalar_text', 0, 0), + 'version.xmlinj_reflow' : (1, 'scalar_text', 0, 0), + 'version.xmlinj_transform' : (1, 
'scalar_text', 0, 0), + 'version.findlists' : (1, 'scalar_text', 0, 0), + 'version.page_num' : (1, 'scalar_text', 0, 0), + 'version.page_type' : (1, 'scalar_text', 0, 0), + + 'stylesheet' : (1, 'snippets', 1, 0), + 'style' : (1, 'snippets', 1, 0), + 'style._tag' : (1, 'scalar_text', 0, 0), + 'style.type' : (1, 'scalar_text', 0, 0), + 'style._parent_type' : (1, 'scalar_text', 0, 0), + 'style.class' : (1, 'scalar_text', 0, 0), + 'style._after_class' : (1, 'scalar_text', 0, 0), + 'rule' : (1, 'snippets', 1, 0), + 'rule.attr' : (1, 'scalar_text', 0, 0), + 'rule.value' : (1, 'scalar_text', 0, 0), + + 'original' : (0, 'number', 1, 1), + 'original.pnum' : (1, 'number', 0, 0), + 'original.pid' : (1, 'text', 0, 0), + 'pages' : (0, 'number', 1, 1), + 'pages.ref' : (1, 'number', 0, 0), + 'pages.id' : (1, 'number', 0, 0), + 'startID' : (0, 'number', 1, 1), 'startID.page' : (1, 'number', 0, 0), - 'glyphID' : (1, 'number', 0, 0), - 'rootID' : (1, 'number', 0, 0), - 'stemID' : (1, 'number', 0, 0), - 'margin-top' : (1, 'number', 0, 0), - 'stemPage' : (1, 'number', 0, 0), - 'dehyphen' : (1, 'number', 1, 1), - 'rootID' : (1, 'number', 0, 0), - 'paraCont' : (1, 'number', 1, 1), - 'paraStems' : (1, 'number', 1, 1), - 'wordStems' : (1, 'number', 1, 1), - 'original' : (0, 'number', 0, 1), - 'use' : (1, 'number', 0, 0), - 'vtx' : (1, 'number', 0, 1), - 'len' : (1, 'number', 0, 1), - 'dpi' : (1, 'number', 0, 0), - 'n' : (1, 'number', 0, 0), - 'id' : (1, 'number', 0, 0), - 'ref' : (1, 'number', 0, 0), - 'pnum' : (1, 'number', 0, 0), - 'pid' : (1, 'text', 0, 0), - 'info' : (0, 'number', 1, 0), - 'bl' : (1, 'raw', 0, 0), - 'firstGlyph' : (1, 'raw', 0, 0), - 'lastGlyph' : (1, 'raw', 0, 0), - 'ocrText' : (1, 'text', 0, 0), - 'title' : (1, 'text', 0, 0), - 'href' : (1, 'text', 0, 0), - '_parent_type' : (1, 'text', 0, 0), - 'attr' : (1, 'scalar_text', 0, 0), - 'justify' : (1, 'scalar_text', 0, 0), - 'align' : (1, 'scalar_text', 0, 0), - 'layout' : (1, 'scalar_text', 0, 0), - 'pageid' : (1, 
'scalar_text', 0, 0), - 'pagelabel' : (1, 'scalar_text', 0, 0), - 'type' : (1, 'text', 0, 0), - 'class' : (1, 'scalar_text', 0, 0), - 'container' : (1, 'scalar_text', 0, 0), - '_after_class' : (1, 'scalar_text', 0, 0), - '_tag' : (1, 'scalar_text', 0, 0), - 'pos' : (1, 'scalar_text', 0, 0), - 'page_num' : (1, 'scalar_text', 0, 0), - 'page_type' : (1, 'scalar_text', 0, 0), - 'findlists' : (1, 'scalar_text', 0, 0), - 'FlowEdit_1_id' : (1, 'scalar_text', 0, 0), - 'FlowEdit_1_version' : (1, 'scalar_text', 0, 0), - 'Schema_id' : (1, 'scalar_text', 0, 0), - 'Schema_version' : (1, 'scalar_text', 0, 0), - 'Topaz_version' : (1, 'scalar_text', 0, 0), - 'WordDetailEdit_1_id' : (1, 'scalar_text', 0, 0), - 'WordDetailEdit_1_version' : (1, 'scalar_text', 0, 0), - 'ZoneEdit_1_id' : (1, 'scalar_text', 0, 0), - 'ZoneEdit_1_version' : (1, 'scalar_text', 0, 0), - 'chapterheaders' : (1, 'scalar_text', 0, 0), - 'creation_date' : (1, 'scalar_text', 0, 0), - 'header_footer' : (1, 'scalar_text', 0, 0), - 'init_from_ocr' : (1, 'scalar_text', 0, 0), - 'letter_insertion' : (1, 'scalar_text', 0, 0), - 'xmlinj_convert' : (1, 'scalar_text', 0, 0), - 'xmlinj_reflow' : (1, 'scalar_text', 0, 0), - 'xmlinj_transform' : (1, 'scalar_text', 0, 0), + 'startID.id' : (1, 'number', 0, 0), + } @@ -404,101 +462,25 @@ class PageParser(object): return - # loop: pass though values unchanged - # DO NOT CHANGE - this has proven to be correct - def doLoop76Mode0(self, argtype, cnt): - result = [] + + # general loop code gracisouly submitted by "skindle" - thank you! 
+ def doLoop76Mode(self, argtype, cnt, mode): + result = [] + adj = 0 + if mode & 1: + adj = readEncodedNumber(self.fo) + mode = mode >> 1 + x = [] for i in xrange(cnt): - result.append(self.formatArg(readEncodedNumber(self.fo), argtype)) - return result - - - # loop generating values relative to the *negative* - # of the offset - don't ask why - it just is - # DO NOT CHANGE - this has proven to be correct - def doLoop76Mode1(self, argtype, cnt): - result = [] - offset = -readEncodedNumber(self.fo) + x.append(readEncodedNumber(self.fo) - adj) + for i in xrange(mode): + for j in xrange(1, cnt): + x[j] = x[j] + x[j - 1] for i in xrange(cnt): - val = readEncodedNumber(self.fo) + offset - result.append(self.formatArg(val, argtype)) + result.append(self.formatArg(x[i],argtype)) return result - # loop generating values with starting value and accumulation - # DO NOT CHANGE - this has proven to be the correct - def doLoop76Mode2(self, argtype, cnt): - result = [] - ptr = readEncodedNumber(self.fo) - result.append(self.formatArg(ptr, argtype)) - for i in xrange(cnt-1): - ptr = ptr + readEncodedNumber(self.fo) - result.append(self.formatArg(ptr, argtype)) - return result - - - # loop generating values with starting value and accumulation - # **after** subtracting adjustment value from each - # DO NOT CHANGE - this has been proven to be correct - def doLoop76Mode3(self, argtype, cnt): - result = [] - adj = readEncodedNumber(self.fo) - ptr = readEncodedNumber(self.fo) - ptr = ptr - adj - result.append(self.formatArg(ptr, argtype)) - for i in xrange(cnt-1): - ptr = ptr + readEncodedNumber(self.fo) - adj - result.append(self.formatArg(ptr,argtype)) - return result - - - # loop using runing sum of data values and starting value - # with accumulation to get new value - # Again, don't ask it took me forever to figure this out - # DO NOT CHANGE - this has been proven to be correct - def doLoop76Mode4(self, argtype, cnt): - result = [] - val = readEncodedNumber(self.fo) - runsum = 
val - ptr = val - result.append(self.formatArg(ptr, argtype)) - for i in xrange(cnt-1): - runsum += readEncodedNumber(self.fo) - ptr = ptr + runsum - result.append(self.formatArg(ptr,argtype)) - return result - - - # loop using and extra value as an adjustment - # and a running sum of the values after subtracting - # the adjustment, added to a ptr to get a new pointer - def doLoop76Mode5(self, argtype, cnt): - result = [] - adj = readEncodedNumber(self.fo) - ptr = 0 - runsum = 0 - for i in xrange(cnt): - val = readEncodedNumber(self.fo) - runsum += (val - adj) - ptr = ptr +runsum - result.append(self.formatArg(ptr,argtype)) - return result - - - # FIXME: I have only 4 points to work this out with inside my book - # So may be wrong but it is correct for my 4 points - def doLoop76Mode6(self, argtype, cnt): - result = [] - oldval = 0 - for i in xrange(cnt): - val = readEncodedNumber(self.fo) - ptr= (3 * oldval) + val + 1 - result.append(self.formatArg(ptr,argtype)) - oldval = val - return result - - - # dispatches loop commands bytes with various modes # The 0x76 style loops are used to build vectors @@ -507,57 +489,20 @@ class PageParser(object): # since they did not appear in the test cases def decodeCMD(self, cmd, argtype): - - # if (cmd == 0x72): - # self.doLoop72(argtype) - # result =[] - # return result - if (cmd == 0x76): + # loop with cnt, and mode to control loop styles cnt = readEncodedNumber(self.fo) mode = readEncodedNumber(self.fo) - if self.debug : print 'Loop for', cnt, 'with mode', mode, ': ' - - if (mode == 0x00): - return self.doLoop76Mode0(argtype, cnt) - - elif (mode == 0x01): - return self.doLoop76Mode1(argtype, cnt) - - elif (mode == 0x02): - return self.doLoop76Mode2(argtype, cnt) - - elif (mode == 0x03): - return self.doLoop76Mode3(argtype, cnt) - - elif (mode == 0x04): - return self.doLoop76Mode4(argtype, cnt) - - elif (mode == 0x05): - return self.doLoop76Mode5(argtype, cnt) - - elif (mode == 0x06): - return self.doLoop76Mode6(argtype, cnt) - 
- else: - - if self.debug : - # try to mark any unknown loop comands - # if they exist, unless they are used to process - # text or some other known list, we won't be able to prove them correct - print '*** Unknown Loop 0x%x %d %d :' % (cmd, cnt, mode) - for i in xrange(cnt): - val = readEncodedNumber(self.fo) - print ' 0x%x' % val, - print ' ' - result = [] - return result + if self.debug : print 'Loop for', cnt, 'with mode', mode, ': ' + return self.doLoop76Mode(argtype, cnt, mode) if self.dbug: print "Unknown command", cmd result = [] return result + + # add full tag path to injected snippets def updateName(self, tag, prefix): @@ -727,7 +672,7 @@ class PageParser(object): self.doc.append(tag) else: if self.debug: - print "Mina Loop: Unknown value: %x" % v + print "Main Loop: Unknown value: %x" % v # now do snippet injection diff --git a/Topaz_Tools/lib/flatxml2html.py b/Topaz_Tools/lib/flatxml2html.py index 1a800e8..f93318f 100644 --- a/Topaz_Tools/lib/flatxml2html.py +++ b/Topaz_Tools/lib/flatxml2html.py @@ -11,9 +11,16 @@ from struct import unpack class DocParser(object): - def __init__(self, flatxml, fileid): + def __init__(self, flatxml, classlst, fileid): self.id = os.path.basename(fileid).replace('.dat','') self.flatdoc = flatxml.split('\n') + self.classList = {} + tmpList = classlst.split('\n') + for pclass in tmpList: + if pclass != '': + # remove the leading period from the css name + cname = pclass[1:] + self.classList[cname] = True self.ocrtext = [] self.link_id = [] self.link_title = [] @@ -22,6 +29,18 @@ class DocParser(object): self.paracont_stemid = [] self.parastems_stemid = [] + # find tag if within pos to end inclusive + def lineinDoc(self, pos) : + docList = self.flatdoc + cnt = len(docList) + if (pos >= 0) and (pos < cnt) : + item = docList[pos] + if item.find('=') >= 0: + (name, argres) = item.split('=',1) + else : + name = item + argres = '' + return name, argres # find tag if within pos to end inclusive @@ -61,91 +80,161 @@ class 
DocParser(object): return startpos - # get a description of the paragraph + # build a description of the paragraph def getParaDescription(self, start, end): + + result = [] + # normal paragraph (pos, pclass) = self.findinDoc('paragraph.class',start,end) - # class names are an issue given topaz starts them with numerals (not allowed) - # use a mix of cases, (which cause some browsers problems), and actually - # attach numbers after "reclustered*" to the end to deal with reflow issues - # so we clean this up by lowercasing, prepend 'cl_', and remove all end pieces after reclustered + # class names are an issue given topaz may start them with numerals (not allowed), + # use a mix of cases (which cause some browsers problems), and actually + # attach numbers after "_reclustered*" to the end to deal with reflow issues + # but then not actually provide all of these _reclustereed classes in the stylesheet! + + # so we clean this up by lowercasing, prepend 'cl_', and if not in the class + # list from the stylesheet, trying once more with "_reclustered*" removed + # if still not in stylesheet, let it pass as is pclass = pclass.lower() pclass = 'cl_' + pclass - p = pclass.find('reclustered') - if p > 0 : pclass = pclass[0:p+11] + if pclass not in self.classList: + p = pclass.find('_reclustered') + if p > 0 : + baseclass = pclass[0:p] + if baseclass in self.classList: + pclass = baseclass + # build up a description of the paragraph in result and return it + # first check for the basic - all words paragraph (pos, sfirst) = self.findinDoc('paragraph.firstWord',start,end) (pos, slast) = self.findinDoc('paragraph.lastWord',start,end) if (sfirst != None) and (slast != None) : - return pclass, int(sfirst), int(slast) + first = int(sfirst) + last = int(slast) + for wordnum in xrange(first, last): + result.append(('ocr', wordnum)) + return pclass, result - # some paragraphs are instead split into multiple spans and some even have word_semantic tags as well - # so walk through this 
region keeping track of the first firstword, and the last lastWord - # on any items that have it - (pos, sfirst) = self.findinDoc('firstWord',start, end) - first = int(sfirst) - last = -1 - for i in xrange(pos+1,end): - (pos, slast) = self.findinDoc('lastWord',i,i+1) - if slast != None: - last = int(slast) - return pclass, first, last + # this type of paragrph may be made up of multiple _spans, inline + # word monograms (images) and words with semantic meaning + + # need to parse this type line by line + line = start + 1 + word_class = '' + while (line < end) : - def buildParagraph(self, cname, first, last, type, regtype) : + (name, argres) = self.lineinDoc(line) + + if name.endswith('_span.firstWord') : + first = int(argres) + (name, argres) = self.lineinDoc(line+1) + if not name.endswith('_span.lastWord'): + print 'Error: - incorrect _span ordering inside paragraph' + last = int(argres) + for wordnum in xrange(first, last): + result.append(('ocr', wordnum)) + line += 1 + + elif name.endswith('word.class'): + (cname, space) = argres.split('-',1) + if cname == 'spaceafter': + word_class = 'sa' + + elif name.endswith('word.img.src'): + result.append(('img' + word_class, int(argres))) + word_class = '' + + elif name.endswith('word_semantic.firstWord'): + first = int(argres) + (name, argres) = self.lineinDoc(line+1) + if not name.endswith('word_semantic.lastWord'): + print 'Error: - incorrect word_semantic ordering inside paragraph' + last = int(argres) + for wordnum in xrange(first, last): + result.append(('ocr', wordnum)) + line += 1 + + line += 1 + + return pclass, result + + + def buildParagraph(self, cname, pdesc, type, regtype) : parares = '' sep ='' + br_lb = False if (regtype == 'fixed') or (regtype == 'chapterheading') : br_lb = True + handle_links = False if len(self.link_id) > 0: handle_links = True + if (type == 'full') or (type == 'begin') : parares += '

' + if (type == 'end'): parares += ' ' - for j in xrange(first, last) : - word = self.ocrtext[j] - sep = ' ' - if handle_links: - link = self.link_id[j] - if (link > 0): - title = self.link_title[link-1] - if title == "": title='_link_' - ptarget = self.link_page[link-1] - 1 - linkhtml = '' % ptarget - linkhtml += title + '' - pos = parares.rfind(title) - if pos >= 0: - parares = parares[0:pos] + linkhtml + parares[pos+len(title):] + cnt = len(pdesc) + + for j in xrange( 0, cnt) : + + (wtype, num) = pdesc[j] + + if wtype == 'ocr' : + word = self.ocrtext[num] + sep = ' ' + + if handle_links: + link = self.link_id[num] + if (link > 0): + title = self.link_title[link-1] + if title == "": title='_link_' + ptarget = self.link_page[link-1] - 1 + linkhtml = '' % ptarget + linkhtml += title + '' + pos = parares.rfind(title) + if pos >= 0: + parares = parares[0:pos] + linkhtml + parares[pos+len(title):] + else : + parares += linkhtml + if word == '_link_' : word = '' + elif (link < 0) : + if word == '_link_' : word = '' + + if word == '_lb_': + if (num-1) in self.dehyphen_rootid : + word = '' + sep = '' + elif handle_links : + word = '' + sep = '' + elif br_lb : + word = '
\n' + sep = '' else : - parares += linkhtml - if word == '_link_' : word = '' - elif (link < 0) : - if word == '_link_' : word = '' + word = '\n' + sep = '' - if word == '_lb_': - if (j-1) in self.dehyphen_rootid : - word = '' - sep = '' - elif handle_links : - word = '' - sep = '' - elif br_lb : - word = '
\n' - sep = '' - else : - word = '\n' + if num in self.dehyphen_rootid : + word = word[0:-1] sep = '' - if j in self.dehyphen_rootid : - word = word[0:-1] + parares += word + sep + + elif wtype == 'img' : sep = '' + parares += '' % num + parares += sep - parares += word + sep + elif wtype == 'imgsa' : + sep = ' ' + parares += '' % num + parares += sep if len(sep) > 0 : parares = parares[0:-1] if (type == 'full') or (type == 'end') : @@ -222,7 +311,7 @@ class DocParser(object): htmlpage += '

' % int(simgsrc) elif regtype == 'chapterheading' : - (pclass, first, last) = self.getParaDescription(start,end) + (pclass, pdesc) = self.getParaDescription(start,end) if not breakSet: htmlpage += '
 
\n' breakSet = True @@ -234,7 +323,7 @@ class DocParser(object): if pclass[3:7] == 'ch2-' : tag = 'h2' if pclass[3:7] == 'ch3-' : tag = 'h3' htmlpage += '<' + tag + ' class="' + pclass + '">' - htmlpage += self.buildParagraph(pclass,first,last,'middle', regtype) + htmlpage += self.buildParagraph(pclass, pdesc, 'middle', regtype) htmlpage += '' elif (regtype == 'text') or (regtype == 'fixed') or (regtype == 'insert') : @@ -247,17 +336,17 @@ class DocParser(object): if not anchorSet: htmlpage += '
 
\n' anchorSet = True - (pclass, first, last) = self.getParaDescription(start,end) + (pclass, pdesc) = self.getParaDescription(start,end) if ptype == 'full' : tag = 'p' if pclass[3:6] == 'h1-' : tag = 'h4' if pclass[3:6] == 'h2-' : tag = 'h5' if pclass[3:6] == 'h3-' : tag = 'h6' htmlpage += '<' + tag + ' class="' + pclass + '">' - htmlpage += self.buildParagraph(pclass, first, last, 'middle', regtype) + htmlpage += self.buildParagraph(pclass, pdesc, 'middle', regtype) htmlpage += '' else : - htmlpage += self.buildParagraph(pclass, first, last, ptype, regtype) + htmlpage += self.buildParagraph(pclass, pdesc, ptype, regtype) elif (regtype == 'tocentry') : @@ -271,12 +360,43 @@ class DocParser(object): if not anchorSet: htmlpage += '
 
\n' anchorSet = True - (pclass, first, last) = self.getParaDescription(start,end) - htmlpage += self.buildParagraph(pclass, first, last, ptype, regtype) + (pclass, pdesc) = self.getParaDescription(start,end) + htmlpage += self.buildParagraph(pclass, pdesc, ptype, regtype) + + elif regtype == 'synth_fcvr.center' : + if not anchorSet: + htmlpage += '
 
\n' + anchorSet = True + (pos, simgsrc) = self.findinDoc('img.src',start,end) + if simgsrc: + htmlpage += '
' % int(simgsrc) else : - print 'Unknown region type', regtype - print 'Warning: skipping this region' + print 'Warning: Unknown region type', regtype + print 'Treating this like a "fixed" region' + regtype = 'fixed' + ptype = 'full' + # check to see if this is a continuation from the previous page + if (len(self.parastems_stemid) > 0): + ptype = 'end' + self.parastems_stemid=[] + else: + if not anchorSet: + htmlpage += '
 
\n' + anchorSet = True + (pclass, desc) = self.getParaDescription(start,end) + if ptype == 'full' : + tag = 'p' + if pclass[3:6] == 'h1-' : tag = 'h4' + if pclass[3:6] == 'h2-' : tag = 'h5' + if pclass[3:6] == 'h3-' : tag = 'h6' + htmlpage += '<' + tag + ' class="' + pclass + '">' + htmlpage += self.buildParagraph(pclass, pdesc, 'middle', regtype) + htmlpage += '' + else : + htmlpage += self.buildParagraph(pclass, pdesc, ptype, regtype) + + if len(self.paracont_stemid) > 0 : if htmlpage[-4:] == '

': @@ -289,10 +409,10 @@ class DocParser(object): -def convert2HTML(flatxml, fileid): +def convert2HTML(flatxml, classlst, fileid): # create a document parser - dp = DocParser(flatxml, fileid) + dp = DocParser(flatxml, classlst, fileid) htmlpage = dp.process() diff --git a/Topaz_Tools/lib/genhtml.py b/Topaz_Tools/lib/genhtml.py index be50aae..05261c9 100644 --- a/Topaz_Tools/lib/genhtml.py +++ b/Topaz_Tools/lib/genhtml.py @@ -95,22 +95,27 @@ def main(argv): htmlstr += '\n' htmlstr += '\n' + # get some scaling info from metadata to use while processing styles + fontsize = '135' + if 'fontSize' in meta_array: + fontsize = meta_array['fontSize'] + print ' ', 'other0000.dat' fname = os.path.join(bookDir,'other0000.dat') xname = os.path.join(bookDir, 'style.css') xmlstr = convert2xml.main('convert2xml.py --flat-xml ' + dictFile + ' ' + fname) - cssstr = '\n' + htmlstr += '\n' htmlstr += '\n\n' for filename in filenames: print ' ', filename fname = os.path.join(pageDir,filename) flat_xml = convert2xml.main('convert2xml.py --flat-xml ' + dictFile + ' ' + fname) - htmlstr += flatxml2html.convert2HTML(flat_xml, fname) + htmlstr += flatxml2html.convert2HTML(flat_xml, classlst, fname) htmlstr += '\n\n' diff --git a/Topaz_Tools/lib/gensvg.py b/Topaz_Tools/lib/gensvg.py index 7df8043..ec1e9f1 100644 --- a/Topaz_Tools/lib/gensvg.py +++ b/Topaz_Tools/lib/gensvg.py @@ -10,286 +10,301 @@ import decode_meta class GParser(object): - def __init__(self, flatxml): - self.flatdoc = flatxml.split('\n') - self.dpi = 1440 - self.gh = self.getData('info.glyph.h') - self.gw = self.getData('info.glyph.w') - self.guse = self.getData('info.glyph.use') - self.count = len(self.guse) - self.gvtx = self.getData('info.glyph.vtx') - self.glen = self.getData('info.glyph.len') - self.gdpi = self.getData('info.glyph.dpi') - self.vx = self.getData('info.vtx.x') - self.vy = self.getData('info.vtx.y') - self.vlen = self.getData('info.len.n') - self.glen.append(len(self.vlen)) - 
self.gvtx.append(len(self.vx)) + def __init__(self, flatxml): + self.flatdoc = flatxml.split('\n') + self.dpi = 1440 + self.gh = self.getData('info.glyph.h') + self.gw = self.getData('info.glyph.w') + self.guse = self.getData('info.glyph.use') + self.count = len(self.guse) + self.gvtx = self.getData('info.glyph.vtx') + self.glen = self.getData('info.glyph.len') + self.gdpi = self.getData('info.glyph.dpi') + self.vx = self.getData('info.vtx.x') + self.vy = self.getData('info.vtx.y') + self.vlen = self.getData('info.len.n') + self.glen.append(len(self.vlen)) + self.gvtx.append(len(self.vx)) - def getData(self, path): - result = None - cnt = len(self.flatdoc) - for j in xrange(cnt): - item = self.flatdoc[j] - if item.find('=') >= 0: - (name, argt) = item.split('=') - argres = argt.split('|') - else: - name = item - argres = [] - if (name == path): - result = argres - break - if (len(argres) > 0) : - for j in xrange(0,len(argres)): - argres[j] = int(argres[j]) - return result + def getData(self, path): + result = None + cnt = len(self.flatdoc) + for j in xrange(cnt): + item = self.flatdoc[j] + if item.find('=') >= 0: + (name, argt) = item.split('=') + argres = argt.split('|') + else: + name = item + argres = [] + if (name == path): + result = argres + break + if (len(argres) > 0) : + for j in xrange(0,len(argres)): + argres[j] = int(argres[j]) + return result - def getPath(self, gly): - path = '' - if (gly < 0) or (gly >= self.count): - return path - tx = self.vx[self.gvtx[gly]:self.gvtx[gly+1]-1] - ty = self.vy[self.gvtx[gly]:self.gvtx[gly+1]-1] - p = 0 - for k in xrange(self.glen[gly], self.glen[gly+1]): - if (p == 0): - zx = tx[0:self.vlen[k]+1] - zy = ty[0:self.vlen[k]+1] - else: - zx = tx[self.vlen[k-1]+1:self.vlen[k]+1] - zy = ty[self.vlen[k-1]+1:self.vlen[k]+1] - p += 1 - for j in xrange(0, len(zx)): - if (j == 0): - path += 'M %d %d ' % (zx[j] * self.dpi / self.gdpi[gly], zy[j] * self.dpi / self.gdpi[gly]) - else: - path += 'L %d %d ' % (zx[j] * self.dpi / 
self.gdpi[gly], zy[j] * self.dpi / self.gdpi[gly]) - path += 'z' - return path + def getPath(self, gly): + path = '' + if (gly < 0) or (gly >= self.count): + return path + tx = self.vx[self.gvtx[gly]:self.gvtx[gly+1]-1] + ty = self.vy[self.gvtx[gly]:self.gvtx[gly+1]-1] + p = 0 + for k in xrange(self.glen[gly], self.glen[gly+1]): + if (p == 0): + zx = tx[0:self.vlen[k]+1] + zy = ty[0:self.vlen[k]+1] + else: + zx = tx[self.vlen[k-1]+1:self.vlen[k]+1] + zy = ty[self.vlen[k-1]+1:self.vlen[k]+1] + p += 1 + j = 0 + while ( j < len(zx) ): + if (j == 0): + # Start Position. + path += 'M %d %d ' % (zx[j] * self.dpi / self.gdpi[gly], zy[j] * self.dpi / self.gdpi[gly]) + elif (j <= len(zx)-3): + # Cubic Bezier Curve + path += 'C %d %d %d %d %d %d ' % (zx[j] * self.dpi / self.gdpi[gly], zy[j] * self.dpi / self.gdpi[gly], zx[j+1] * self.dpi / self.gdpi[gly], zy[j+1] * self.dpi / self.gdpi[gly], zx[j+2] * self.dpi / self.gdpi[gly], zy[j+2] * self.dpi / self.gdpi[gly]) + j += 2 + elif (j == len(zx)-2): + # Cubic Bezier Curve to Start Position + path += 'C %d %d %d %d %d %d ' % (zx[j] * self.dpi / self.gdpi[gly], zy[j] * self.dpi / self.gdpi[gly], zx[j+1] * self.dpi / self.gdpi[gly], zy[j+1] * self.dpi / self.gdpi[gly], zx[0] * self.dpi / self.gdpi[gly], zy[0] * self.dpi / self.gdpi[gly]) + j += 1 + elif (j == len(zx)-1): + # Quadratic Bezier Curve to Start Position + path += 'Q %d %d %d %d ' % (zx[j] * self.dpi / self.gdpi[gly], zy[j] * self.dpi / self.gdpi[gly], zx[0] * self.dpi / self.gdpi[gly], zy[0] * self.dpi / self.gdpi[gly]) + + j += 1 + path += 'z' + return path class PParser(object): - def __init__(self, flatxml): - self.flatdoc = flatxml.split('\n') - self.temp = [] - self.ph = self.getData('page.h')[0] - self.pw = self.getData('page.w')[0] - self.gx = self.getData('info.glyph.x') - self.gy = self.getData('info.glyph.y') - self.gid = self.getData('info.glyph.glyphID') + def __init__(self, flatxml): + self.flatdoc = flatxml.split('\n') + self.temp = [] + foo = 
self.getData('page.h') or self.getData('book.h') + self.ph = foo[0] + foo = self.getData('page.w') or self.getData('book.w') + self.pw = foo[0] + self.gx = self.getData('info.glyph.x') + self.gy = self.getData('info.glyph.y') + self.gid = self.getData('info.glyph.glyphID') - def getData(self, path): - result = None - cnt = len(self.flatdoc) - for j in xrange(cnt): - item = self.flatdoc[j] - if item.find('=') >= 0: - (name, argt) = item.split('=') - argres = argt.split('|') - else: - name = item - argres = [] - if (name.endswith(path)): - result = argres - break - if (len(argres) > 0) : - for j in xrange(0,len(argres)): - argres[j] = int(argres[j]) - return result + def getData(self, path): + result = None + cnt = len(self.flatdoc) + for j in xrange(cnt): + item = self.flatdoc[j] + if item.find('=') >= 0: + (name, argt) = item.split('=') + argres = argt.split('|') + else: + name = item + argres = [] + if (name.endswith(path)): + result = argres + break + if (len(argres) > 0) : + for j in xrange(0,len(argres)): + argres[j] = int(argres[j]) + return result - def getDataTemp(self, path): - result = None - cnt = len(self.temp) - for j in xrange(cnt): - item = self.temp[j] - if item.find('=') >= 0: - (name, argt) = item.split('=') - argres = argt.split('|') - else: - name = item - argres = [] - if (name.endswith(path)): - result = argres - self.temp.pop(j) - break - if (len(argres) > 0) : - for j in xrange(0,len(argres)): - argres[j] = int(argres[j]) - return result + def getDataTemp(self, path): + result = None + cnt = len(self.temp) + for j in xrange(cnt): + item = self.temp[j] + if item.find('=') >= 0: + (name, argt) = item.split('=') + argres = argt.split('|') + else: + name = item + argres = [] + if (name.endswith(path)): + result = argres + self.temp.pop(j) + break + if (len(argres) > 0) : + for j in xrange(0,len(argres)): + argres[j] = int(argres[j]) + return result - def getImages(self): - result = [] - self.temp = self.flatdoc - while 
(self.getDataTemp('region.img') != None): - h = self.getDataTemp('region.img.h')[0] - w = self.getDataTemp('region.img.w')[0] - x = self.getDataTemp('region.img.x')[0] - y = self.getDataTemp('region.img.y')[0] - src = self.getDataTemp('region.img.src')[0] - result.append('\n' % (src, x, y, w, h)) - return result + def getImages(self): + result = [] + self.temp = self.flatdoc + while (self.getDataTemp('img') != None): + h = self.getDataTemp('img.h')[0] + w = self.getDataTemp('img.w')[0] + x = self.getDataTemp('img.x')[0] + y = self.getDataTemp('img.y')[0] + src = self.getDataTemp('img.src')[0] + result.append('\n' % (src, x, y, w, h)) + return result - def getGlyphs(self,glyfname): - result = [] - if (self.gid != None) and (len(self.gid) > 0): - glyphs = [] - for j in set(self.gid): - glyphs.append(j) - glyphs.sort() - gfile = open(glyfname, 'r') - j = 0 - while True : - inp = gfile.readline() - if (inp == ''): - break - id='id="gl%d"' % glyphs[j] - if (inp.find(id) > 0): - result.append(inp) - j += 1 - if (j == len(glyphs)): - break - gfile.close() - return result + def getGlyphs(self,glyfname): + result = [] + if (self.gid != None) and (len(self.gid) > 0): + glyphs = [] + for j in set(self.gid): + glyphs.append(j) + glyphs.sort() + gfile = open(glyfname, 'r') + j = 0 + while True : + inp = gfile.readline() + if (inp == ''): + break + id='id="gl%d"' % glyphs[j] + if (inp.find(id) > 0): + result.append(inp) + j += 1 + if (j == len(glyphs)): + break + gfile.close() + return result def usage(): - print 'Usage: ' - print ' ' - print ' gensvg.py unencryptedBookDir' - print ' ' + print 'Usage: ' + print ' ' + print ' gensvg.py unencryptedBookDir' + print ' ' def main(argv): - bookDir = '' + bookDir = '' - if len(argv) == 0: - argv = sys.argv - else : - argv = argv.split() + if len(argv) == 0: + argv = sys.argv + else : + argv = argv.split() - try: - opts, args = getopt.getopt(argv[1:], "h:") + try: + opts, args = getopt.getopt(argv[1:], "h:") - except getopt.GetoptError, 
err: - print str(err) - usage() - sys.exit(2) + except getopt.GetoptError, err: + print str(err) + usage() + sys.exit(2) - if len(opts) == 0 and len(args) == 0 : - usage() - sys.exit(2) + if len(opts) == 0 and len(args) == 0 : + usage() + sys.exit(2) - for o, a in opts: - if o =="-h": - usage() - sys.exit(0) + for o, a in opts: + if o =="-h": + usage() + sys.exit(0) - bookDir = args[0] + bookDir = args[0] - if not os.path.exists(bookDir) : - print "Can not find directory with unencrypted book" - sys.exit(-1) + if not os.path.exists(bookDir) : + print "Can not find directory with unencrypted book" + sys.exit(-1) - dictFile = os.path.join(bookDir,'dict0000.dat') + dictFile = os.path.join(bookDir,'dict0000.dat') - if not os.path.exists(dictFile) : - print "Can not find dict0000.dat file" - sys.exit(-1) + if not os.path.exists(dictFile) : + print "Can not find dict0000.dat file" + sys.exit(-1) - pageDir = os.path.join(bookDir,'page') - if not os.path.exists(pageDir) : - print "Can not find page directory in unencrypted book" - sys.exit(-1) + pageDir = os.path.join(bookDir,'page') + if not os.path.exists(pageDir) : + print "Can not find page directory in unencrypted book" + sys.exit(-1) - imgDir = os.path.join(bookDir,'img') - if not os.path.exists(imgDir) : - print "Can not find image directory in unencrypted book" - sys.exit(-1) + imgDir = os.path.join(bookDir,'img') + if not os.path.exists(imgDir) : + print "Can not find image directory in unencrypted book" + sys.exit(-1) - glyphsDir = os.path.join(bookDir,'glyphs') - if not os.path.exists(glyphsDir) : - print "Can not find glyphs directory in unencrypted book" - sys.exit(-1) + glyphsDir = os.path.join(bookDir,'glyphs') + if not os.path.exists(glyphsDir) : + print "Can not find glyphs directory in unencrypted book" + sys.exit(-1) - metaFile = os.path.join(bookDir,'metadata0000.dat') - if not os.path.exists(metaFile) : - print "Can not find metadata0000.dat in unencrypted book" - sys.exit(-1) + metaFile = 
os.path.join(bookDir,'metadata0000.dat') + if not os.path.exists(metaFile) : + print "Can not find metadata0000.dat in unencrypted book" + sys.exit(-1) - svgDir = os.path.join(bookDir,'svg') - if not os.path.exists(svgDir) : - os.makedirs(svgDir) + svgDir = os.path.join(bookDir,'svg') + if not os.path.exists(svgDir) : + os.makedirs(svgDir) - print 'Processing Meta Data ... ' + print 'Processing Meta Data ... ' - print ' ', 'metadata0000.dat' - fname = os.path.join(bookDir,'metadata0000.dat') - metadata = decode_meta.getMetaArray(fname) + print ' ', 'metadata0000.dat' + fname = os.path.join(bookDir,'metadata0000.dat') + metadata = decode_meta.getMetaArray(fname) - print 'Processing Glyphs ... ' + print 'Processing Glyphs ... ' - filenames = os.listdir(glyphsDir) - filenames = sorted(filenames) + filenames = os.listdir(glyphsDir) + filenames = sorted(filenames) - glyfname = os.path.join(svgDir,'glyphs.svg') - glyfile = open(glyfname, 'w') - glyfile.write('\n') - glyfile.write('\n') - glyfile.write('\n') - glyfile.write('Glyphs for %s\n' % metadata['Title']) - glyfile.write('\n') - counter = 0 - for filename in filenames: - print ' ', filename - fname = os.path.join(glyphsDir,filename) - flat_xml = convert2xml.main('convert2xml.py --flat-xml ' + dictFile + ' ' + fname) - gp = GParser(flat_xml) - for i in xrange(0, gp.count): - path = gp.getPath(i) - glyfile.write('\n' % (counter * 256 + i, path)) - counter += 1 - glyfile.write('\n') - glyfile.write('\n') - glyfile.close() + glyfname = os.path.join(svgDir,'glyphs.svg') + glyfile = open(glyfname, 'w') + glyfile.write('\n') + glyfile.write('\n') + glyfile.write('\n') + glyfile.write('Glyphs for %s\n' % metadata['Title']) + glyfile.write('\n') + counter = 0 + for filename in filenames: + print ' ', filename + fname = os.path.join(glyphsDir,filename) + flat_xml = convert2xml.main('convert2xml.py --flat-xml ' + dictFile + ' ' + fname) + gp = GParser(flat_xml) + for i in xrange(0, gp.count): + path = gp.getPath(i) + 
glyfile.write('\n' % (counter * 256 + i, path)) + counter += 1 + glyfile.write('\n') + glyfile.write('\n') + glyfile.close() - print 'Processing Pages ... ' + print 'Processing Pages ... ' - scaledpi = 720 - filenames = os.listdir(pageDir) - filenames = sorted(filenames) - counter = 0 - for filename in filenames: - print ' ', filename - fname = os.path.join(pageDir,filename) - flat_xml = convert2xml.main('convert2xml.py --flat-xml ' + dictFile + ' ' + fname) - pp = PParser(flat_xml) - pfile = open(os.path.join(svgDir,filename.replace('.dat','.svg')), 'w') - pfile.write('\n') - pfile.write('\n') - pfile.write('\n' % (pp.pw / scaledpi, pp.ph / scaledpi, pp.pw -1, pp.ph -1)) - pfile.write('Page %d - %s by %s\n' % (counter, metadata['Title'],metadata['Authors'])) - if (pp.gid != None): - pfile.write('\n') - gdefs = pp.getGlyphs(glyfname) - for j in xrange(0,len(gdefs)): - pfile.write(gdefs[j]) - pfile.write('\n') - for j in xrange(0,len(pp.gid)): - pfile.write('\n' % (pp.gid[j], pp.gx[j], pp.gy[j])) - img = pp.getImages() - if (img != None): - for j in xrange(0,len(img)): - pfile.write(img[j]) - pfile.write('') - pfile.close() - counter += 1 + scaledpi = 720 + filenames = os.listdir(pageDir) + filenames = sorted(filenames) + counter = 0 + for filename in filenames: + print ' ', filename + fname = os.path.join(pageDir,filename) + flat_xml = convert2xml.main('convert2xml.py --flat-xml ' + dictFile + ' ' + fname) + pp = PParser(flat_xml) + pfile = open(os.path.join(svgDir,filename.replace('.dat','.svg')), 'w') + pfile.write('\n') + pfile.write('\n') + pfile.write('\n' % (pp.pw / scaledpi, pp.ph / scaledpi, pp.pw -1, pp.ph -1)) + pfile.write('Page %d - %s by %s\n' % (counter, metadata['Title'],metadata['Authors'])) + if (pp.gid != None): + pfile.write('\n') + gdefs = pp.getGlyphs(glyfname) + for j in xrange(0,len(gdefs)): + pfile.write(gdefs[j]) + pfile.write('\n') + for j in xrange(0,len(pp.gid)): + pfile.write('\n' % (pp.gid[j], pp.gx[j], pp.gy[j])) + img = 
pp.getImages() + if (img != None): + for j in xrange(0,len(img)): + pfile.write(img[j]) + pfile.write('') + pfile.close() + counter += 1 - print 'Processing Complete' + print 'Processing Complete' - return 0 + return 0 if __name__ == '__main__': - sys.exit(main('')) + sys.exit(main('')) \ No newline at end of file diff --git a/Topaz_Tools/lib/readme.txt b/Topaz_Tools/lib/readme.txt index 4a79d20..afe4a5a 100644 --- a/Topaz_Tools/lib/readme.txt +++ b/Topaz_Tools/lib/readme.txt @@ -1,3 +1,13 @@ +Contributors: + cmbtc - removal of drm which made all of this possible + clarknova - for all of the svg and glyph generation and many other bug fixes and improvements + skindle - for figuring out the general case for the mode loops + some updates - for conversion to xml, basic html + DiapDealer - for extensive testing and feedback + +and others for posting, feedback and testing + + This is experimental and it will probably not work for you but... ALSO: Please do not use any of this to steal. Theft is wrong. 
diff --git a/Topaz_Tools/lib/stylexml2css.py b/Topaz_Tools/lib/stylexml2css.py index cf02984..ede6767 100644 --- a/Topaz_Tools/lib/stylexml2css.py +++ b/Topaz_Tools/lib/stylexml2css.py @@ -11,8 +11,9 @@ from struct import unpack class DocParser(object): - def __init__(self, flatxml): + def __init__(self, flatxml, fontsize): self.flatdoc = flatxml.split('\n') + self.fontsize = int(fontsize) stags = { 'paragraph' : 'p', @@ -20,14 +21,14 @@ class DocParser(object): } attr_val_map = { - 'hang' : ('text-indent: ', 135), - 'indent' : ('text-indent: ', 135), - 'line-space' : ('line-height: ', 190), - 'margin-bottom' : ('margin-bottom: ', 135), - 'margin-left' : ('margin-left: ', 135), - 'margin-right' : ('margin-right: ', 135), - 'margin-top' : ('margin-top: ', 135), - 'space-after' : ('padding-bottom: ', 135), + 'hang' : 'text-indent: ', + 'indent' : 'text-indent: ', + 'line-space' : 'line-height: ', + 'margin-bottom' : 'margin-bottom: ', + 'margin-left' : 'margin-left: ', + 'margin-right' : 'margin-right: ', + 'margin-top' : 'margin-top: ', + 'space-after' : 'padding-bottom: ', } attr_str_map = { @@ -55,7 +56,7 @@ class DocParser(object): for j in xrange(pos, end): item = docList[j] if item.find('=') >= 0: - (name, argres) = item.split('=') + (name, argres) = item.split('=',1) else : name = item argres = '' @@ -81,6 +82,7 @@ class DocParser(object): def process(self): + classlst = '' csspage = '' # generate a list of each