diff --git a/Topaz_Tools/lib/cmbtc_dump.py b/Topaz_Tools/lib/cmbtc_dump.py index 83301dd..d7cef99 100644 --- a/Topaz_Tools/lib/cmbtc_dump.py +++ b/Topaz_Tools/lib/cmbtc_dump.py @@ -1,5 +1,5 @@ #! /usr/bin/python -# For use in Topaz Scripts version 2.3 +# For use in Topaz Scripts version 2.6 """ diff --git a/Topaz_Tools/lib/cmbtc_dump_nonK4PC.py b/Topaz_Tools/lib/cmbtc_dump_nonK4PC.py index 1508741..0d62404 100644 --- a/Topaz_Tools/lib/cmbtc_dump_nonK4PC.py +++ b/Topaz_Tools/lib/cmbtc_dump_nonK4PC.py @@ -1,5 +1,5 @@ #!/usr/bin/python -# For use with Topaz Scripts Version 2.3 +# For use with Topaz Scripts Version 2.6 class Unbuffered: def __init__(self, stream): diff --git a/Topaz_Tools/lib/convert2xml.py b/Topaz_Tools/lib/convert2xml.py index 18ae3f0..e3f0fe2 100644 --- a/Topaz_Tools/lib/convert2xml.py +++ b/Topaz_Tools/lib/convert2xml.py @@ -1,6 +1,6 @@ #! /usr/bin/python # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab -# For use with Topaz Scripts Version 2.4 +# For use with Topaz Scripts Version 2.6 class Unbuffered: def __init__(self, stream): @@ -315,6 +315,12 @@ class PageParser(object): 'version.findlists' : (1, 'scalar_text', 0, 0), 'version.page_num' : (1, 'scalar_text', 0, 0), 'version.page_type' : (1, 'scalar_text', 0, 0), + 'version.bad_text' : (1, 'scalar_text', 0, 0), + 'version.glyph_mismatch' : (1, 'scalar_text', 0, 0), + 'version.margins' : (1, 'scalar_text', 0, 0), + 'version.staggered_lines' : (1, 'scalar_text', 0, 0), + 'version.paragraph_continuation' : (1, 'scalar_text', 0, 0), + 'version.toc' : (1, 'scalar_text', 0, 0), 'stylesheet' : (1, 'snippets', 1, 0), 'style' : (1, 'snippets', 1, 0), @@ -662,16 +668,19 @@ class PageParser(object): def process(self): # peek at the first bytes to see what type of file it is - magic = self.fo.read(11) - if (magic[0:1] == 'p') and (magic[2:10] == '__PAGE__'): + magic = self.fo.read(9) + if (magic[0:1] == 'p') and (magic[2:9] == 'marker_'): first_token = 'info' - elif (magic[0:1] == 'g') and (magic[2:11] == '__GLYPH__'): - skip = self.fo.read(1) + elif (magic[0:1] == 'p') and (magic[2:9] == '__PAGE_'): + skip = self.fo.read(2) + first_token = 'info' + elif (magic[0:1] == 'g') and (magic[2:9] == '__GLYPH'): + skip = self.fo.read(3) first_token = 'info' else : # other0.dat file first_token = None - self.fo.seek(-11,1) + self.fo.seek(-9,1) # main loop to read and build the document tree diff --git a/Topaz_Tools/lib/decode_meta.py b/Topaz_Tools/lib/decode_meta.py index 038f133..a63c578 100644 --- a/Topaz_Tools/lib/decode_meta.py +++ b/Topaz_Tools/lib/decode_meta.py @@ -1,6 +1,6 @@ #! /usr/bin/python # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab -# For use with Topaz Scripts Version 2.3 +# For use with Topaz Scripts Version 2.6 import csv import sys diff --git a/Topaz_Tools/lib/flatxml2html.py b/Topaz_Tools/lib/flatxml2html.py index 0fb106d..1c4419f 100644 --- a/Topaz_Tools/lib/flatxml2html.py +++ b/Topaz_Tools/lib/flatxml2html.py @@ -1,6 +1,6 @@ #! /usr/bin/python # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab -# For use with Topaz Scripts Version 2.3 +# For use with Topaz Scripts Version 2.6 import sys import csv @@ -32,6 +32,8 @@ class DocParser(object): self.link_id = [] self.link_title = [] self.link_page = [] + self.link_href = [] + self.link_type = [] self.dehyphen_rootid = [] self.paracont_stemid = [] self.parastems_stemid = [] @@ -197,6 +199,7 @@ class DocParser(object): # get the class def getClass(self, pclass): nclass = pclass + # class names are an issue given topaz may start them with numerals (not allowed), # use a mix of cases (which cause some browsers problems), and actually # attach numbers after "_reclustered*" to the end to deal classeses that inherit @@ -206,7 +209,10 @@ class DocParser(object): # so we clean this up by lowercasing, prepend 'cl-', and getting any baseclass # that exists in the stylesheet first, and then adding this specific class # after + + # also some class names have spaces in them so need to convert to dashes if nclass != None : + nclass = nclass.replace(' ','-') classres = '' nclass = nclass.lower() nclass = 'cl-' + nclass @@ -334,7 +340,7 @@ class DocParser(object): result.append(('svg', num)) return pclass, result - # this type of paragrph may be made up of multiple spans, inline + # this type of paragraph may be made up of multiple spans, inline # word monograms (images), and words with semantic meaning, # plus glyphs used to form starting letter of first word @@ -391,6 +397,9 @@ class DocParser(object): result.append(('img' + word_class, int(argres))) word_class = '' + elif name.endswith('region.img.src'): + result.append(('img' + word_class, int(argres))) + if (sp_first != -1) and (sp_last != -1): for wordnum in xrange(sp_first, sp_last): result.append(('ocr', wordnum)) @@ -437,6 +446,8 @@ class DocParser(object): if (type == 'end'): parares += ' ' + lstart = len(parares) + cnt = len(pdesc) for j in xrange( 0, cnt) : @@ -449,18 +460,24 @@ class DocParser(object): if handle_links: link = self.link_id[num] - if (link > 0): + if (link > 0): + linktype = self.link_type[link-1] title = self.link_title[link-1] - if (title == "") or (parares.rfind(title) < 0): - title='_link_' - ptarget = self.link_page[link-1] - 1 - linkhtml = '' % ptarget + if (title == "") or (parares.rfind(title) < 0): + title=parares[lstart:] + if linktype == 'external' : + linkhref = self.link_href[link-1] + linkhtml = '' % linkhref + else : + ptarget = self.link_page[link-1] - 1 + linkhtml = '' % ptarget linkhtml += title + '' pos = parares.rfind(title) if pos >= 0: parares = parares[0:pos] + linkhtml + parares[pos+len(title):] else : parares += linkhtml + lstart = len(parares) if word == '_link_' : word = '' elif (link < 0) : if word == '_link_' : word = '' @@ -532,6 +549,14 @@ class DocParser(object): # collect link destination page numbers self.link_page = self.getData('info.links.page',0,-1) + # collect link types (container versus external) + (pos, argres) = self.findinDoc('info.links.type',0,-1) + if argres : self.link_type = argres.split('|') + + # collect link destinations + (pos, argres) = self.findinDoc('info.links.href',0,-1) + if argres : self.link_href = argres.split('|') + # collect link titles (pos, argres) = self.findinDoc('info.links.title',0,-1) if argres : @@ -641,16 +666,18 @@ class DocParser(object): htmlpage += self.buildParagraph(pclass, pdesc, ptype, regtype) - elif (regtype == 'synth_fcvr.center') or (regtype == 'synth_text.center'): + elif (regtype == 'synth_fcvr.center'): (pos, simgsrc) = self.findinDoc('img.src',start,end) if simgsrc: htmlpage += '

