mirror of
				https://github.com/noDRM/DeDRM_tools.git
				synced 2025-10-23 23:07:47 -04:00 
			
		
		
		
	More fixes for Amazon books, fixing identity checks, started on Topaz.
This commit is contained in:
		
							parent
							
								
									dc27c36761
								
							
						
					
					
						commit
						939cdbb0c9
					
				
					 8 changed files with 530 additions and 512 deletions
				
			
		|  | @ -56,7 +56,7 @@ def readEncodedNumber(file): | ||||||
|             c = file.read(1) |             c = file.read(1) | ||||||
|             if (len(c) == 0): |             if (len(c) == 0): | ||||||
|                 return None |                 return None | ||||||
|             data = ord(c) |             data = c[0] | ||||||
|             datax = (datax <<7) + (data & 0x7F) |             datax = (datax <<7) + (data & 0x7F) | ||||||
|         data = datax |         data = datax | ||||||
| 
 | 
 | ||||||
|  | @ -188,232 +188,232 @@ class PageParser(object): | ||||||
|     # tag : (number of arguments, argument type, subtags present, special case of subtags presents when escaped) |     # tag : (number of arguments, argument type, subtags present, special case of subtags presents when escaped) | ||||||
| 
 | 
 | ||||||
|     token_tags = { |     token_tags = { | ||||||
|         'x'            : (1, 'scalar_number', 0, 0), |         b'x'            : (1, 'scalar_number', 0, 0), | ||||||
|         'y'            : (1, 'scalar_number', 0, 0), |         b'y'            : (1, 'scalar_number', 0, 0), | ||||||
|         'h'            : (1, 'scalar_number', 0, 0), |         b'h'            : (1, 'scalar_number', 0, 0), | ||||||
|         'w'            : (1, 'scalar_number', 0, 0), |         b'w'            : (1, 'scalar_number', 0, 0), | ||||||
|         'firstWord'    : (1, 'scalar_number', 0, 0), |         b'firstWord'    : (1, 'scalar_number', 0, 0), | ||||||
|         'lastWord'     : (1, 'scalar_number', 0, 0), |         b'lastWord'     : (1, 'scalar_number', 0, 0), | ||||||
|         'rootID'       : (1, 'scalar_number', 0, 0), |         b'rootID'       : (1, 'scalar_number', 0, 0), | ||||||
|         'stemID'       : (1, 'scalar_number', 0, 0), |         b'stemID'       : (1, 'scalar_number', 0, 0), | ||||||
|         'type'         : (1, 'scalar_text', 0, 0), |         b'type'         : (1, 'scalar_text', 0, 0), | ||||||
| 
 | 
 | ||||||
|         'info'            : (0, 'number', 1, 0), |         b'info'            : (0, 'number', 1, 0), | ||||||
| 
 | 
 | ||||||
|         'info.word'            : (0, 'number', 1, 1), |         b'info.word'            : (0, 'number', 1, 1), | ||||||
|         'info.word.ocrText'    : (1, 'text', 0, 0), |         b'info.word.ocrText'    : (1, 'text', 0, 0), | ||||||
|         'info.word.firstGlyph' : (1, 'raw', 0, 0), |         b'info.word.firstGlyph' : (1, 'raw', 0, 0), | ||||||
|         'info.word.lastGlyph'  : (1, 'raw', 0, 0), |         b'info.word.lastGlyph'  : (1, 'raw', 0, 0), | ||||||
|         'info.word.bl'         : (1, 'raw', 0, 0), |         b'info.word.bl'         : (1, 'raw', 0, 0), | ||||||
|         'info.word.link_id'    : (1, 'number', 0, 0), |         b'info.word.link_id'    : (1, 'number', 0, 0), | ||||||
| 
 | 
 | ||||||
|         'glyph'           : (0, 'number', 1, 1), |         b'glyph'           : (0, 'number', 1, 1), | ||||||
|         'glyph.x'         : (1, 'number', 0, 0), |         b'glyph.x'         : (1, 'number', 0, 0), | ||||||
|         'glyph.y'         : (1, 'number', 0, 0), |         b'glyph.y'         : (1, 'number', 0, 0), | ||||||
|         'glyph.glyphID'   : (1, 'number', 0, 0), |         b'glyph.glyphID'   : (1, 'number', 0, 0), | ||||||
| 
 | 
 | ||||||
|         'dehyphen'          : (0, 'number', 1, 1), |         b'dehyphen'          : (0, 'number', 1, 1), | ||||||
|         'dehyphen.rootID'   : (1, 'number', 0, 0), |         b'dehyphen.rootID'   : (1, 'number', 0, 0), | ||||||
|         'dehyphen.stemID'   : (1, 'number', 0, 0), |         b'dehyphen.stemID'   : (1, 'number', 0, 0), | ||||||
|         'dehyphen.stemPage' : (1, 'number', 0, 0), |         b'dehyphen.stemPage' : (1, 'number', 0, 0), | ||||||
|         'dehyphen.sh'       : (1, 'number', 0, 0), |         b'dehyphen.sh'       : (1, 'number', 0, 0), | ||||||
| 
 | 
 | ||||||
|         'links'        : (0, 'number', 1, 1), |         b'links'        : (0, 'number', 1, 1), | ||||||
|         'links.page'   : (1, 'number', 0, 0), |         b'links.page'   : (1, 'number', 0, 0), | ||||||
|         'links.rel'    : (1, 'number', 0, 0), |         b'links.rel'    : (1, 'number', 0, 0), | ||||||
|         'links.row'    : (1, 'number', 0, 0), |         b'links.row'    : (1, 'number', 0, 0), | ||||||
|         'links.title'  : (1, 'text', 0, 0), |         b'links.title'  : (1, 'text', 0, 0), | ||||||
|         'links.href'   : (1, 'text', 0, 0), |         b'links.href'   : (1, 'text', 0, 0), | ||||||
|         'links.type'   : (1, 'text', 0, 0), |         b'links.type'   : (1, 'text', 0, 0), | ||||||
|         'links.id'     : (1, 'number', 0, 0), |         b'links.id'     : (1, 'number', 0, 0), | ||||||
| 
 | 
 | ||||||
|         'paraCont'          : (0, 'number', 1, 1), |         b'paraCont'          : (0, 'number', 1, 1), | ||||||
|         'paraCont.rootID'   : (1, 'number', 0, 0), |         b'paraCont.rootID'   : (1, 'number', 0, 0), | ||||||
|         'paraCont.stemID'   : (1, 'number', 0, 0), |         b'paraCont.stemID'   : (1, 'number', 0, 0), | ||||||
|         'paraCont.stemPage' : (1, 'number', 0, 0), |         b'paraCont.stemPage' : (1, 'number', 0, 0), | ||||||
| 
 | 
 | ||||||
|         'paraStems'        : (0, 'number', 1, 1), |         b'paraStems'        : (0, 'number', 1, 1), | ||||||
|         'paraStems.stemID' : (1, 'number', 0, 0), |         b'paraStems.stemID' : (1, 'number', 0, 0), | ||||||
| 
 | 
 | ||||||
|         'wordStems'          : (0, 'number', 1, 1), |         b'wordStems'          : (0, 'number', 1, 1), | ||||||
|         'wordStems.stemID'   : (1, 'number', 0, 0), |         b'wordStems.stemID'   : (1, 'number', 0, 0), | ||||||
| 
 | 
 | ||||||
|         'empty'          : (1, 'snippets', 1, 0), |         b'empty'          : (1, 'snippets', 1, 0), | ||||||
| 
 | 
 | ||||||
|         'page'           : (1, 'snippets', 1, 0), |         b'page'           : (1, 'snippets', 1, 0), | ||||||
|         'page.class'     : (1, 'scalar_text', 0, 0), |         b'page.class'     : (1, 'scalar_text', 0, 0), | ||||||
|         'page.pageid'    : (1, 'scalar_text', 0, 0), |         b'page.pageid'    : (1, 'scalar_text', 0, 0), | ||||||
|         'page.pagelabel' : (1, 'scalar_text', 0, 0), |         b'page.pagelabel' : (1, 'scalar_text', 0, 0), | ||||||
|         'page.type'      : (1, 'scalar_text', 0, 0), |         b'page.type'      : (1, 'scalar_text', 0, 0), | ||||||
|         'page.h'         : (1, 'scalar_number', 0, 0), |         b'page.h'         : (1, 'scalar_number', 0, 0), | ||||||
|         'page.w'         : (1, 'scalar_number', 0, 0), |         b'page.w'         : (1, 'scalar_number', 0, 0), | ||||||
|         'page.startID' : (1, 'scalar_number', 0, 0), |         b'page.startID' : (1, 'scalar_number', 0, 0), | ||||||
| 
 | 
 | ||||||
|         'group'           : (1, 'snippets', 1, 0), |         b'group'           : (1, 'snippets', 1, 0), | ||||||
|         'group.class'     : (1, 'scalar_text', 0, 0), |         b'group.class'     : (1, 'scalar_text', 0, 0), | ||||||
|         'group.type'      : (1, 'scalar_text', 0, 0), |         b'group.type'      : (1, 'scalar_text', 0, 0), | ||||||
|         'group._tag'      : (1, 'scalar_text', 0, 0), |         b'group._tag'      : (1, 'scalar_text', 0, 0), | ||||||
|         'group.orientation': (1, 'scalar_text', 0, 0), |         b'group.orientation': (1, 'scalar_text', 0, 0), | ||||||
| 
 | 
 | ||||||
|         'region'           : (1, 'snippets', 1, 0), |         b'region'           : (1, 'snippets', 1, 0), | ||||||
|         'region.class'     : (1, 'scalar_text', 0, 0), |         b'region.class'     : (1, 'scalar_text', 0, 0), | ||||||
|         'region.type'      : (1, 'scalar_text', 0, 0), |         b'region.type'      : (1, 'scalar_text', 0, 0), | ||||||
|         'region.x'         : (1, 'scalar_number', 0, 0), |         b'region.x'         : (1, 'scalar_number', 0, 0), | ||||||
|         'region.y'         : (1, 'scalar_number', 0, 0), |         b'region.y'         : (1, 'scalar_number', 0, 0), | ||||||
|         'region.h'         : (1, 'scalar_number', 0, 0), |         b'region.h'         : (1, 'scalar_number', 0, 0), | ||||||
|         'region.w'         : (1, 'scalar_number', 0, 0), |         b'region.w'         : (1, 'scalar_number', 0, 0), | ||||||
|         'region.orientation' : (1, 'scalar_text', 0, 0), |         b'region.orientation' : (1, 'scalar_text', 0, 0), | ||||||
| 
 | 
 | ||||||
|         'empty_text_region' : (1, 'snippets', 1, 0), |         b'empty_text_region' : (1, 'snippets', 1, 0), | ||||||
| 
 | 
 | ||||||
|         'img'                   : (1, 'snippets', 1, 0), |         b'img'                   : (1, 'snippets', 1, 0), | ||||||
|         'img.x'                 : (1, 'scalar_number', 0, 0), |         b'img.x'                 : (1, 'scalar_number', 0, 0), | ||||||
|         'img.y'                 : (1, 'scalar_number', 0, 0), |         b'img.y'                 : (1, 'scalar_number', 0, 0), | ||||||
|         'img.h'                 : (1, 'scalar_number', 0, 0), |         b'img.h'                 : (1, 'scalar_number', 0, 0), | ||||||
|         'img.w'                 : (1, 'scalar_number', 0, 0), |         b'img.w'                 : (1, 'scalar_number', 0, 0), | ||||||
|         'img.src'               : (1, 'scalar_number', 0, 0), |         b'img.src'               : (1, 'scalar_number', 0, 0), | ||||||
|         'img.color_src'         : (1, 'scalar_number', 0, 0), |         b'img.color_src'         : (1, 'scalar_number', 0, 0), | ||||||
|         'img.gridSize'          : (1, 'scalar_number', 0, 0), |         b'img.gridSize'          : (1, 'scalar_number', 0, 0), | ||||||
|         'img.gridBottomCenter'  : (1, 'scalar_number', 0, 0), |         b'img.gridBottomCenter'  : (1, 'scalar_number', 0, 0), | ||||||
|         'img.gridTopCenter'     : (1, 'scalar_number', 0, 0), |         b'img.gridTopCenter'     : (1, 'scalar_number', 0, 0), | ||||||
|         'img.gridBeginCenter'   : (1, 'scalar_number', 0, 0), |         b'img.gridBeginCenter'   : (1, 'scalar_number', 0, 0), | ||||||
|         'img.gridEndCenter'     : (1, 'scalar_number', 0, 0), |         b'img.gridEndCenter'     : (1, 'scalar_number', 0, 0), | ||||||
|         'img.image_type'        : (1, 'scalar_number', 0, 0), |         b'img.image_type'        : (1, 'scalar_number', 0, 0), | ||||||
| 
 | 
 | ||||||
|         'paragraph'           : (1, 'snippets', 1, 0), |         b'paragraph'           : (1, 'snippets', 1, 0), | ||||||
|         'paragraph.class'     : (1, 'scalar_text', 0, 0), |         b'paragraph.class'     : (1, 'scalar_text', 0, 0), | ||||||
|         'paragraph.firstWord' : (1, 'scalar_number', 0, 0), |         b'paragraph.firstWord' : (1, 'scalar_number', 0, 0), | ||||||
|         'paragraph.lastWord'  : (1, 'scalar_number', 0, 0), |         b'paragraph.lastWord'  : (1, 'scalar_number', 0, 0), | ||||||
|         'paragraph.lastWord'  : (1, 'scalar_number', 0, 0), |         b'paragraph.lastWord'  : (1, 'scalar_number', 0, 0), | ||||||
|         'paragraph.gridSize'  : (1, 'scalar_number', 0, 0), |         b'paragraph.gridSize'  : (1, 'scalar_number', 0, 0), | ||||||
|         'paragraph.gridBottomCenter'  : (1, 'scalar_number', 0, 0), |         b'paragraph.gridBottomCenter'  : (1, 'scalar_number', 0, 0), | ||||||
|         'paragraph.gridTopCenter'     : (1, 'scalar_number', 0, 0), |         b'paragraph.gridTopCenter'     : (1, 'scalar_number', 0, 0), | ||||||
|         'paragraph.gridBeginCenter'   : (1, 'scalar_number', 0, 0), |         b'paragraph.gridBeginCenter'   : (1, 'scalar_number', 0, 0), | ||||||
|         'paragraph.gridEndCenter'     : (1, 'scalar_number', 0, 0), |         b'paragraph.gridEndCenter'     : (1, 'scalar_number', 0, 0), | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|         'word_semantic'           : (1, 'snippets', 1, 1), |         b'word_semantic'           : (1, 'snippets', 1, 1), | ||||||
|         'word_semantic.type'      : (1, 'scalar_text', 0, 0), |         b'word_semantic.type'      : (1, 'scalar_text', 0, 0), | ||||||
|         'word_semantic.class'     : (1, 'scalar_text', 0, 0), |         b'word_semantic.class'     : (1, 'scalar_text', 0, 0), | ||||||
|         'word_semantic.firstWord' : (1, 'scalar_number', 0, 0), |         b'word_semantic.firstWord' : (1, 'scalar_number', 0, 0), | ||||||
|         'word_semantic.lastWord'  : (1, 'scalar_number', 0, 0), |         b'word_semantic.lastWord'  : (1, 'scalar_number', 0, 0), | ||||||
|         'word_semantic.gridBottomCenter'  : (1, 'scalar_number', 0, 0), |         b'word_semantic.gridBottomCenter'  : (1, 'scalar_number', 0, 0), | ||||||
|         'word_semantic.gridTopCenter'     : (1, 'scalar_number', 0, 0), |         b'word_semantic.gridTopCenter'     : (1, 'scalar_number', 0, 0), | ||||||
|         'word_semantic.gridBeginCenter'   : (1, 'scalar_number', 0, 0), |         b'word_semantic.gridBeginCenter'   : (1, 'scalar_number', 0, 0), | ||||||
|         'word_semantic.gridEndCenter'     : (1, 'scalar_number', 0, 0), |         b'word_semantic.gridEndCenter'     : (1, 'scalar_number', 0, 0), | ||||||
| 
 | 
 | ||||||
|         'word'            : (1, 'snippets', 1, 0), |         b'word'            : (1, 'snippets', 1, 0), | ||||||
|         'word.type'       : (1, 'scalar_text', 0, 0), |         b'word.type'       : (1, 'scalar_text', 0, 0), | ||||||
|         'word.class'      : (1, 'scalar_text', 0, 0), |         b'word.class'      : (1, 'scalar_text', 0, 0), | ||||||
|         'word.firstGlyph' : (1, 'scalar_number', 0, 0), |         b'word.firstGlyph' : (1, 'scalar_number', 0, 0), | ||||||
|         'word.lastGlyph'  : (1, 'scalar_number', 0, 0), |         b'word.lastGlyph'  : (1, 'scalar_number', 0, 0), | ||||||
| 
 | 
 | ||||||
|         '_span'           : (1, 'snippets', 1, 0), |         b'_span'           : (1, 'snippets', 1, 0), | ||||||
|         '_span.class'     : (1, 'scalar_text', 0, 0), |         b'_span.class'     : (1, 'scalar_text', 0, 0), | ||||||
|         '_span.firstWord' : (1, 'scalar_number', 0, 0), |         b'_span.firstWord' : (1, 'scalar_number', 0, 0), | ||||||
|         '_span.lastWord'  : (1, 'scalar_number', 0, 0), |         b'_span.lastWord'  : (1, 'scalar_number', 0, 0), | ||||||
|         '_span.gridSize'  : (1, 'scalar_number', 0, 0), |         b'_span.gridSize'  : (1, 'scalar_number', 0, 0), | ||||||
|         '_span.gridBottomCenter'  : (1, 'scalar_number', 0, 0), |         b'_span.gridBottomCenter'  : (1, 'scalar_number', 0, 0), | ||||||
|         '_span.gridTopCenter' : (1, 'scalar_number', 0, 0), |         b'_span.gridTopCenter' : (1, 'scalar_number', 0, 0), | ||||||
|         '_span.gridBeginCenter' : (1, 'scalar_number', 0, 0), |         b'_span.gridBeginCenter' : (1, 'scalar_number', 0, 0), | ||||||
|         '_span.gridEndCenter' : (1, 'scalar_number', 0, 0), |         b'_span.gridEndCenter' : (1, 'scalar_number', 0, 0), | ||||||
| 
 | 
 | ||||||
|         'span'           : (1, 'snippets', 1, 0), |         b'span'           : (1, 'snippets', 1, 0), | ||||||
|         'span.firstWord' : (1, 'scalar_number', 0, 0), |         b'span.firstWord' : (1, 'scalar_number', 0, 0), | ||||||
|         'span.lastWord'  : (1, 'scalar_number', 0, 0), |         b'span.lastWord'  : (1, 'scalar_number', 0, 0), | ||||||
|         'span.gridSize'  : (1, 'scalar_number', 0, 0), |         b'span.gridSize'  : (1, 'scalar_number', 0, 0), | ||||||
|         'span.gridBottomCenter'  : (1, 'scalar_number', 0, 0), |         b'span.gridBottomCenter'  : (1, 'scalar_number', 0, 0), | ||||||
|         'span.gridTopCenter' : (1, 'scalar_number', 0, 0), |         b'span.gridTopCenter' : (1, 'scalar_number', 0, 0), | ||||||
|         'span.gridBeginCenter' : (1, 'scalar_number', 0, 0), |         b'span.gridBeginCenter' : (1, 'scalar_number', 0, 0), | ||||||
|         'span.gridEndCenter' : (1, 'scalar_number', 0, 0), |         b'span.gridEndCenter' : (1, 'scalar_number', 0, 0), | ||||||
| 
 | 
 | ||||||
|         'extratokens'                   : (1, 'snippets', 1, 0), |         b'extratokens'                   : (1, 'snippets', 1, 0), | ||||||
|         'extratokens.class'             : (1, 'scalar_text', 0, 0), |         b'extratokens.class'             : (1, 'scalar_text', 0, 0), | ||||||
|         'extratokens.type'              : (1, 'scalar_text', 0, 0), |         b'extratokens.type'              : (1, 'scalar_text', 0, 0), | ||||||
|         'extratokens.firstGlyph'        : (1, 'scalar_number', 0, 0), |         b'extratokens.firstGlyph'        : (1, 'scalar_number', 0, 0), | ||||||
|         'extratokens.lastGlyph'         : (1, 'scalar_number', 0, 0), |         b'extratokens.lastGlyph'         : (1, 'scalar_number', 0, 0), | ||||||
|         'extratokens.gridSize'          : (1, 'scalar_number', 0, 0), |         b'extratokens.gridSize'          : (1, 'scalar_number', 0, 0), | ||||||
|         'extratokens.gridBottomCenter'  : (1, 'scalar_number', 0, 0), |         b'extratokens.gridBottomCenter'  : (1, 'scalar_number', 0, 0), | ||||||
|         'extratokens.gridTopCenter'     : (1, 'scalar_number', 0, 0), |         b'extratokens.gridTopCenter'     : (1, 'scalar_number', 0, 0), | ||||||
|         'extratokens.gridBeginCenter'   : (1, 'scalar_number', 0, 0), |         b'extratokens.gridBeginCenter'   : (1, 'scalar_number', 0, 0), | ||||||
|         'extratokens.gridEndCenter'     : (1, 'scalar_number', 0, 0), |         b'extratokens.gridEndCenter'     : (1, 'scalar_number', 0, 0), | ||||||
| 
 | 
 | ||||||
|         'glyph.h'      : (1, 'number', 0, 0), |         b'glyph.h'      : (1, 'number', 0, 0), | ||||||
|         'glyph.w'      : (1, 'number', 0, 0), |         b'glyph.w'      : (1, 'number', 0, 0), | ||||||
|         'glyph.use'    : (1, 'number', 0, 0), |         b'glyph.use'    : (1, 'number', 0, 0), | ||||||
|         'glyph.vtx'    : (1, 'number', 0, 1), |         b'glyph.vtx'    : (1, 'number', 0, 1), | ||||||
|         'glyph.len'    : (1, 'number', 0, 1), |         b'glyph.len'    : (1, 'number', 0, 1), | ||||||
|         'glyph.dpi'    : (1, 'number', 0, 0), |         b'glyph.dpi'    : (1, 'number', 0, 0), | ||||||
|         'vtx'          : (0, 'number', 1, 1), |         b'vtx'          : (0, 'number', 1, 1), | ||||||
|         'vtx.x'        : (1, 'number', 0, 0), |         b'vtx.x'        : (1, 'number', 0, 0), | ||||||
|         'vtx.y'        : (1, 'number', 0, 0), |         b'vtx.y'        : (1, 'number', 0, 0), | ||||||
|         'len'          : (0, 'number', 1, 1), |         b'len'          : (0, 'number', 1, 1), | ||||||
|         'len.n'        : (1, 'number', 0, 0), |         b'len.n'        : (1, 'number', 0, 0), | ||||||
| 
 | 
 | ||||||
|         'book'         : (1, 'snippets', 1, 0), |         b'book'         : (1, 'snippets', 1, 0), | ||||||
|         'version'      : (1, 'snippets', 1, 0), |         b'version'      : (1, 'snippets', 1, 0), | ||||||
|         'version.FlowEdit_1_id'            : (1, 'scalar_text', 0, 0), |         b'version.FlowEdit_1_id'            : (1, 'scalar_text', 0, 0), | ||||||
|         'version.FlowEdit_1_version'       : (1, 'scalar_text', 0, 0), |         b'version.FlowEdit_1_version'       : (1, 'scalar_text', 0, 0), | ||||||
|         'version.Schema_id'                : (1, 'scalar_text', 0, 0), |         b'version.Schema_id'                : (1, 'scalar_text', 0, 0), | ||||||
|         'version.Schema_version'           : (1, 'scalar_text', 0, 0), |         b'version.Schema_version'           : (1, 'scalar_text', 0, 0), | ||||||
|         'version.Topaz_version'            : (1, 'scalar_text', 0, 0), |         b'version.Topaz_version'            : (1, 'scalar_text', 0, 0), | ||||||
|         'version.WordDetailEdit_1_id'      : (1, 'scalar_text', 0, 0), |         b'version.WordDetailEdit_1_id'      : (1, 'scalar_text', 0, 0), | ||||||
|         'version.WordDetailEdit_1_version' : (1, 'scalar_text', 0, 0), |         b'version.WordDetailEdit_1_version' : (1, 'scalar_text', 0, 0), | ||||||
|         'version.ZoneEdit_1_id'            : (1, 'scalar_text', 0, 0), |         b'version.ZoneEdit_1_id'            : (1, 'scalar_text', 0, 0), | ||||||
|         'version.ZoneEdit_1_version'       : (1, 'scalar_text', 0, 0), |         b'version.ZoneEdit_1_version'       : (1, 'scalar_text', 0, 0), | ||||||
|         'version.chapterheaders'           : (1, 'scalar_text', 0, 0), |         b'version.chapterheaders'           : (1, 'scalar_text', 0, 0), | ||||||
|         'version.creation_date'            : (1, 'scalar_text', 0, 0), |         b'version.creation_date'            : (1, 'scalar_text', 0, 0), | ||||||
|         'version.header_footer'            : (1, 'scalar_text', 0, 0), |         b'version.header_footer'            : (1, 'scalar_text', 0, 0), | ||||||
|         'version.init_from_ocr'            : (1, 'scalar_text', 0, 0), |         b'version.init_from_ocr'            : (1, 'scalar_text', 0, 0), | ||||||
|         'version.letter_insertion'         : (1, 'scalar_text', 0, 0), |         b'version.letter_insertion'         : (1, 'scalar_text', 0, 0), | ||||||
|         'version.xmlinj_convert'           : (1, 'scalar_text', 0, 0), |         b'version.xmlinj_convert'           : (1, 'scalar_text', 0, 0), | ||||||
|         'version.xmlinj_reflow'            : (1, 'scalar_text', 0, 0), |         b'version.xmlinj_reflow'            : (1, 'scalar_text', 0, 0), | ||||||
|         'version.xmlinj_transform'         : (1, 'scalar_text', 0, 0), |         b'version.xmlinj_transform'         : (1, 'scalar_text', 0, 0), | ||||||
|         'version.findlists'                : (1, 'scalar_text', 0, 0), |         b'version.findlists'                : (1, 'scalar_text', 0, 0), | ||||||
|         'version.page_num'                 : (1, 'scalar_text', 0, 0), |         b'version.page_num'                 : (1, 'scalar_text', 0, 0), | ||||||
|         'version.page_type'                : (1, 'scalar_text', 0, 0), |         b'version.page_type'                : (1, 'scalar_text', 0, 0), | ||||||
|         'version.bad_text'                 : (1, 'scalar_text', 0, 0), |         b'version.bad_text'                 : (1, 'scalar_text', 0, 0), | ||||||
|         'version.glyph_mismatch'           : (1, 'scalar_text', 0, 0), |         b'version.glyph_mismatch'           : (1, 'scalar_text', 0, 0), | ||||||
|         'version.margins'                  : (1, 'scalar_text', 0, 0), |         b'version.margins'                  : (1, 'scalar_text', 0, 0), | ||||||
|         'version.staggered_lines'          : (1, 'scalar_text', 0, 0), |         b'version.staggered_lines'          : (1, 'scalar_text', 0, 0), | ||||||
|         'version.paragraph_continuation'   : (1, 'scalar_text', 0, 0), |         b'version.paragraph_continuation'   : (1, 'scalar_text', 0, 0), | ||||||
|         'version.toc'                      : (1, 'scalar_text', 0, 0), |         b'version.toc'                      : (1, 'scalar_text', 0, 0), | ||||||
| 
 | 
 | ||||||
|         'stylesheet'                : (1, 'snippets', 1, 0), |         b'stylesheet'                : (1, 'snippets', 1, 0), | ||||||
|         'style'                     : (1, 'snippets', 1, 0), |         b'style'                     : (1, 'snippets', 1, 0), | ||||||
|         'style._tag'                : (1, 'scalar_text', 0, 0), |         b'style._tag'                : (1, 'scalar_text', 0, 0), | ||||||
|         'style.type'                : (1, 'scalar_text', 0, 0), |         b'style.type'                : (1, 'scalar_text', 0, 0), | ||||||
|         'style._after_type'         : (1, 'scalar_text', 0, 0), |         b'style._after_type'         : (1, 'scalar_text', 0, 0), | ||||||
|         'style._parent_type'        : (1, 'scalar_text', 0, 0), |         b'style._parent_type'        : (1, 'scalar_text', 0, 0), | ||||||
|         'style._after_parent_type'  : (1, 'scalar_text', 0, 0), |         b'style._after_parent_type'  : (1, 'scalar_text', 0, 0), | ||||||
|         'style.class'               : (1, 'scalar_text', 0, 0), |         b'style.class'               : (1, 'scalar_text', 0, 0), | ||||||
|         'style._after_class'        : (1, 'scalar_text', 0, 0), |         b'style._after_class'        : (1, 'scalar_text', 0, 0), | ||||||
|         'rule'                      : (1, 'snippets', 1, 0), |         b'rule'                      : (1, 'snippets', 1, 0), | ||||||
|         'rule.attr'                 : (1, 'scalar_text', 0, 0), |         b'rule.attr'                 : (1, 'scalar_text', 0, 0), | ||||||
|         'rule.value'                : (1, 'scalar_text', 0, 0), |         b'rule.value'                : (1, 'scalar_text', 0, 0), | ||||||
| 
 | 
 | ||||||
|         'original'      : (0, 'number', 1, 1), |         b'original'      : (0, 'number', 1, 1), | ||||||
|         'original.pnum' : (1, 'number', 0, 0), |         b'original.pnum' : (1, 'number', 0, 0), | ||||||
|         'original.pid'  : (1, 'text', 0, 0), |         b'original.pid'  : (1, 'text', 0, 0), | ||||||
|         'pages'        : (0, 'number', 1, 1), |         b'pages'        : (0, 'number', 1, 1), | ||||||
|         'pages.ref'    : (1, 'number', 0, 0), |         b'pages.ref'    : (1, 'number', 0, 0), | ||||||
|         'pages.id'     : (1, 'number', 0, 0), |         b'pages.id'     : (1, 'number', 0, 0), | ||||||
|         'startID'      : (0, 'number', 1, 1), |         b'startID'      : (0, 'number', 1, 1), | ||||||
|         'startID.page' : (1, 'number', 0, 0), |         b'startID.page' : (1, 'number', 0, 0), | ||||||
|         'startID.id'   : (1, 'number', 0, 0), |         b'startID.id'   : (1, 'number', 0, 0), | ||||||
| 
 | 
 | ||||||
|         'median_d'          : (1, 'number', 0, 0), |         b'median_d'          : (1, 'number', 0, 0), | ||||||
|         'median_h'          : (1, 'number', 0, 0), |         b'median_h'          : (1, 'number', 0, 0), | ||||||
|         'median_firsty'     : (1, 'number', 0, 0), |         b'median_firsty'     : (1, 'number', 0, 0), | ||||||
|         'median_lasty'      : (1, 'number', 0, 0), |         b'median_lasty'      : (1, 'number', 0, 0), | ||||||
| 
 | 
 | ||||||
|         'num_footers_maybe' : (1, 'number', 0, 0), |         b'num_footers_maybe' : (1, 'number', 0, 0), | ||||||
|         'num_footers_yes'   : (1, 'number', 0, 0), |         b'num_footers_yes'   : (1, 'number', 0, 0), | ||||||
|         'num_headers_maybe' : (1, 'number', 0, 0), |         b'num_headers_maybe' : (1, 'number', 0, 0), | ||||||
|         'num_headers_yes'   : (1, 'number', 0, 0), |         b'num_headers_yes'   : (1, 'number', 0, 0), | ||||||
| 
 | 
 | ||||||
|         'tracking'          : (1, 'number', 0, 0), |         b'tracking'          : (1, 'number', 0, 0), | ||||||
|         'src'               : (1, 'text', 0, 0), |         b'src'               : (1, 'text', 0, 0), | ||||||
| 
 | 
 | ||||||
|      } |      } | ||||||
| 
 | 
 | ||||||
|  | @ -430,7 +430,7 @@ class PageParser(object): | ||||||
|         cnt = len(self.tagpath) |         cnt = len(self.tagpath) | ||||||
|         if i < cnt : result = self.tagpath[i] |         if i < cnt : result = self.tagpath[i] | ||||||
|         for j in range(i+1, cnt) : |         for j in range(i+1, cnt) : | ||||||
|             result += '.' + self.tagpath[j] |             result += b'.' + self.tagpath[j] | ||||||
|         return result |         return result | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | @ -505,7 +505,7 @@ class PageParser(object): | ||||||
| 
 | 
 | ||||||
|             if (subtags == 1): |             if (subtags == 1): | ||||||
|                 ntags = readEncodedNumber(self.fo) |                 ntags = readEncodedNumber(self.fo) | ||||||
|                 if self.debug : print('subtags: ' + token + ' has ' + str(ntags)) |                 if self.debug : print('subtags: ', token , ' has ' , str(ntags)) | ||||||
|                 for j in range(ntags): |                 for j in range(ntags): | ||||||
|                     val = readEncodedNumber(self.fo) |                     val = readEncodedNumber(self.fo) | ||||||
|                     subtagres.append(self.procToken(self.dict.lookup(val))) |                     subtagres.append(self.procToken(self.dict.lookup(val))) | ||||||
|  | @ -613,7 +613,7 @@ class PageParser(object): | ||||||
|         subtagList = tag[1] |         subtagList = tag[1] | ||||||
|         argtype = tag[2] |         argtype = tag[2] | ||||||
|         argList = tag[3] |         argList = tag[3] | ||||||
|         nname = prefix + '.' + name |         nname = prefix + b'.' + name | ||||||
|         nsubtaglist = [] |         nsubtaglist = [] | ||||||
|         for j in subtagList: |         for j in subtagList: | ||||||
|             nsubtaglist.append(self.updateName(j,prefix)) |             nsubtaglist.append(self.updateName(j,prefix)) | ||||||
|  | @ -662,34 +662,34 @@ class PageParser(object): | ||||||
|         subtagList = node[1] |         subtagList = node[1] | ||||||
|         argtype = node[2] |         argtype = node[2] | ||||||
|         argList = node[3] |         argList = node[3] | ||||||
|         fullpathname = name.split('.') |         fullpathname = name.split(b'.') | ||||||
|         nodename = fullpathname.pop() |         nodename = fullpathname.pop() | ||||||
|         ilvl = len(fullpathname) |         ilvl = len(fullpathname) | ||||||
|         indent = ' ' * (3 * ilvl) |         indent = b' ' * (3 * ilvl) | ||||||
|         rlst = [] |         rlst = [] | ||||||
|         rlst.append(indent + '<' + nodename + '>') |         rlst.append(indent + b'<' + nodename + b'>') | ||||||
|         if len(argList) > 0: |         if len(argList) > 0: | ||||||
|             alst = [] |             alst = [] | ||||||
|             for j in argList: |             for j in argList: | ||||||
|                 if (argtype == 'text') or (argtype == 'scalar_text') : |                 if (argtype == b'text') or (argtype == b'scalar_text') : | ||||||
|                     alst.append(j + '|') |                     alst.append(j + b'|') | ||||||
|                 else : |                 else : | ||||||
|                     alst.append(str(j) + ',') |                     alst.append(str(j).encode('utf-8') + b',') | ||||||
|             argres = "".join(alst) |             argres = b"".join(alst) | ||||||
|             argres = argres[0:-1] |             argres = argres[0:-1] | ||||||
|             if argtype == 'snippets' : |             if argtype == b'snippets' : | ||||||
|                 rlst.append('snippets:' + argres) |                 rlst.append(b'snippets:' + argres) | ||||||
|             else : |             else : | ||||||
|                 rlst.append(argres) |                 rlst.append(argres) | ||||||
|         if len(subtagList) > 0 : |         if len(subtagList) > 0 : | ||||||
|             rlst.append('\n') |             rlst.append(b'\n') | ||||||
|             for j in subtagList: |             for j in subtagList: | ||||||
|                 if len(j) > 0 : |                 if len(j) > 0 : | ||||||
|                     rlst.append(self.formatTag(j)) |                     rlst.append(self.formatTag(j)) | ||||||
|             rlst.append(indent + '</' + nodename + '>\n') |             rlst.append(indent + b'</' + nodename + b'>\n') | ||||||
|         else: |         else: | ||||||
|             rlst.append('</' + nodename + '>\n') |             rlst.append(b'</' + nodename + b'>\n') | ||||||
|         return "".join(rlst) |         return b"".join(rlst) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|     # flatten tag |     # flatten tag | ||||||
|  | @ -704,20 +704,20 @@ class PageParser(object): | ||||||
|             alst = [] |             alst = [] | ||||||
|             for j in argList: |             for j in argList: | ||||||
|                 if (argtype == 'text') or (argtype == 'scalar_text') : |                 if (argtype == 'text') or (argtype == 'scalar_text') : | ||||||
|                     alst.append(j + '|') |                      alst.append(j + b'|') | ||||||
|                 else : |                 else : | ||||||
|                     alst.append(str(j) + '|') |                     alst.append(str(j).encode('utf-8') + b'|') | ||||||
|             argres = "".join(alst) |             argres = b"".join(alst) | ||||||
|             argres = argres[0:-1] |             argres = argres[0:-1] | ||||||
|             if argtype == 'snippets' : |             if argtype == b'snippets' : | ||||||
|                 rlst.append('.snippets=' + argres) |                 rlst.append(b'.snippets=' + argres) | ||||||
|             else : |             else : | ||||||
|                 rlst.append('=' + argres) |                 rlst.append(b'=' + argres) | ||||||
|         rlst.append('\n') |         rlst.append(b'\n') | ||||||
|         for j in subtagList: |         for j in subtagList: | ||||||
|             if len(j) > 0 : |             if len(j) > 0 : | ||||||
|                 rlst.append(self.flattenTag(j)) |                 rlst.append(self.flattenTag(j)) | ||||||
|         return "".join(rlst) |         return b"".join(rlst) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|     # reduce create xml output |     # reduce create xml output | ||||||
|  | @ -729,7 +729,7 @@ class PageParser(object): | ||||||
|                     rlst.append(self.flattenTag(j)) |                     rlst.append(self.flattenTag(j)) | ||||||
|                 else: |                 else: | ||||||
|                     rlst.append(self.formatTag(j)) |                     rlst.append(self.formatTag(j)) | ||||||
|         result = "".join(rlst) |         result = b"".join(rlst) | ||||||
|         if self.debug : print(result) |         if self.debug : print(result) | ||||||
|         return result |         return result | ||||||
| 
 | 
 | ||||||
|  | @ -747,16 +747,16 @@ class PageParser(object): | ||||||
| 
 | 
 | ||||||
|         # peek at the first bytes to see what type of file it is |         # peek at the first bytes to see what type of file it is | ||||||
|         magic = self.fo.read(9) |         magic = self.fo.read(9) | ||||||
|         if (magic[0:1] == 'p') and (magic[2:9] == 'marker_'): |         if (magic[0:1] == b'p') and (magic[2:9] == b'marker_'): | ||||||
|             first_token = 'info' |             first_token = b'info' | ||||||
|         elif (magic[0:1] == 'p') and (magic[2:9] == '__PAGE_'): |         elif (magic[0:1] == b'p') and (magic[2:9] == b'__PAGE_'): | ||||||
|             skip = self.fo.read(2) |             skip = self.fo.read(2) | ||||||
|             first_token = 'info' |             first_token = b'info' | ||||||
|         elif (magic[0:1] == 'p') and (magic[2:8] == '_PAGE_'): |         elif (magic[0:1] == b'p') and (magic[2:8] == b'_PAGE_'): | ||||||
|             first_token = 'info' |             first_token = b'info' | ||||||
|         elif (magic[0:1] == 'g') and (magic[2:9] == '__GLYPH'): |         elif (magic[0:1] == b'g') and (magic[2:9] == b'__GLYPH'): | ||||||
|             skip = self.fo.read(3) |             skip = self.fo.read(3) | ||||||
|             first_token = 'info' |             first_token = b'info' | ||||||
|         else : |         else : | ||||||
|             # other0.dat file |             # other0.dat file | ||||||
|             first_token = None |             first_token = None | ||||||
|  | @ -778,7 +778,7 @@ class PageParser(object): | ||||||
|                 break |                 break | ||||||
| 
 | 
 | ||||||
|             if (v == 0x72): |             if (v == 0x72): | ||||||
|                 self.doLoop72('number') |                 self.doLoop72(b'number') | ||||||
|             elif (v > 0) and (v < self.dict.getSize()) : |             elif (v > 0) and (v < self.dict.getSize()) : | ||||||
|                 tag = self.procToken(self.dict.lookup(v)) |                 tag = self.procToken(self.dict.lookup(v)) | ||||||
|                 if len(tag) > 0 : |                 if len(tag) > 0 : | ||||||
|  | @ -789,7 +789,7 @@ class PageParser(object): | ||||||
|                 if (v == 0): |                 if (v == 0): | ||||||
|                     if (self.peek(1) == 0x5f): |                     if (self.peek(1) == 0x5f): | ||||||
|                         skip = self.fo.read(1) |                         skip = self.fo.read(1) | ||||||
|                         first_token = 'info' |                         first_token = b'info' | ||||||
| 
 | 
 | ||||||
|         # now do snippet injection |         # now do snippet injection | ||||||
|         if len(self.snippetList) > 0 : |         if len(self.snippetList) > 0 : | ||||||
|  | @ -809,14 +809,14 @@ class PageParser(object): | ||||||
| 
 | 
 | ||||||
| def fromData(dict, fname): | def fromData(dict, fname): | ||||||
|     flat_xml = True |     flat_xml = True | ||||||
|     debug = False |     debug = True | ||||||
|     pp = PageParser(fname, dict, debug, flat_xml) |     pp = PageParser(fname, dict, debug, flat_xml) | ||||||
|     xmlpage = pp.process() |     xmlpage = pp.process() | ||||||
|     return xmlpage |     return xmlpage | ||||||
| 
 | 
 | ||||||
| def getXML(dict, fname): | def getXML(dict, fname): | ||||||
|     flat_xml = False |     flat_xml = False | ||||||
|     debug = False |     debug = True | ||||||
|     pp = PageParser(fname, dict, debug, flat_xml) |     pp = PageParser(fname, dict, debug, flat_xml) | ||||||
|     xmlpage = pp.process() |     xmlpage = pp.process() | ||||||
|     return xmlpage |     return xmlpage | ||||||
|  | @ -845,7 +845,7 @@ def main(argv): | ||||||
|     sys.stderr=SafeUnbuffered(sys.stderr) |     sys.stderr=SafeUnbuffered(sys.stderr) | ||||||
|     dictFile = "" |     dictFile = "" | ||||||
|     pageFile = "" |     pageFile = "" | ||||||
|     debug = False |     debug = True | ||||||
|     flat_xml = False |     flat_xml = False | ||||||
|     printOutput = False |     printOutput = False | ||||||
|     if len(argv) == 0: |     if len(argv) == 0: | ||||||
|  |  | ||||||
|  | @ -7,6 +7,7 @@ import csv | ||||||
| import os | import os | ||||||
| import math | import math | ||||||
| import getopt | import getopt | ||||||
|  | import functools | ||||||
| from struct import pack | from struct import pack | ||||||
| from struct import unpack | from struct import unpack | ||||||
| 
 | 
 | ||||||
|  | @ -15,14 +16,14 @@ class DocParser(object): | ||||||
|     def __init__(self, flatxml, classlst, fileid, bookDir, gdict, fixedimage): |     def __init__(self, flatxml, classlst, fileid, bookDir, gdict, fixedimage): | ||||||
|         self.id = os.path.basename(fileid).replace('.dat','') |         self.id = os.path.basename(fileid).replace('.dat','') | ||||||
|         self.svgcount = 0 |         self.svgcount = 0 | ||||||
|         self.docList = flatxml.split('\n') |         self.docList = flatxml.split(b'\n') | ||||||
|         self.docSize = len(self.docList) |         self.docSize = len(self.docList) | ||||||
|         self.classList = {} |         self.classList = {} | ||||||
|         self.bookDir = bookDir |         self.bookDir = bookDir | ||||||
|         self.gdict = gdict |         self.gdict = gdict | ||||||
|         tmpList = classlst.split('\n') |         tmpList = classlst.split('\n') | ||||||
|         for pclass in tmpList: |         for pclass in tmpList: | ||||||
|             if pclass != '': |             if pclass != b'': | ||||||
|                 # remove the leading period from the css name |                 # remove the leading period from the css name | ||||||
|                 cname = pclass[1:] |                 cname = pclass[1:] | ||||||
|             self.classList[cname] = True |             self.classList[cname] = True | ||||||
|  | @ -57,9 +58,9 @@ class DocParser(object): | ||||||
|         imgfile = os.path.join(imgDir,imgname) |         imgfile = os.path.join(imgDir,imgname) | ||||||
| 
 | 
 | ||||||
|         # get glyph information |         # get glyph information | ||||||
|         gxList = self.getData('info.glyph.x',0,-1) |         gxList = self.getData(b'info.glyph.x',0,-1) | ||||||
|         gyList = self.getData('info.glyph.y',0,-1) |         gyList = self.getData(b'info.glyph.y',0,-1) | ||||||
|         gidList = self.getData('info.glyph.glyphID',0,-1) |         gidList = self.getData(b'info.glyph.glyphID',0,-1) | ||||||
| 
 | 
 | ||||||
|         gids = [] |         gids = [] | ||||||
|         maxws = [] |         maxws = [] | ||||||
|  | @ -122,11 +123,11 @@ class DocParser(object): | ||||||
|     def lineinDoc(self, pos) : |     def lineinDoc(self, pos) : | ||||||
|         if (pos >= 0) and (pos < self.docSize) : |         if (pos >= 0) and (pos < self.docSize) : | ||||||
|             item = self.docList[pos] |             item = self.docList[pos] | ||||||
|             if item.find('=') >= 0: |             if item.find(b'=') >= 0: | ||||||
|                 (name, argres) = item.split('=',1) |                 (name, argres) = item.split(b'=',1) | ||||||
|             else : |             else : | ||||||
|                 name = item |                 name = item | ||||||
|                 argres = '' |                 argres = b'' | ||||||
|         return name, argres |         return name, argres | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | @ -140,11 +141,13 @@ class DocParser(object): | ||||||
|         foundat = -1 |         foundat = -1 | ||||||
|         for j in range(pos, end): |         for j in range(pos, end): | ||||||
|             item = self.docList[j] |             item = self.docList[j] | ||||||
|             if item.find('=') >= 0: |             if item.find(b'=') >= 0: | ||||||
|                 (name, argres) = item.split('=',1) |                 (name, argres) = item.split(b'=',1) | ||||||
|             else : |             else : | ||||||
|                 name = item |                 name = item | ||||||
|                 argres = '' |                 argres = '' | ||||||
|  |             if (isinstance(tagpath,str)): | ||||||
|  |                 tagpath = tagpath.encode('utf-8') | ||||||
|             if name.endswith(tagpath) : |             if name.endswith(tagpath) : | ||||||
|                 result = argres |                 result = argres | ||||||
|                 foundat = j |                 foundat = j | ||||||
|  | @ -170,7 +173,7 @@ class DocParser(object): | ||||||
|         argres=[] |         argres=[] | ||||||
|         (foundat, argt) = self.findinDoc(tagpath, pos, end) |         (foundat, argt) = self.findinDoc(tagpath, pos, end) | ||||||
|         if (argt != None) and (len(argt) > 0) : |         if (argt != None) and (len(argt) > 0) : | ||||||
|             argList = argt.split('|') |             argList = argt.split(b'|') | ||||||
|             argres = [ int(strval) for strval in argList] |             argres = [ int(strval) for strval in argList] | ||||||
|         return argres |         return argres | ||||||
| 
 | 
 | ||||||
|  | @ -191,21 +194,21 @@ class DocParser(object): | ||||||
| 
 | 
 | ||||||
|         # also some class names have spaces in them so need to convert to dashes |         # also some class names have spaces in them so need to convert to dashes | ||||||
|         if nclass != None : |         if nclass != None : | ||||||
|             nclass = nclass.replace(' ','-') |             nclass = nclass.replace(b' ',b'-') | ||||||
|             classres = '' |             classres = b'' | ||||||
|             nclass = nclass.lower() |             nclass = nclass.lower() | ||||||
|             nclass = 'cl-' + nclass |             nclass = b'cl-' + nclass | ||||||
|             baseclass = '' |             baseclass = b'' | ||||||
|             # graphic is the base class for captions |             # graphic is the base class for captions | ||||||
|             if nclass.find('cl-cap-') >=0 : |             if nclass.find(b'cl-cap-') >=0 : | ||||||
|                 classres = 'graphic' + ' ' |                 classres = b'graphic' + b' ' | ||||||
|             else : |             else : | ||||||
|                 # strip to find baseclass |                 # strip to find baseclass | ||||||
|                 p = nclass.find('_') |                 p = nclass.find(b'_') | ||||||
|                 if p > 0 : |                 if p > 0 : | ||||||
|                     baseclass = nclass[0:p] |                     baseclass = nclass[0:p] | ||||||
|                     if baseclass in self.classList: |                     if baseclass in self.classList: | ||||||
|                         classres += baseclass + ' ' |                         classres += baseclass + b' ' | ||||||
|             classres += nclass |             classres += nclass | ||||||
|             nclass = classres |             nclass = classres | ||||||
|         return nclass |         return nclass | ||||||
|  | @ -225,11 +228,11 @@ class DocParser(object): | ||||||
|             return -1 |             return -1 | ||||||
| 
 | 
 | ||||||
|         result = [] |         result = [] | ||||||
|         (pos, pagetype) = self.findinDoc('page.type',0,-1) |         (pos, pagetype) = self.findinDoc(b'page.type',0,-1) | ||||||
| 
 | 
 | ||||||
|         groupList = self.posinDoc('page.group') |         groupList = self.posinDoc(b'page.group') | ||||||
|         groupregionList = self.posinDoc('page.group.region') |         groupregionList = self.posinDoc(b'page.group.region') | ||||||
|         pageregionList = self.posinDoc('page.region') |         pageregionList = self.posinDoc(b'page.region') | ||||||
|         # integrate into one list |         # integrate into one list | ||||||
|         for j in groupList: |         for j in groupList: | ||||||
|             result.append(('grpbeg',j)) |             result.append(('grpbeg',j)) | ||||||
|  | @ -237,7 +240,7 @@ class DocParser(object): | ||||||
|             result.append(('gregion',j)) |             result.append(('gregion',j)) | ||||||
|         for j in pageregionList: |         for j in pageregionList: | ||||||
|             result.append(('pregion',j)) |             result.append(('pregion',j)) | ||||||
|         result.sort(compare) |         result.sort(key=functools.cmp_to_key(compare)) | ||||||
| 
 | 
 | ||||||
|         # insert group end and page end indicators |         # insert group end and page end indicators | ||||||
|         inGroup = False |         inGroup = False | ||||||
|  | @ -267,33 +270,33 @@ class DocParser(object): | ||||||
|         result = [] |         result = [] | ||||||
| 
 | 
 | ||||||
|         # paragraph |         # paragraph | ||||||
|         (pos, pclass) = self.findinDoc('paragraph.class',start,end) |         (pos, pclass) = self.findinDoc(b'paragraph.class',start,end) | ||||||
| 
 | 
 | ||||||
|         pclass = self.getClass(pclass) |         pclass = self.getClass(pclass) | ||||||
| 
 | 
 | ||||||
|         # if paragraph uses extratokens (extra glyphs) then make it fixed |         # if paragraph uses extratokens (extra glyphs) then make it fixed | ||||||
|         (pos, extraglyphs) = self.findinDoc('paragraph.extratokens',start,end) |         (pos, extraglyphs) = self.findinDoc(b'paragraph.extratokens',start,end) | ||||||
| 
 | 
 | ||||||
|         # build up a description of the paragraph in result and return it |         # build up a description of the paragraph in result and return it | ||||||
|         # first check for the  basic - all words paragraph |         # first check for the  basic - all words paragraph | ||||||
|         (pos, sfirst) = self.findinDoc('paragraph.firstWord',start,end) |         (pos, sfirst) = self.findinDoc(b'paragraph.firstWord',start,end) | ||||||
|         (pos, slast) = self.findinDoc('paragraph.lastWord',start,end) |         (pos, slast) = self.findinDoc(b'paragraph.lastWord',start,end) | ||||||
|         if (sfirst != None) and (slast != None) : |         if (sfirst != None) and (slast != None) : | ||||||
|             first = int(sfirst) |             first = int(sfirst) | ||||||
|             last = int(slast) |             last = int(slast) | ||||||
| 
 | 
 | ||||||
|             makeImage = (regtype == 'vertical') or (regtype == 'table') |             makeImage = (regtype == b'vertical') or (regtype == b'table') | ||||||
|             makeImage = makeImage or (extraglyphs != None) |             makeImage = makeImage or (extraglyphs != None) | ||||||
|             if self.fixedimage: |             if self.fixedimage: | ||||||
|                 makeImage = makeImage or (regtype == 'fixed') |                 makeImage = makeImage or (regtype == b'fixed') | ||||||
| 
 | 
 | ||||||
|             if (pclass != None): |             if (pclass != None): | ||||||
|                 makeImage = makeImage or (pclass.find('.inverted') >= 0) |                 makeImage = makeImage or (pclass.find(b'.inverted') >= 0) | ||||||
|                 if self.fixedimage : |                 if self.fixedimage : | ||||||
|                     makeImage = makeImage or (pclass.find('cl-f-') >= 0) |                     makeImage = makeImage or (pclass.find(b'cl-f-') >= 0) | ||||||
| 
 | 
 | ||||||
|             # before creating an image make sure glyph info exists |             # before creating an image make sure glyph info exists | ||||||
|             gidList = self.getData('info.glyph.glyphID',0,-1) |             gidList = self.getData(b'info.glyph.glyphID',0,-1) | ||||||
| 
 | 
 | ||||||
|             makeImage = makeImage & (len(gidList) > 0) |             makeImage = makeImage & (len(gidList) > 0) | ||||||
| 
 | 
 | ||||||
|  | @ -307,8 +310,8 @@ class DocParser(object): | ||||||
|             # translate first and last word into first and last glyphs |             # translate first and last word into first and last glyphs | ||||||
|             # and generate inline image and include it |             # and generate inline image and include it | ||||||
|             glyphList = [] |             glyphList = [] | ||||||
|             firstglyphList = self.getData('word.firstGlyph',0,-1) |             firstglyphList = self.getData(b'word.firstGlyph',0,-1) | ||||||
|             gidList = self.getData('info.glyph.glyphID',0,-1) |             gidList = self.getData(b'info.glyph.glyphID',0,-1) | ||||||
|             firstGlyph = firstglyphList[first] |             firstGlyph = firstglyphList[first] | ||||||
|             if last < len(firstglyphList): |             if last < len(firstglyphList): | ||||||
|                 lastGlyph = firstglyphList[last] |                 lastGlyph = firstglyphList[last] | ||||||
|  | @ -326,8 +329,8 @@ class DocParser(object): | ||||||
|             for glyphnum in range(firstGlyph, lastGlyph): |             for glyphnum in range(firstGlyph, lastGlyph): | ||||||
|                 glyphList.append(glyphnum) |                 glyphList.append(glyphnum) | ||||||
|             # include any extratokens if they exist |             # include any extratokens if they exist | ||||||
|             (pos, sfg) = self.findinDoc('extratokens.firstGlyph',start,end) |             (pos, sfg) = self.findinDoc(b'extratokens.firstGlyph',start,end) | ||||||
|             (pos, slg) = self.findinDoc('extratokens.lastGlyph',start,end) |             (pos, slg) = self.findinDoc(b'extratokens.lastGlyph',start,end) | ||||||
|             if (sfg != None) and (slg != None): |             if (sfg != None) and (slg != None): | ||||||
|                 for glyphnum in range(int(sfg), int(slg)): |                 for glyphnum in range(int(sfg), int(slg)): | ||||||
|                     glyphList.append(glyphnum) |                     glyphList.append(glyphnum) | ||||||
|  | @ -368,39 +371,39 @@ class DocParser(object): | ||||||
| 
 | 
 | ||||||
|             (name, argres) = self.lineinDoc(line) |             (name, argres) = self.lineinDoc(line) | ||||||
| 
 | 
 | ||||||
|             if name.endswith('span.firstWord') : |             if name.endswith(b'span.firstWord') : | ||||||
|                 sp_first = int(argres) |                 sp_first = int(argres) | ||||||
| 
 | 
 | ||||||
|             elif name.endswith('span.lastWord') : |             elif name.endswith(b'span.lastWord') : | ||||||
|                 sp_last = int(argres) |                 sp_last = int(argres) | ||||||
| 
 | 
 | ||||||
|             elif name.endswith('word.firstGlyph') : |             elif name.endswith(b'word.firstGlyph') : | ||||||
|                 gl_first = int(argres) |                 gl_first = int(argres) | ||||||
| 
 | 
 | ||||||
|             elif name.endswith('word.lastGlyph') : |             elif name.endswith(b'word.lastGlyph') : | ||||||
|                 gl_last = int(argres) |                 gl_last = int(argres) | ||||||
| 
 | 
 | ||||||
|             elif name.endswith('word_semantic.firstWord'): |             elif name.endswith(b'word_semantic.firstWord'): | ||||||
|                 ws_first = int(argres) |                 ws_first = int(argres) | ||||||
| 
 | 
 | ||||||
|             elif name.endswith('word_semantic.lastWord'): |             elif name.endswith(b'word_semantic.lastWord'): | ||||||
|                 ws_last = int(argres) |                 ws_last = int(argres) | ||||||
| 
 | 
 | ||||||
|             elif name.endswith('word.class'): |             elif name.endswith(b'word.class'): | ||||||
|                 # we only handle spaceafter word class |                 # we only handle spaceafter word class | ||||||
|                 try: |                 try: | ||||||
|                     (cname, space) = argres.split('-',1) |                     (cname, space) = argres.split(b'-',1) | ||||||
|                     if space == '' : space = '0' |                     if space == b'' : space = b'0' | ||||||
|                     if (cname == 'spaceafter') and (int(space) > 0) : |                     if (cname == b'spaceafter') and (int(space) > 0) : | ||||||
|                         word_class = 'sa' |                         word_class = 'sa' | ||||||
|                 except: |                 except: | ||||||
|                     pass |                     pass | ||||||
| 
 | 
 | ||||||
|             elif name.endswith('word.img.src'): |             elif name.endswith(b'word.img.src'): | ||||||
|                 result.append(('img' + word_class, int(argres))) |                 result.append(('img' + word_class, int(argres))) | ||||||
|                 word_class = '' |                 word_class = '' | ||||||
| 
 | 
 | ||||||
|             elif name.endswith('region.img.src'): |             elif name.endswith(b'region.img.src'): | ||||||
|                 result.append(('img' + word_class, int(argres))) |                 result.append(('img' + word_class, int(argres))) | ||||||
| 
 | 
 | ||||||
|             if (sp_first != -1) and (sp_last != -1): |             if (sp_first != -1) and (sp_last != -1): | ||||||
|  | @ -437,7 +440,7 @@ class DocParser(object): | ||||||
| 
 | 
 | ||||||
|         classres = '' |         classres = '' | ||||||
|         if pclass : |         if pclass : | ||||||
|             classres = ' class="' + pclass + '"' |             classres = ' class="' + pclass.decode('utf-8') + '"' | ||||||
| 
 | 
 | ||||||
|         br_lb = (regtype == 'fixed') or (regtype == 'chapterheading') or (regtype == 'vertical') |         br_lb = (regtype == 'fixed') or (regtype == 'chapterheading') or (regtype == 'vertical') | ||||||
| 
 | 
 | ||||||
|  | @ -470,8 +473,8 @@ class DocParser(object): | ||||||
|                     if (link > 0): |                     if (link > 0): | ||||||
|                         linktype = self.link_type[link-1] |                         linktype = self.link_type[link-1] | ||||||
|                         title = self.link_title[link-1] |                         title = self.link_title[link-1] | ||||||
|                         if (title == "") or (parares.rfind(title) < 0): |                         if (title == b"") or (parares.rfind(title.decode('utf-8')) < 0): | ||||||
|                             title=parares[lstart:] |                             title=parares[lstart:].encode('utf-8') | ||||||
|                         if linktype == 'external' : |                         if linktype == 'external' : | ||||||
|                             linkhref = self.link_href[link-1] |                             linkhref = self.link_href[link-1] | ||||||
|                             linkhtml = '<a href="%s">' % linkhref |                             linkhtml = '<a href="%s">' % linkhref | ||||||
|  | @ -482,33 +485,34 @@ class DocParser(object): | ||||||
|                             else : |                             else : | ||||||
|                                 # just link to the current page |                                 # just link to the current page | ||||||
|                                 linkhtml = '<a href="#' + self.id + '">' |                                 linkhtml = '<a href="#' + self.id + '">' | ||||||
|                         linkhtml += title + '</a>' |                         linkhtml += title.decode('utf-8') | ||||||
|                         pos = parares.rfind(title) |                         linkhtml += '</a>' | ||||||
|  |                         pos = parares.rfind(title.decode('utf-8')) | ||||||
|                         if pos >= 0: |                         if pos >= 0: | ||||||
|                             parares = parares[0:pos] + linkhtml + parares[pos+len(title):] |                             parares = parares[0:pos] + linkhtml + parares[pos+len(title):] | ||||||
|                         else : |                         else : | ||||||
|                             parares += linkhtml |                             parares += linkhtml | ||||||
|                         lstart = len(parares) |                         lstart = len(parares) | ||||||
|                         if word == '_link_' : word = '' |                         if word == b'_link_' : word = b'' | ||||||
|                     elif (link < 0) : |                     elif (link < 0) : | ||||||
|                         if word == '_link_' : word = '' |                         if word == b'_link_' : word = b'' | ||||||
| 
 | 
 | ||||||
|                 if word == '_lb_': |                 if word == b'_lb_': | ||||||
|                     if ((num-1) in self.dehyphen_rootid ) or handle_links: |                     if ((num-1) in self.dehyphen_rootid ) or handle_links: | ||||||
|                         word = '' |                         word = b'' | ||||||
|                         sep = '' |                         sep = '' | ||||||
|                     elif br_lb : |                     elif br_lb : | ||||||
|                         word = '<br />\n' |                         word = b'<br />\n' | ||||||
|                         sep = '' |                         sep = '' | ||||||
|                     else : |                     else : | ||||||
|                         word = '\n' |                         word = b'\n' | ||||||
|                         sep = '' |                         sep = '' | ||||||
| 
 | 
 | ||||||
|                 if num in self.dehyphen_rootid : |                 if num in self.dehyphen_rootid : | ||||||
|                     word = word[0:-1] |                     word = word[0:-1] | ||||||
|                     sep = '' |                     sep = '' | ||||||
| 
 | 
 | ||||||
|                 parares += word + sep |                 parares += word.decode('utf-8') + sep | ||||||
| 
 | 
 | ||||||
|             elif wtype == 'img' : |             elif wtype == 'img' : | ||||||
|                 sep = '' |                 sep = '' | ||||||
|  | @ -522,7 +526,9 @@ class DocParser(object): | ||||||
| 
 | 
 | ||||||
|             elif wtype == 'svg' : |             elif wtype == 'svg' : | ||||||
|                 sep = '' |                 sep = '' | ||||||
|                 parares += '<img src="img/' + self.id + '_%04d.svg" alt="" />' % num |                 parares += '<img src="img/' | ||||||
|  |                 parares += self.id | ||||||
|  |                 parares += '_%04d.svg" alt="" />' % num | ||||||
|                 parares += sep |                 parares += sep | ||||||
| 
 | 
 | ||||||
|         if len(sep) > 0 : parares = parares[0:-1] |         if len(sep) > 0 : parares = parares[0:-1] | ||||||
|  | @ -545,7 +551,7 @@ class DocParser(object): | ||||||
|             (wtype, num) = pdesc[j] |             (wtype, num) = pdesc[j] | ||||||
| 
 | 
 | ||||||
|             if wtype == 'ocr' : |             if wtype == 'ocr' : | ||||||
|                 word = self.ocrtext[num] |                 word = self.ocrtext[num].decode('utf-8') | ||||||
|                 sep = ' ' |                 sep = ' ' | ||||||
| 
 | 
 | ||||||
|                 if handle_links: |                 if handle_links: | ||||||
|  | @ -553,7 +559,7 @@ class DocParser(object): | ||||||
|                     if (link > 0): |                     if (link > 0): | ||||||
|                         linktype = self.link_type[link-1] |                         linktype = self.link_type[link-1] | ||||||
|                         title = self.link_title[link-1] |                         title = self.link_title[link-1] | ||||||
|                         title = title.rstrip('. ') |                         title = title.rstrip(b'. ') | ||||||
|                         alt_title = parares[lstart:] |                         alt_title = parares[lstart:] | ||||||
|                         alt_title = alt_title.strip() |                         alt_title = alt_title.strip() | ||||||
|                         # now strip off the actual printed page number |                         # now strip off the actual printed page number | ||||||
|  | @ -607,38 +613,38 @@ class DocParser(object): | ||||||
|         hlst = [] |         hlst = [] | ||||||
| 
 | 
 | ||||||
|         # get the ocr text |         # get the ocr text | ||||||
|         (pos, argres) = self.findinDoc('info.word.ocrText',0,-1) |         (pos, argres) = self.findinDoc(b'info.word.ocrText',0,-1) | ||||||
|         if argres :  self.ocrtext = argres.split('|') |         if argres :  self.ocrtext = argres.split(b'|') | ||||||
| 
 | 
 | ||||||
|         # get information to dehyphenate the text |         # get information to dehyphenate the text | ||||||
|         self.dehyphen_rootid = self.getData('info.dehyphen.rootID',0,-1) |         self.dehyphen_rootid = self.getData(b'info.dehyphen.rootID',0,-1) | ||||||
| 
 | 
 | ||||||
|         # determine if first paragraph is continued from previous page |         # determine if first paragraph is continued from previous page | ||||||
|         (pos, self.parastems_stemid) = self.findinDoc('info.paraStems.stemID',0,-1) |         (pos, self.parastems_stemid) = self.findinDoc(b'info.paraStems.stemID',0,-1) | ||||||
|         first_para_continued = (self.parastems_stemid  != None) |         first_para_continued = (self.parastems_stemid  != None) | ||||||
| 
 | 
 | ||||||
|         # determine if last paragraph is continued onto the next page |         # determine if last paragraph is continued onto the next page | ||||||
|         (pos, self.paracont_stemid) = self.findinDoc('info.paraCont.stemID',0,-1) |         (pos, self.paracont_stemid) = self.findinDoc(b'info.paraCont.stemID',0,-1) | ||||||
|         last_para_continued = (self.paracont_stemid != None) |         last_para_continued = (self.paracont_stemid != None) | ||||||
| 
 | 
 | ||||||
|         # collect link ids |         # collect link ids | ||||||
|         self.link_id = self.getData('info.word.link_id',0,-1) |         self.link_id = self.getData(b'info.word.link_id',0,-1) | ||||||
| 
 | 
 | ||||||
|         # collect link destination page numbers |         # collect link destination page numbers | ||||||
|         self.link_page = self.getData('info.links.page',0,-1) |         self.link_page = self.getData(b'info.links.page',0,-1) | ||||||
| 
 | 
 | ||||||
|         # collect link types (container versus external) |         # collect link types (container versus external) | ||||||
|         (pos, argres) = self.findinDoc('info.links.type',0,-1) |         (pos, argres) = self.findinDoc(b'info.links.type',0,-1) | ||||||
|         if argres :  self.link_type = argres.split('|') |         if argres :  self.link_type = argres.split(b'|') | ||||||
| 
 | 
 | ||||||
|         # collect link destinations |         # collect link destinations | ||||||
|         (pos, argres) = self.findinDoc('info.links.href',0,-1) |         (pos, argres) = self.findinDoc(b'info.links.href',0,-1) | ||||||
|         if argres :  self.link_href = argres.split('|') |         if argres :  self.link_href = argres.split(b'|') | ||||||
| 
 | 
 | ||||||
|         # collect link titles |         # collect link titles | ||||||
|         (pos, argres) = self.findinDoc('info.links.title',0,-1) |         (pos, argres) = self.findinDoc(b'info.links.title',0,-1) | ||||||
|         if argres : |         if argres : | ||||||
|             self.link_title = argres.split('|') |             self.link_title = argres.split(b'|') | ||||||
|         else: |         else: | ||||||
|             self.link_title.append('') |             self.link_title.append('') | ||||||
| 
 | 
 | ||||||
|  | @ -662,51 +668,51 @@ class DocParser(object): | ||||||
|             # set anchor for link target on this page |             # set anchor for link target on this page | ||||||
|             if not anchorSet and not first_para_continued: |             if not anchorSet and not first_para_continued: | ||||||
|                 hlst.append('<div style="visibility: hidden; height: 0; width: 0;" id="') |                 hlst.append('<div style="visibility: hidden; height: 0; width: 0;" id="') | ||||||
|                 hlst.append(self.id + '" title="pagetype_' + pagetype + '"></div>\n') |                 hlst.append(self.id + '" title="pagetype_' + pagetype.decode('utf-8') + '"></div>\n') | ||||||
|                 anchorSet = True |                 anchorSet = True | ||||||
| 
 | 
 | ||||||
|             # handle groups of graphics with text captions |             # handle groups of graphics with text captions | ||||||
|             if (etype == 'grpbeg'): |             if (etype == b'grpbeg'): | ||||||
|                 (pos, grptype) = self.findinDoc('group.type', start, end) |                 (pos, grptype) = self.findinDoc(b'group.type', start, end) | ||||||
|                 if grptype != None: |                 if grptype != None: | ||||||
|                     if grptype == 'graphic': |                     if grptype == b'graphic': | ||||||
|                         gcstr = ' class="' + grptype + '"' |                         gcstr = ' class="' + grptype.decode('utf-8') + '"' | ||||||
|                         hlst.append('<div' + gcstr + '>') |                         hlst.append('<div' + gcstr + '>') | ||||||
|                         inGroup = True |                         inGroup = True | ||||||
| 
 | 
 | ||||||
|             elif (etype == 'grpend'): |             elif (etype == b'grpend'): | ||||||
|                 if inGroup: |                 if inGroup: | ||||||
|                     hlst.append('</div>\n') |                     hlst.append('</div>\n') | ||||||
|                     inGroup = False |                     inGroup = False | ||||||
| 
 | 
 | ||||||
|             else: |             else: | ||||||
|                 (pos, regtype) = self.findinDoc('region.type',start,end) |                 (pos, regtype) = self.findinDoc(b'region.type',start,end) | ||||||
| 
 | 
 | ||||||
|                 if regtype == 'graphic' : |                 if regtype == b'graphic' : | ||||||
|                     (pos, simgsrc) = self.findinDoc('img.src',start,end) |                     (pos, simgsrc) = self.findinDoc(b'img.src',start,end) | ||||||
|                     if simgsrc: |                     if simgsrc: | ||||||
|                         if inGroup: |                         if inGroup: | ||||||
|                             hlst.append('<img src="img/img%04d.jpg" alt="" />' % int(simgsrc)) |                             hlst.append('<img src="img/img%04d.jpg" alt="" />' % int(simgsrc)) | ||||||
|                         else: |                         else: | ||||||
|                             hlst.append('<div class="graphic"><img src="img/img%04d.jpg" alt="" /></div>' % int(simgsrc)) |                             hlst.append('<div class="graphic"><img src="img/img%04d.jpg" alt="" /></div>' % int(simgsrc)) | ||||||
| 
 | 
 | ||||||
|                 elif regtype == 'chapterheading' : |                 elif regtype == b'chapterheading' : | ||||||
|                     (pclass, pdesc) = self.getParaDescription(start,end, regtype) |                     (pclass, pdesc) = self.getParaDescription(start,end, regtype) | ||||||
|                     if not breakSet: |                     if not breakSet: | ||||||
|                         hlst.append('<div style="page-break-after: always;"> </div>\n') |                         hlst.append('<div style="page-break-after: always;"> </div>\n') | ||||||
|                         breakSet = True |                         breakSet = True | ||||||
|                     tag = 'h1' |                     tag = 'h1' | ||||||
|                     if pclass and (len(pclass) >= 7): |                     if pclass and (len(pclass) >= 7): | ||||||
|                         if pclass[3:7] == 'ch1-' : tag = 'h1' |                         if pclass[3:7] == b'ch1-' : tag = 'h1' | ||||||
|                         if pclass[3:7] == 'ch2-' : tag = 'h2' |                         if pclass[3:7] == b'ch2-' : tag = 'h2' | ||||||
|                         if pclass[3:7] == 'ch3-' : tag = 'h3' |                         if pclass[3:7] == b'ch3-' : tag = 'h3' | ||||||
|                         hlst.append('<' + tag + ' class="' + pclass + '">') |                         hlst.append('<' + tag + ' class="' + pclass.decode('utf-8') + '">') | ||||||
|                     else: |                     else: | ||||||
|                         hlst.append('<' + tag + '>') |                         hlst.append('<' + tag + '>') | ||||||
|                     hlst.append(self.buildParagraph(pclass, pdesc, 'middle', regtype)) |                     hlst.append(self.buildParagraph(pclass, pdesc, 'middle', regtype)) | ||||||
|                     hlst.append('</' + tag + '>') |                     hlst.append('</' + tag + '>') | ||||||
| 
 | 
 | ||||||
|                 elif (regtype == 'text') or (regtype == 'fixed') or (regtype == 'insert') or (regtype == 'listitem'): |                 elif (regtype == b'text') or (regtype == b'fixed') or (regtype == b'insert') or (regtype == b'listitem'): | ||||||
|                     ptype = 'full' |                     ptype = 'full' | ||||||
|                     # check to see if this is a continuation from the previous page |                     # check to see if this is a continuation from the previous page | ||||||
|                     if first_para_continued : |                     if first_para_continued : | ||||||
|  | @ -715,16 +721,16 @@ class DocParser(object): | ||||||
|                     (pclass, pdesc) = self.getParaDescription(start,end, regtype) |                     (pclass, pdesc) = self.getParaDescription(start,end, regtype) | ||||||
|                     if pclass and (len(pclass) >= 6) and (ptype == 'full'): |                     if pclass and (len(pclass) >= 6) and (ptype == 'full'): | ||||||
|                         tag = 'p' |                         tag = 'p' | ||||||
|                         if pclass[3:6] == 'h1-' : tag = 'h4' |                         if pclass[3:6] == b'h1-' : tag = 'h4' | ||||||
|                         if pclass[3:6] == 'h2-' : tag = 'h5' |                         if pclass[3:6] == b'h2-' : tag = 'h5' | ||||||
|                         if pclass[3:6] == 'h3-' : tag = 'h6' |                         if pclass[3:6] == b'h3-' : tag = 'h6' | ||||||
|                         hlst.append('<' + tag + ' class="' + pclass + '">') |                         hlst.append('<' + tag + ' class="' + pclass.decode('utf-8') + '">') | ||||||
|                         hlst.append(self.buildParagraph(pclass, pdesc, 'middle', regtype)) |                         hlst.append(self.buildParagraph(pclass, pdesc, 'middle', regtype)) | ||||||
|                         hlst.append('</' + tag + '>') |                         hlst.append('</' + tag + '>') | ||||||
|                     else : |                     else : | ||||||
|                         hlst.append(self.buildParagraph(pclass, pdesc, ptype, regtype)) |                         hlst.append(self.buildParagraph(pclass, pdesc, ptype, regtype)) | ||||||
| 
 | 
 | ||||||
|                 elif (regtype == 'tocentry') : |                 elif (regtype == b'tocentry') : | ||||||
|                     ptype = 'full' |                     ptype = 'full' | ||||||
|                     if first_para_continued : |                     if first_para_continued : | ||||||
|                         ptype = 'end' |                         ptype = 'end' | ||||||
|  | @ -733,7 +739,7 @@ class DocParser(object): | ||||||
|                     tocinfo += self.buildTOCEntry(pdesc) |                     tocinfo += self.buildTOCEntry(pdesc) | ||||||
|                     hlst.append(self.buildParagraph(pclass, pdesc, ptype, regtype)) |                     hlst.append(self.buildParagraph(pclass, pdesc, ptype, regtype)) | ||||||
| 
 | 
 | ||||||
|                 elif (regtype == 'vertical') or (regtype == 'table') : |                 elif (regtype == b'vertical') or (regtype == b'table') : | ||||||
|                     ptype = 'full' |                     ptype = 'full' | ||||||
|                     if inGroup: |                     if inGroup: | ||||||
|                         ptype = 'middle' |                         ptype = 'middle' | ||||||
|  | @ -744,19 +750,19 @@ class DocParser(object): | ||||||
|                     hlst.append(self.buildParagraph(pclass, pdesc, ptype, regtype)) |                     hlst.append(self.buildParagraph(pclass, pdesc, ptype, regtype)) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|                 elif (regtype == 'synth_fcvr.center'): |                 elif (regtype == b'synth_fcvr.center'): | ||||||
|                     (pos, simgsrc) = self.findinDoc('img.src',start,end) |                     (pos, simgsrc) = self.findinDoc(b'img.src',start,end) | ||||||
|                     if simgsrc: |                     if simgsrc: | ||||||
|                         hlst.append('<div class="graphic"><img src="img/img%04d.jpg" alt="" /></div>' % int(simgsrc)) |                         hlst.append('<div class="graphic"><img src="img/img%04d.jpg" alt="" /></div>' % int(simgsrc)) | ||||||
| 
 | 
 | ||||||
|                 else : |                 else : | ||||||
|                     print('          Making region type', regtype, end=' ') |                     print('          Making region type', regtype, end=' ') | ||||||
|                     (pos, temp) = self.findinDoc('paragraph',start,end) |                     (pos, temp) = self.findinDoc(b'paragraph',start,end) | ||||||
|                     (pos2, temp) = self.findinDoc('span',start,end) |                     (pos2, temp) = self.findinDoc(b'span',start,end) | ||||||
|                     if pos != -1 or pos2 != -1: |                     if pos != -1 or pos2 != -1: | ||||||
|                         print(' a "text" region') |                         print(' a "text" region') | ||||||
|                         orig_regtype = regtype |                         orig_regtype = regtype | ||||||
|                         regtype = 'fixed' |                         regtype = b'fixed' | ||||||
|                         ptype = 'full' |                         ptype = 'full' | ||||||
|                         # check to see if this is a continuation from the previous page |                         # check to see if this is a continuation from the previous page | ||||||
|                         if first_para_continued : |                         if first_para_continued : | ||||||
|  | @ -764,23 +770,23 @@ class DocParser(object): | ||||||
|                             first_para_continued = False |                             first_para_continued = False | ||||||
|                         (pclass, pdesc) = self.getParaDescription(start,end, regtype) |                         (pclass, pdesc) = self.getParaDescription(start,end, regtype) | ||||||
|                         if not pclass: |                         if not pclass: | ||||||
|                             if orig_regtype.endswith('.right')     : pclass = 'cl-right' |                             if orig_regtype.endswith(b'.right')     : pclass = 'cl-right' | ||||||
|                             elif orig_regtype.endswith('.center')  : pclass = 'cl-center' |                             elif orig_regtype.endswith(b'.center')  : pclass = 'cl-center' | ||||||
|                             elif orig_regtype.endswith('.left')    : pclass = 'cl-left' |                             elif orig_regtype.endswith(b'.left')    : pclass = 'cl-left' | ||||||
|                             elif orig_regtype.endswith('.justify') : pclass = 'cl-justify' |                             elif orig_regtype.endswith(b'.justify') : pclass = 'cl-justify' | ||||||
|                         if pclass and (ptype == 'full') and (len(pclass) >= 6): |                         if pclass and (ptype == 'full') and (len(pclass) >= 6): | ||||||
|                             tag = 'p' |                             tag = 'p' | ||||||
|                             if pclass[3:6] == 'h1-' : tag = 'h4' |                             if pclass[3:6] == b'h1-' : tag = 'h4' | ||||||
|                             if pclass[3:6] == 'h2-' : tag = 'h5' |                             if pclass[3:6] == b'h2-' : tag = 'h5' | ||||||
|                             if pclass[3:6] == 'h3-' : tag = 'h6' |                             if pclass[3:6] == b'h3-' : tag = 'h6' | ||||||
|                             hlst.append('<' + tag + ' class="' + pclass + '">') |                             hlst.append('<' + tag + ' class="' + pclass.decode('utf-8') + '">') | ||||||
|                             hlst.append(self.buildParagraph(pclass, pdesc, 'middle', regtype)) |                             hlst.append(self.buildParagraph(pclass, pdesc, 'middle', regtype)) | ||||||
|                             hlst.append('</' + tag + '>') |                             hlst.append('</' + tag + '>') | ||||||
|                         else : |                         else : | ||||||
|                             hlst.append(self.buildParagraph(pclass, pdesc, ptype, regtype)) |                             hlst.append(self.buildParagraph(pclass, pdesc, ptype, regtype)) | ||||||
|                     else : |                     else : | ||||||
|                         print(' a "graphic" region') |                         print(' a "graphic" region') | ||||||
|                         (pos, simgsrc) = self.findinDoc('img.src',start,end) |                         (pos, simgsrc) = self.findinDoc(b'img.src',start,end) | ||||||
|                         if simgsrc: |                         if simgsrc: | ||||||
|                             hlst.append('<div class="graphic"><img src="img/img%04d.jpg" alt="" /></div>' % int(simgsrc)) |                             hlst.append('<div class="graphic"><img src="img/img%04d.jpg" alt="" /></div>' % int(simgsrc)) | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -12,7 +12,7 @@ from struct import unpack | ||||||
| class PParser(object): | class PParser(object): | ||||||
|     def __init__(self, gd, flatxml, meta_array): |     def __init__(self, gd, flatxml, meta_array): | ||||||
|         self.gd = gd |         self.gd = gd | ||||||
|         self.flatdoc = flatxml.split('\n') |         self.flatdoc = flatxml.split(b'\n') | ||||||
|         self.docSize = len(self.flatdoc) |         self.docSize = len(self.flatdoc) | ||||||
|         self.temp = [] |         self.temp = [] | ||||||
| 
 | 
 | ||||||
|  | @ -58,11 +58,11 @@ class PParser(object): | ||||||
|     def lineinDoc(self, pos) : |     def lineinDoc(self, pos) : | ||||||
|         if (pos >= 0) and (pos < self.docSize) : |         if (pos >= 0) and (pos < self.docSize) : | ||||||
|             item = self.flatdoc[pos] |             item = self.flatdoc[pos] | ||||||
|             if item.find('=') >= 0: |             if item.find(b'=') >= 0: | ||||||
|                 (name, argres) = item.split('=',1) |                 (name, argres) = item.split(b'=',1) | ||||||
|             else : |             else : | ||||||
|                 name = item |                 name = item | ||||||
|                 argres = '' |                 argres = b'' | ||||||
|         return name, argres |         return name, argres | ||||||
| 
 | 
 | ||||||
|     # find tag in doc if within pos to end inclusive |     # find tag in doc if within pos to end inclusive | ||||||
|  | @ -75,11 +75,13 @@ class PParser(object): | ||||||
|         foundat = -1 |         foundat = -1 | ||||||
|         for j in range(pos, end): |         for j in range(pos, end): | ||||||
|             item = self.flatdoc[j] |             item = self.flatdoc[j] | ||||||
|             if item.find('=') >= 0: |             if item.find(b'=') >= 0: | ||||||
|                 (name, argres) = item.split('=',1) |                 (name, argres) = item.split(b'=',1) | ||||||
|             else : |             else : | ||||||
|                 name = item |                 name = item | ||||||
|                 argres = '' |                 argres = b'' | ||||||
|  |             if (isinstance(tagpath,str)): | ||||||
|  |                 tagpath = tagpath.encode('utf-8') | ||||||
|             if name.endswith(tagpath) : |             if name.endswith(tagpath) : | ||||||
|                 result = argres |                 result = argres | ||||||
|                 foundat = j |                 foundat = j | ||||||
|  | @ -103,9 +105,9 @@ class PParser(object): | ||||||
|         cnt = len(self.flatdoc) |         cnt = len(self.flatdoc) | ||||||
|         for j in range(cnt): |         for j in range(cnt): | ||||||
|             item = self.flatdoc[j] |             item = self.flatdoc[j] | ||||||
|             if item.find('=') >= 0: |             if item.find(b'=') >= 0: | ||||||
|                 (name, argt) = item.split('=') |                 (name, argt) = item.split(b'=') | ||||||
|                 argres = argt.split('|') |                 argres = argt.split(b'|') | ||||||
|             else: |             else: | ||||||
|                 name = item |                 name = item | ||||||
|                 argres = [] |                 argres = [] | ||||||
|  | @ -120,15 +122,17 @@ class PParser(object): | ||||||
|     def getDataatPos(self, path, pos): |     def getDataatPos(self, path, pos): | ||||||
|         result = None |         result = None | ||||||
|         item = self.flatdoc[pos] |         item = self.flatdoc[pos] | ||||||
|         if item.find('=') >= 0: |         if item.find(b'=') >= 0: | ||||||
|             (name, argt) = item.split('=') |             (name, argt) = item.split(b'=') | ||||||
|             argres = argt.split('|') |             argres = argt.split(b'|') | ||||||
|         else: |         else: | ||||||
|             name = item |             name = item | ||||||
|             argres = [] |             argres = [] | ||||||
|         if (len(argres) > 0) : |         if (len(argres) > 0) : | ||||||
|             for j in range(0,len(argres)): |             for j in range(0,len(argres)): | ||||||
|                 argres[j] = int(argres[j]) |                 argres[j] = int(argres[j]) | ||||||
|  |         if (isinstance(path,str)): | ||||||
|  |             path = path.encode('utf-8') | ||||||
|         if (name.endswith(path)): |         if (name.endswith(path)): | ||||||
|             result = argres |             result = argres | ||||||
|         return result |         return result | ||||||
|  | @ -138,12 +142,14 @@ class PParser(object): | ||||||
|         cnt = len(self.temp) |         cnt = len(self.temp) | ||||||
|         for j in range(cnt): |         for j in range(cnt): | ||||||
|             item = self.temp[j] |             item = self.temp[j] | ||||||
|             if item.find('=') >= 0: |             if item.find(b'=') >= 0: | ||||||
|                 (name, argt) = item.split('=') |                 (name, argt) = item.split(b'=') | ||||||
|                 argres = argt.split('|') |                 argres = argt.split(b'|') | ||||||
|             else: |             else: | ||||||
|                 name = item |                 name = item | ||||||
|                 argres = [] |                 argres = [] | ||||||
|  |             if (isinstance(path,str)): | ||||||
|  |                 path = path.encode('utf-8') | ||||||
|             if (name.endswith(path)): |             if (name.endswith(path)): | ||||||
|                 result = argres |                 result = argres | ||||||
|                 self.temp.pop(j) |                 self.temp.pop(j) | ||||||
|  |  | ||||||
|  | @ -44,10 +44,10 @@ if inCalibre : | ||||||
|     from calibre_plugins.dedrm import flatxml2svg |     from calibre_plugins.dedrm import flatxml2svg | ||||||
|     from calibre_plugins.dedrm import stylexml2css |     from calibre_plugins.dedrm import stylexml2css | ||||||
| else : | else : | ||||||
|     from . import convert2xml |     import convert2xml | ||||||
|     from . import flatxml2html |     import flatxml2html | ||||||
|     from . import flatxml2svg |     import flatxml2svg | ||||||
|     from . import stylexml2css |     import stylexml2css | ||||||
| 
 | 
 | ||||||
| # global switch | # global switch | ||||||
| buildXML = False | buildXML = False | ||||||
|  | @ -117,10 +117,10 @@ class Dictionary(object): | ||||||
|             self.stable.append(self.escapestr(readString(self.fo))) |             self.stable.append(self.escapestr(readString(self.fo))) | ||||||
|         self.pos = 0 |         self.pos = 0 | ||||||
|     def escapestr(self, str): |     def escapestr(self, str): | ||||||
|         str = str.replace('&','&amp;') |         str = str.replace(b'&',b'&amp;') | ||||||
|         str = str.replace('<','&lt;') |         str = str.replace(b'<',b'&lt;') | ||||||
|         str = str.replace('>','&gt;') |         str = str.replace(b'>',b'&gt;') | ||||||
|         str = str.replace('=','&#61;') |         str = str.replace(b'=',b'&#61;') | ||||||
|         return str |         return str | ||||||
|     def lookup(self,val): |     def lookup(self,val): | ||||||
|         if ((val >= 0) and (val < self.size)) : |         if ((val >= 0) and (val < self.size)) : | ||||||
|  | @ -138,7 +138,7 @@ class Dictionary(object): | ||||||
| 
 | 
 | ||||||
| class PageDimParser(object): | class PageDimParser(object): | ||||||
|     def __init__(self, flatxml): |     def __init__(self, flatxml): | ||||||
|         self.flatdoc = flatxml.split('\n') |         self.flatdoc = flatxml.split(b'\n') | ||||||
|     # find tag if within pos to end inclusive |     # find tag if within pos to end inclusive | ||||||
|     def findinDoc(self, tagpath, pos, end) : |     def findinDoc(self, tagpath, pos, end) : | ||||||
|         result = None |         result = None | ||||||
|  | @ -151,8 +151,8 @@ class PageDimParser(object): | ||||||
|         foundat = -1 |         foundat = -1 | ||||||
|         for j in range(pos, end): |         for j in range(pos, end): | ||||||
|             item = docList[j] |             item = docList[j] | ||||||
|             if item.find('=') >= 0: |             if item.find(b'=') >= 0: | ||||||
|                 (name, argres) = item.split('=') |                 (name, argres) = item.split(b'=') | ||||||
|             else : |             else : | ||||||
|                 name = item |                 name = item | ||||||
|                 argres = '' |                 argres = '' | ||||||
|  | @ -162,8 +162,8 @@ class PageDimParser(object): | ||||||
|                 break |                 break | ||||||
|         return foundat, result |         return foundat, result | ||||||
|     def process(self): |     def process(self): | ||||||
|         (pos, sph) = self.findinDoc('page.h',0,-1) |         (pos, sph) = self.findinDoc(b'page.h',0,-1) | ||||||
|         (pos, spw) = self.findinDoc('page.w',0,-1) |         (pos, spw) = self.findinDoc(b'page.w',0,-1) | ||||||
|         if (sph == None): sph = '-1' |         if (sph == None): sph = '-1' | ||||||
|         if (spw == None): spw = '-1' |         if (spw == None): spw = '-1' | ||||||
|         return sph, spw |         return sph, spw | ||||||
|  | @ -176,21 +176,21 @@ def getPageDim(flatxml): | ||||||
| 
 | 
 | ||||||
| class GParser(object): | class GParser(object): | ||||||
|     def __init__(self, flatxml): |     def __init__(self, flatxml): | ||||||
|         self.flatdoc = flatxml.split('\n') |         self.flatdoc = flatxml.split(b'\n') | ||||||
|         self.dpi = 1440 |         self.dpi = 1440 | ||||||
|         self.gh = self.getData('info.glyph.h') |         self.gh = self.getData(b'info.glyph.h') | ||||||
|         self.gw = self.getData('info.glyph.w') |         self.gw = self.getData(b'info.glyph.w') | ||||||
|         self.guse = self.getData('info.glyph.use') |         self.guse = self.getData(b'info.glyph.use') | ||||||
|         if self.guse : |         if self.guse : | ||||||
|             self.count = len(self.guse) |             self.count = len(self.guse) | ||||||
|         else : |         else : | ||||||
|             self.count = 0 |             self.count = 0 | ||||||
|         self.gvtx = self.getData('info.glyph.vtx') |         self.gvtx = self.getData(b'info.glyph.vtx') | ||||||
|         self.glen = self.getData('info.glyph.len') |         self.glen = self.getData(b'info.glyph.len') | ||||||
|         self.gdpi = self.getData('info.glyph.dpi') |         self.gdpi = self.getData(b'info.glyph.dpi') | ||||||
|         self.vx = self.getData('info.vtx.x') |         self.vx = self.getData(b'info.vtx.x') | ||||||
|         self.vy = self.getData('info.vtx.y') |         self.vy = self.getData(b'info.vtx.y') | ||||||
|         self.vlen = self.getData('info.len.n') |         self.vlen = self.getData(b'info.len.n') | ||||||
|         if self.vlen : |         if self.vlen : | ||||||
|             self.glen.append(len(self.vlen)) |             self.glen.append(len(self.vlen)) | ||||||
|         elif self.glen: |         elif self.glen: | ||||||
|  | @ -204,9 +204,9 @@ class GParser(object): | ||||||
|         cnt = len(self.flatdoc) |         cnt = len(self.flatdoc) | ||||||
|         for j in range(cnt): |         for j in range(cnt): | ||||||
|             item = self.flatdoc[j] |             item = self.flatdoc[j] | ||||||
|             if item.find('=') >= 0: |             if item.find(b'=') >= 0: | ||||||
|                 (name, argt) = item.split('=') |                 (name, argt) = item.split(b'=') | ||||||
|                 argres = argt.split('|') |                 argres = argt.split(b'|') | ||||||
|             else: |             else: | ||||||
|                 name = item |                 name = item | ||||||
|                 argres = [] |                 argres = [] | ||||||
|  | @ -431,7 +431,7 @@ def generateBook(bookDir, raw, fixedimage): | ||||||
| 
 | 
 | ||||||
|     # now get the css info |     # now get the css info | ||||||
|     cssstr , classlst = stylexml2css.convert2CSS(flat_xml, fontsize, ph, pw) |     cssstr , classlst = stylexml2css.convert2CSS(flat_xml, fontsize, ph, pw) | ||||||
|     open(xname, 'wb').write(cssstr) |     open(xname, 'w').write(cssstr) | ||||||
|     if buildXML: |     if buildXML: | ||||||
|         xname = os.path.join(xmlDir, 'other0000.xml') |         xname = os.path.join(xmlDir, 'other0000.xml') | ||||||
|         open(xname, 'wb').write(convert2xml.getXML(dict, otherFile)) |         open(xname, 'wb').write(convert2xml.getXML(dict, otherFile)) | ||||||
|  | @ -525,7 +525,7 @@ def generateBook(bookDir, raw, fixedimage): | ||||||
|     hlst.append('</body>\n</html>\n') |     hlst.append('</body>\n</html>\n') | ||||||
|     htmlstr = "".join(hlst) |     htmlstr = "".join(hlst) | ||||||
|     hlst = None |     hlst = None | ||||||
|     open(os.path.join(bookDir, htmlFileName), 'wb').write(htmlstr) |     open(os.path.join(bookDir, htmlFileName), 'w').write(htmlstr) | ||||||
| 
 | 
 | ||||||
|     print(" ") |     print(" ") | ||||||
|     print('Extracting Table of Contents from Amazon OCR') |     print('Extracting Table of Contents from Amazon OCR') | ||||||
|  | @ -571,7 +571,7 @@ def generateBook(bookDir, raw, fixedimage): | ||||||
|     tlst.append('</body>\n') |     tlst.append('</body>\n') | ||||||
|     tlst.append('</html>\n') |     tlst.append('</html>\n') | ||||||
|     tochtml = "".join(tlst) |     tochtml = "".join(tlst) | ||||||
|     open(os.path.join(svgDir, 'toc.xhtml'), 'wb').write(tochtml) |     open(os.path.join(svgDir, 'toc.xhtml'), 'w').write(tochtml) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|     # now create index_svg.xhtml that points to all required files |     # now create index_svg.xhtml that points to all required files | ||||||
|  | @ -608,7 +608,7 @@ def generateBook(bookDir, raw, fixedimage): | ||||||
|         flst = [] |         flst = [] | ||||||
|         for page in pagelst: |         for page in pagelst: | ||||||
|             flst.append(xmllst[page]) |             flst.append(xmllst[page]) | ||||||
|         flat_svg = "".join(flst) |         flat_svg = b"".join(flst) | ||||||
|         flst=None |         flst=None | ||||||
|         svgxml = flatxml2svg.convert2SVG(gd, flat_svg, pageid, previd, nextid, svgDir, raw, meta_array, scaledpi) |         svgxml = flatxml2svg.convert2SVG(gd, flat_svg, pageid, previd, nextid, svgDir, raw, meta_array, scaledpi) | ||||||
|         if (raw) : |         if (raw) : | ||||||
|  | @ -626,7 +626,7 @@ def generateBook(bookDir, raw, fixedimage): | ||||||
|     slst.append('</body>\n</html>\n') |     slst.append('</body>\n</html>\n') | ||||||
|     svgindex = "".join(slst) |     svgindex = "".join(slst) | ||||||
|     slst = None |     slst = None | ||||||
|     open(os.path.join(bookDir, 'index_svg.xhtml'), 'wb').write(svgindex) |     open(os.path.join(bookDir, 'index_svg.xhtml'), 'w').write(svgindex) | ||||||
| 
 | 
 | ||||||
|     print(" ") |     print(" ") | ||||||
| 
 | 
 | ||||||
|  | @ -637,16 +637,16 @@ def generateBook(bookDir, raw, fixedimage): | ||||||
|     olst.append('<package xmlns="http://www.idpf.org/2007/opf" unique-identifier="guid_id">\n') |     olst.append('<package xmlns="http://www.idpf.org/2007/opf" unique-identifier="guid_id">\n') | ||||||
|     # adding metadata |     # adding metadata | ||||||
|     olst.append('   <metadata xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:opf="http://www.idpf.org/2007/opf">\n') |     olst.append('   <metadata xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:opf="http://www.idpf.org/2007/opf">\n') | ||||||
|     if 'GUID' in meta_array: |     if b'GUID' in meta_array: | ||||||
|         olst.append('      <dc:identifier opf:scheme="GUID" id="guid_id">' + meta_array['GUID'] + '</dc:identifier>\n') |         olst.append('      <dc:identifier opf:scheme="GUID" id="guid_id">' + meta_array[b'GUID'].decode('utf-8') + '</dc:identifier>\n') | ||||||
|     if 'ASIN' in meta_array: |     if b'ASIN' in meta_array: | ||||||
|         olst.append('      <dc:identifier opf:scheme="ASIN">' + meta_array['ASIN'] + '</dc:identifier>\n') |         olst.append('      <dc:identifier opf:scheme="ASIN">' + meta_array[b'ASIN'].decode('utf-8') + '</dc:identifier>\n') | ||||||
|     if 'oASIN' in meta_array: |     if b'oASIN' in meta_array: | ||||||
|         olst.append('      <dc:identifier opf:scheme="oASIN">' + meta_array['oASIN'] + '</dc:identifier>\n') |         olst.append('      <dc:identifier opf:scheme="oASIN">' + meta_array[b'oASIN'].decode('utf-8') + '</dc:identifier>\n') | ||||||
|     olst.append('      <dc:title>' + meta_array['Title'] + '</dc:title>\n') |     olst.append('      <dc:title>' + meta_array[b'Title'].decode('utf-8') + '</dc:title>\n') | ||||||
|     olst.append('      <dc:creator opf:role="aut">' + meta_array['Authors'] + '</dc:creator>\n') |     olst.append('      <dc:creator opf:role="aut">' + meta_array[b'Authors'].decode('utf-8') + '</dc:creator>\n') | ||||||
|     olst.append('      <dc:language>en</dc:language>\n') |     olst.append('      <dc:language>en</dc:language>\n') | ||||||
|     olst.append('      <dc:date>' + meta_array['UpdateTime'] + '</dc:date>\n') |     olst.append('      <dc:date>' + meta_array[b'UpdateTime'].decode('utf-8') + '</dc:date>\n') | ||||||
|     if isCover: |     if isCover: | ||||||
|         olst.append('      <meta name="cover" content="bookcover"/>\n') |         olst.append('      <meta name="cover" content="bookcover"/>\n') | ||||||
|     olst.append('   </metadata>\n') |     olst.append('   </metadata>\n') | ||||||
|  | @ -675,7 +675,7 @@ def generateBook(bookDir, raw, fixedimage): | ||||||
|     olst.append('</package>\n') |     olst.append('</package>\n') | ||||||
|     opfstr = "".join(olst) |     opfstr = "".join(olst) | ||||||
|     olst = None |     olst = None | ||||||
|     open(opfname, 'wb').write(opfstr) |     open(opfname, 'w').write(opfstr) | ||||||
| 
 | 
 | ||||||
|     print('Processing Complete') |     print('Processing Complete') | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -49,14 +49,15 @@ def SHA1(message): | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| # Encode the bytes in data with the characters in map | # Encode the bytes in data with the characters in map | ||||||
|  | # data and map should be byte arrays | ||||||
def encode(data, map):
    """Encode every byte of ``data`` as two characters drawn from ``map``.

    For each input byte the first output character is picked with the
    index ``(byte ^ 0x80) // len(map)`` and the second with
    ``byte % len(map)``.

    Args:
        data: bytes-like object (``bytes`` or ``bytearray``) to encode.
        map: bytes-like alphabet the output characters are taken from.
            With fewer than 16 entries the quotient index can run past
            the end of the alphabet for some byte values.

    Returns:
        A ``bytes`` object twice as long as ``data``.

    Raises:
        IndexError: if ``map`` is too short for a computed quotient index.
        ZeroDivisionError: if ``map`` is empty and ``data`` is not.
    """
    size = len(map)  # loop-invariant, so computed once
    # Accumulate in a mutable buffer: appending to an immutable bytes object
    # re-copies the whole result on every step.
    encoded = bytearray()
    for value in data:
        encoded.append(map[(value ^ 0x80) // size])
        encoded.append(map[value % size])
    return bytes(encoded)
| 
 | 
 | ||||||
| # Hash the bytes in data and then encode the digest with the characters in map | # Hash the bytes in data and then encode the digest with the characters in map | ||||||
|  | @ -117,7 +118,7 @@ def generatePidEncryptionTable() : | ||||||
def generatePidSeed(table, dsn):
    """Fold the first four bytes of ``dsn`` through ``table`` into a seed.

    Each step XORs the next byte of ``dsn`` with the running seed, keeps the
    low eight bits as an index into ``table``, and combines that table entry
    with the running seed shifted right by eight bits.

    Args:
        table: indexable lookup table of integers; indices 0-255 are used.
        dsn: bytes-like object; only its first four bytes are read.

    Returns:
        The integer seed left after the four rounds.

    Raises:
        IndexError: if ``dsn`` holds fewer than four bytes.
    """
    seed = 0
    position = 0
    while position < 4:
        slot = (dsn[position] ^ seed) & 0xFF
        seed = table[slot] ^ (seed >> 8)
        position += 1
    return seed
| 
 | 
 | ||||||
|  | @ -129,7 +130,7 @@ def generateDevicePID(table,dsn,nbRoll): | ||||||
|     pid = [(seed >>24) &0xFF,(seed >> 16) &0xff,(seed >> 8) &0xFF,(seed) & 0xFF,(seed>>24) & 0xFF,(seed >> 16) &0xff,(seed >> 8) &0xFF,(seed) & 0xFF] |     pid = [(seed >>24) &0xFF,(seed >> 16) &0xff,(seed >> 8) &0xFF,(seed) & 0xFF,(seed>>24) & 0xFF,(seed >> 16) &0xff,(seed >> 8) &0xFF,(seed) & 0xFF] | ||||||
|     index = 0 |     index = 0 | ||||||
|     for counter in range (0,nbRoll): |     for counter in range (0,nbRoll): | ||||||
|         pid[index] = pid[index] ^ ord(dsn[counter]) |         pid[index] = pid[index] ^ dsn[counter] | ||||||
|         index = (index+1) %8 |         index = (index+1) %8 | ||||||
|     for counter in range (0,8): |     for counter in range (0,8): | ||||||
|         index = ((((pid[counter] >>5) & 3) ^ pid[counter]) & 0x1f) + (pid[counter] >> 7) |         index = ((((pid[counter] >>5) & 3) ^ pid[counter]) & 0x1f) + (pid[counter] >> 7) | ||||||
|  | @ -205,7 +206,7 @@ def getK4Pids(rec209, token, kindleDatabase): | ||||||
| 
 | 
 | ||||||
|     try: |     try: | ||||||
|         # Get the kindle account token, if present |         # Get the kindle account token, if present | ||||||
|         kindleAccountToken = bytearray.fromhex((kindleDatabase[1])[b'kindle.account.tokens']).decode() |         kindleAccountToken = bytearray.fromhex((kindleDatabase[1])['kindle.account.tokens']) | ||||||
| 
 | 
 | ||||||
|     except KeyError: |     except KeyError: | ||||||
|         kindleAccountToken="" |         kindleAccountToken="" | ||||||
|  | @ -213,30 +214,30 @@ def getK4Pids(rec209, token, kindleDatabase): | ||||||
| 
 | 
 | ||||||
|     try: |     try: | ||||||
|         # Get the DSN token, if present |         # Get the DSN token, if present | ||||||
|         DSN = bytearray.fromhex((kindleDatabase[1])['DSN']).decode() |         DSN = bytearray.fromhex((kindleDatabase[1])['DSN']) | ||||||
|         print("Got DSN key from database {0}".format(kindleDatabase[0])) |         print("Got DSN key from database {0}".format(kindleDatabase[0])) | ||||||
|     except KeyError: |     except KeyError: | ||||||
|         # See if we have the info to generate the DSN |         # See if we have the info to generate the DSN | ||||||
|         try: |         try: | ||||||
|             # Get the Mazama Random number |             # Get the Mazama Random number | ||||||
|             MazamaRandomNumber = bytearray.fromhex((kindleDatabase[1])[b'MazamaRandomNumber']).decode() |             MazamaRandomNumber = bytearray.fromhex((kindleDatabase[1])['MazamaRandomNumber']) | ||||||
|             #print "Got MazamaRandomNumber from database {0}".format(kindleDatabase[0]) |             #print "Got MazamaRandomNumber from database {0}".format(kindleDatabase[0]) | ||||||
| 
 | 
 | ||||||
|             try: |             try: | ||||||
|                 # Get the SerialNumber token, if present |                 # Get the SerialNumber token, if present | ||||||
|                 IDString = bytearray.fromhex((kindleDatabase[1])[b'SerialNumber']).decode() |                 IDString = bytearray.fromhex((kindleDatabase[1])['SerialNumber']) | ||||||
|                 print("Got SerialNumber from database {0}".format(kindleDatabase[0])) |                 print("Got SerialNumber from database {0}".format(kindleDatabase[0])) | ||||||
|             except KeyError: |             except KeyError: | ||||||
|                  # Get the IDString we added |                  # Get the IDString we added | ||||||
|                 IDString = bytearray.fromhex((kindleDatabase[1])[b'IDString']).decode() |                 IDString = bytearray.fromhex((kindleDatabase[1])['IDString']) | ||||||
| 
 | 
 | ||||||
|             try: |             try: | ||||||
|                 # Get the UsernameHash token, if present |                 # Get the UsernameHash token, if present | ||||||
|                 encodedUsername = bytearray.fromhex((kindleDatabase[1])[b'UsernameHash']).decode() |                 encodedUsername = bytearray.fromhex((kindleDatabase[1])['UsernameHash']) | ||||||
|                 print("Got UsernameHash from database {0}".format(kindleDatabase[0])) |                 print("Got UsernameHash from database {0}".format(kindleDatabase[0])) | ||||||
|             except KeyError: |             except KeyError: | ||||||
|                 # Get the UserName we added |                 # Get the UserName we added | ||||||
|                 UserName = bytearray.fromhex((kindleDatabase[1])[b'UserName']).decode() |                 UserName = bytearray.fromhex((kindleDatabase[1])['UserName']) | ||||||
|                 # encode it |                 # encode it | ||||||
|                 encodedUsername = encodeHash(UserName,charMap1) |                 encodedUsername = encodeHash(UserName,charMap1) | ||||||
|                 #print "encodedUsername",encodedUsername.encode('hex') |                 #print "encodedUsername",encodedUsername.encode('hex') | ||||||
|  | @ -266,19 +267,19 @@ def getK4Pids(rec209, token, kindleDatabase): | ||||||
|     # Compute book PIDs |     # Compute book PIDs | ||||||
| 
 | 
 | ||||||
|     # book pid |     # book pid | ||||||
|     pidHash = SHA1(DSN.encode()+kindleAccountToken.encode()+rec209+token) |     pidHash = SHA1(DSN+kindleAccountToken+rec209+token) | ||||||
|     bookPID = encodePID(pidHash) |     bookPID = encodePID(pidHash) | ||||||
|     bookPID = checksumPid(bookPID) |     bookPID = checksumPid(bookPID) | ||||||
|     pids.append(bookPID) |     pids.append(bookPID) | ||||||
| 
 | 
 | ||||||
|     # variant 1 |     # variant 1 | ||||||
|     pidHash = SHA1(kindleAccountToken.encode()+rec209+token) |     pidHash = SHA1(kindleAccountToken+rec209+token) | ||||||
|     bookPID = encodePID(pidHash) |     bookPID = encodePID(pidHash) | ||||||
|     bookPID = checksumPid(bookPID) |     bookPID = checksumPid(bookPID) | ||||||
|     pids.append(bookPID) |     pids.append(bookPID) | ||||||
| 
 | 
 | ||||||
|     # variant 2 |     # variant 2 | ||||||
|     pidHash = SHA1(DSN.encode()+rec209+token) |     pidHash = SHA1(DSN+rec209+token) | ||||||
|     bookPID = encodePID(pidHash) |     bookPID = encodePID(pidHash) | ||||||
|     bookPID = checksumPid(bookPID) |     bookPID = checksumPid(bookPID) | ||||||
|     pids.append(bookPID) |     pids.append(bookPID) | ||||||
|  |  | ||||||
|  | @ -7,7 +7,7 @@ | ||||||
| 
 | 
 | ||||||
| from __future__ import print_function | from __future__ import print_function | ||||||
| __license__ = 'GPL v3' | __license__ = 'GPL v3' | ||||||
| __version__ = "1.00" | __version__ = "1.0" | ||||||
| 
 | 
 | ||||||
| # This is a python script. You need a Python interpreter to run it. | # This is a python script. You need a Python interpreter to run it. | ||||||
| # For example, ActiveState Python, which exists for windows. | # For example, ActiveState Python, which exists for windows. | ||||||
|  | @ -73,7 +73,7 @@ __version__ = "1.00" | ||||||
| #  0.40 - moved unicode_argv call inside main for Windows DeDRM compatibility | #  0.40 - moved unicode_argv call inside main for Windows DeDRM compatibility | ||||||
| #  0.41 - Fixed potential unicode problem in command line calls | #  0.41 - Fixed potential unicode problem in command line calls | ||||||
| #  0.42 - Added GPL v3 licence. updated/removed some print statements | #  0.42 - Added GPL v3 licence. updated/removed some print statements | ||||||
| #  1.00 - Python 3 compatibility for calibre 5.0 | #  1.0  - Python 3 compatibility for calibre 5.0 | ||||||
| 
 | 
 | ||||||
| import sys | import sys | ||||||
| import os | import os | ||||||
|  | @ -330,7 +330,7 @@ class MobiBook: | ||||||
|         } |         } | ||||||
|         title = '' |         title = '' | ||||||
|         codec = 'windows-1252' |         codec = 'windows-1252' | ||||||
|         if self.magic == 'BOOKMOBI': |         if self.magic == b'BOOKMOBI': | ||||||
|             if 503 in self.meta_array: |             if 503 in self.meta_array: | ||||||
|                 title = self.meta_array[503] |                 title = self.meta_array[503] | ||||||
|             else: |             else: | ||||||
|  |  | ||||||
|  | @ -15,36 +15,36 @@ debug = False | ||||||
| 
 | 
 | ||||||
class DocParser(object):
    def __init__(self, flatxml, fontsize, ph, pw):
        """Store the flattened document and the numeric layout parameters.

        Args:
            flatxml: bytes of the flattened document; it is split on
                ``b'\\n'`` and kept as a list of byte-string lines.
            fontsize: value convertible with ``int()``.
            ph: value convertible with ``int()``, stored as a float
                (presumably the page height -- confirm against the caller).
            pw: value convertible with ``int()``, stored as a float
                (presumably the page width -- confirm against the caller).
        """
        lines = flatxml.split(b'\n')
        self.flatdoc = lines
        self.fontsize = int(fontsize)
        # Truncate to an integer first, then keep the value as a float
        self.ph = float(int(ph))
        self.pw = float(int(pw))
| 
 | 
 | ||||||
|     stags = { |     stags = { | ||||||
|         'paragraph' : 'p', |         b'paragraph' : 'p', | ||||||
|         'graphic'   : '.graphic' |         b'graphic'   : '.graphic' | ||||||
|     } |     } | ||||||
| 
 | 
 | ||||||
|     attr_val_map = { |     attr_val_map = { | ||||||
|         'hang'            : 'text-indent: ', |         b'hang'            : 'text-indent: ', | ||||||
|         'indent'          : 'text-indent: ', |         b'indent'          : 'text-indent: ', | ||||||
|         'line-space'      : 'line-height: ', |         b'line-space'      : 'line-height: ', | ||||||
|         'margin-bottom'   : 'margin-bottom: ', |         b'margin-bottom'   : 'margin-bottom: ', | ||||||
|         'margin-left'     : 'margin-left: ', |         b'margin-left'     : 'margin-left: ', | ||||||
|         'margin-right'    : 'margin-right: ', |         b'margin-right'    : 'margin-right: ', | ||||||
|         'margin-top'      : 'margin-top: ', |         b'margin-top'      : 'margin-top: ', | ||||||
|         'space-after'     : 'padding-bottom: ', |         b'space-after'     : 'padding-bottom: ', | ||||||
|     } |     } | ||||||
| 
 | 
 | ||||||
|     attr_str_map = { |     attr_str_map = { | ||||||
|         'align-center' : 'text-align: center; margin-left: auto; margin-right: auto;', |         b'align-center' : 'text-align: center; margin-left: auto; margin-right: auto;', | ||||||
|         'align-left'   : 'text-align: left;', |         b'align-left'   : 'text-align: left;', | ||||||
|         'align-right'  : 'text-align: right;', |         b'align-right'  : 'text-align: right;', | ||||||
|         'align-justify' : 'text-align: justify;', |         b'align-justify' : 'text-align: justify;', | ||||||
|         'display-inline' : 'display: inline;', |         b'display-inline' : 'display: inline;', | ||||||
|         'pos-left' : 'text-align: left;', |         b'pos-left' : 'text-align: left;', | ||||||
|         'pos-right' : 'text-align: right;', |         b'pos-right' : 'text-align: right;', | ||||||
|         'pos-center' : 'text-align: center; margin-left: auto; margin-right: auto;', |         b'pos-center' : 'text-align: center; margin-left: auto; margin-right: auto;', | ||||||
|     } |     } | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | @ -60,11 +60,13 @@ class DocParser(object): | ||||||
|         foundat = -1 |         foundat = -1 | ||||||
|         for j in range(pos, end): |         for j in range(pos, end): | ||||||
|             item = docList[j] |             item = docList[j] | ||||||
|             if item.find('=') >= 0: |             if item.find(b'=') >= 0: | ||||||
|                 (name, argres) = item.split('=',1) |                 (name, argres) = item.split(b'=',1) | ||||||
|             else : |             else : | ||||||
|                 name = item |                 name = item | ||||||
|                 argres = '' |                 argres = b'' | ||||||
|  |             if (isinstance(tagpath,str)): | ||||||
|  |                 tagpath = tagpath.encode('utf-8') | ||||||
|             if name.endswith(tagpath) : |             if name.endswith(tagpath) : | ||||||
|                 result = argres |                 result = argres | ||||||
|                 foundat = j |                 foundat = j | ||||||
|  | @ -76,7 +78,7 @@ class DocParser(object): | ||||||
|     def posinDoc(self, tagpath): |     def posinDoc(self, tagpath): | ||||||
|         startpos = [] |         startpos = [] | ||||||
|         pos = 0 |         pos = 0 | ||||||
|         res = "" |         res = b"" | ||||||
|         while res != None : |         while res != None : | ||||||
|             (foundpos, res) = self.findinDoc(tagpath, pos, -1) |             (foundpos, res) = self.findinDoc(tagpath, pos, -1) | ||||||
|             if res != None : |             if res != None : | ||||||
|  | @ -87,11 +89,11 @@ class DocParser(object): | ||||||
|     # returns a vector of integers for the tagpath |     # returns a vector of integers for the tagpath | ||||||
|     def getData(self, tagpath, pos, end, clean=False): |     def getData(self, tagpath, pos, end, clean=False): | ||||||
|         if clean: |         if clean: | ||||||
|             digits_only = re.compile(r'''([0-9]+)''') |             digits_only = re.compile(rb'''([0-9]+)''') | ||||||
|         argres=[] |         argres=[] | ||||||
|         (foundat, argt) = self.findinDoc(tagpath, pos, end) |         (foundat, argt) = self.findinDoc(tagpath, pos, end) | ||||||
|         if (argt != None) and (len(argt) > 0) : |         if (argt != None) and (len(argt) > 0) : | ||||||
|             argList = argt.split('|') |             argList = argt.split(b'|') | ||||||
|             for strval in argList: |             for strval in argList: | ||||||
|                 if clean: |                 if clean: | ||||||
|                     m = re.search(digits_only, strval) |                     m = re.search(digits_only, strval) | ||||||
|  | @ -109,7 +111,7 @@ class DocParser(object): | ||||||
|         csspage += '.cl-justify { text-align: justify; }\n' |         csspage += '.cl-justify { text-align: justify; }\n' | ||||||
| 
 | 
 | ||||||
|         # generate a list of each <style> starting point in the stylesheet |         # generate a list of each <style> starting point in the stylesheet | ||||||
|         styleList= self.posinDoc('book.stylesheet.style') |         styleList= self.posinDoc(b'book.stylesheet.style') | ||||||
|         stylecnt = len(styleList) |         stylecnt = len(styleList) | ||||||
|         styleList.append(-1) |         styleList.append(-1) | ||||||
| 
 | 
 | ||||||
|  | @ -121,30 +123,30 @@ class DocParser(object): | ||||||
|             start = styleList[j] |             start = styleList[j] | ||||||
|             end = styleList[j+1] |             end = styleList[j+1] | ||||||
| 
 | 
 | ||||||
|             (pos, tag) = self.findinDoc('style._tag',start,end) |             (pos, tag) = self.findinDoc(b'style._tag',start,end) | ||||||
|             if tag == None : |             if tag == None : | ||||||
|                 (pos, tag) = self.findinDoc('style.type',start,end) |                 (pos, tag) = self.findinDoc(b'style.type',start,end) | ||||||
| 
 | 
 | ||||||
|             # Is this something we know how to convert to css |             # Is this something we know how to convert to css | ||||||
|             if tag in self.stags : |             if tag in self.stags : | ||||||
| 
 | 
 | ||||||
|                 # get the style class |                 # get the style class | ||||||
|                 (pos, sclass) = self.findinDoc('style.class',start,end) |                 (pos, sclass) = self.findinDoc(b'style.class',start,end) | ||||||
|                 if sclass != None: |                 if sclass != None: | ||||||
|                     sclass = sclass.replace(' ','-') |                     sclass = sclass.replace(b' ',b'-') | ||||||
|                     sclass = '.cl-' + sclass.lower() |                     sclass = b'.cl-' + sclass.lower() | ||||||
|                 else : |                 else : | ||||||
|                     sclass = '' |                     sclass = b'' | ||||||
| 
 | 
 | ||||||
|                 if debug: print('sclass', sclass) |                 if debug: print('sclass', sclass) | ||||||
| 
 | 
 | ||||||
|                 # check for any "after class" specifiers |                 # check for any "after class" specifiers | ||||||
|                 (pos, aftclass) = self.findinDoc('style._after_class',start,end) |                 (pos, aftclass) = self.findinDoc(b'style._after_class',start,end) | ||||||
|                 if aftclass != None: |                 if aftclass != None: | ||||||
|                     aftclass = aftclass.replace(' ','-') |                     aftclass = aftclass.replace(b' ',b'-') | ||||||
|                     aftclass = '.cl-' + aftclass.lower() |                     aftclass = b'.cl-' + aftclass.lower() | ||||||
|                 else : |                 else : | ||||||
|                     aftclass = '' |                     aftclass = b'' | ||||||
| 
 | 
 | ||||||
|                 if debug: print('aftclass', aftclass) |                 if debug: print('aftclass', aftclass) | ||||||
| 
 | 
 | ||||||
|  | @ -152,34 +154,37 @@ class DocParser(object): | ||||||
| 
 | 
 | ||||||
|                 while True : |                 while True : | ||||||
| 
 | 
 | ||||||
|                     (pos1, attr) = self.findinDoc('style.rule.attr', start, end) |                     (pos1, attr) = self.findinDoc(b'style.rule.attr', start, end) | ||||||
|                     (pos2, val) = self.findinDoc('style.rule.value', start, end) |                     (pos2, val) = self.findinDoc(b'style.rule.value', start, end) | ||||||
| 
 | 
 | ||||||
|                     if debug: print('attr', attr) |                     if debug: print('attr', attr) | ||||||
|                     if debug: print('val', val) |                     if debug: print('val', val) | ||||||
| 
 | 
 | ||||||
|                     if attr == None : break |                     if attr == None : break | ||||||
| 
 | 
 | ||||||
|                     if (attr == 'display') or (attr == 'pos') or (attr == 'align'): |                     if (attr == b'display') or (attr == b'pos') or (attr == b'align'): | ||||||
|                         # handle text based attributess |                         # handle text based attributess | ||||||
|                         attr = attr + '-' + val |                         attr = attr + b'-' + val | ||||||
|                         if attr in self.attr_str_map : |                         if attr in self.attr_str_map : | ||||||
|                             cssargs[attr] = (self.attr_str_map[attr], '') |                             cssargs[attr] = (self.attr_str_map[attr], b'') | ||||||
|                     else : |                     else : | ||||||
|                         # handle value based attributes |                         # handle value based attributes | ||||||
|                         if attr in self.attr_val_map : |                         if attr in self.attr_val_map : | ||||||
|                             name = self.attr_val_map[attr] |                             name = self.attr_val_map[attr] | ||||||
|                             if attr in ('margin-bottom', 'margin-top', 'space-after') : |                             if attr in (b'margin-bottom', b'margin-top', b'space-after') : | ||||||
|                                 scale = self.ph |                                 scale = self.ph | ||||||
|                             elif attr in ('margin-right', 'indent', 'margin-left', 'hang') : |                             elif attr in (b'margin-right', b'indent', b'margin-left', b'hang') : | ||||||
|                                 scale = self.pw |                                 scale = self.pw | ||||||
|                             elif attr == 'line-space': |                             elif attr == b'line-space': | ||||||
|                                 scale = self.fontsize * 2.0 |                                 scale = self.fontsize * 2.0 | ||||||
|  |                             else: | ||||||
|  |                                 print("Scale not defined!") | ||||||
|  |                                 scale = 1.0 | ||||||
| 
 | 
 | ||||||
|                             if val == "": |                             if val == "": | ||||||
|                                 val = 0 |                                 val = 0 | ||||||
| 
 | 
 | ||||||
|                             if not ((attr == 'hang') and (int(val) == 0)): |                             if not ((attr == b'hang') and (int(val) == 0)): | ||||||
|                                 try: |                                 try: | ||||||
|                                     f = float(val) |                                     f = float(val) | ||||||
|                                 except: |                                 except: | ||||||
|  | @ -198,32 +203,32 @@ class DocParser(object): | ||||||
|                     if debug: print('keeping style') |                     if debug: print('keeping style') | ||||||
|                     # make sure line-space does not go below 100% or above 300% since |                     # make sure line-space does not go below 100% or above 300% since | ||||||
|                     # it can be wacky in some styles |                     # it can be wacky in some styles | ||||||
|                     if 'line-space' in cssargs: |                     if b'line-space' in cssargs: | ||||||
|                         seg = cssargs['line-space'][0] |                         seg = cssargs[b'line-space'][0] | ||||||
|                         val = cssargs['line-space'][1] |                         val = cssargs[b'line-space'][1] | ||||||
|                         if val < 1.0: val = 1.0 |                         if val < 1.0: val = 1.0 | ||||||
|                         if val > 3.0: val = 3.0 |                         if val > 3.0: val = 3.0 | ||||||
|                         del cssargs['line-space'] |                         del cssargs[b'line-space'] | ||||||
|                         cssargs['line-space'] = (self.attr_val_map['line-space'], val) |                         cssargs[b'line-space'] = (self.attr_val_map[b'line-space'], val) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|                     # handle modifications for css style hanging indents |                     # handle modifications for css style hanging indents | ||||||
|                     if 'hang' in cssargs: |                     if b'hang' in cssargs: | ||||||
|                         hseg = cssargs['hang'][0] |                         hseg = cssargs[b'hang'][0] | ||||||
|                         hval = cssargs['hang'][1] |                         hval = cssargs[b'hang'][1] | ||||||
|                         del cssargs['hang'] |                         del cssargs[b'hang'] | ||||||
|                         cssargs['hang'] = (self.attr_val_map['hang'], -hval) |                         cssargs[b'hang'] = (self.attr_val_map[b'hang'], -hval) | ||||||
|                         mval = 0 |                         mval = 0 | ||||||
|                         mseg = 'margin-left: ' |                         mseg = 'margin-left: ' | ||||||
|                         mval = hval |                         mval = hval | ||||||
|                         if 'margin-left' in cssargs: |                         if b'margin-left' in cssargs: | ||||||
|                             mseg = cssargs['margin-left'][0] |                             mseg = cssargs[b'margin-left'][0] | ||||||
|                             mval = cssargs['margin-left'][1] |                             mval = cssargs[b'margin-left'][1] | ||||||
|                             if mval < 0: mval = 0 |                             if mval < 0: mval = 0 | ||||||
|                             mval = hval + mval |                             mval = hval + mval | ||||||
|                         cssargs['margin-left'] = (mseg, mval) |                         cssargs[b'margin-left'] = (mseg, mval) | ||||||
|                         if 'indent' in cssargs: |                         if b'indent' in cssargs: | ||||||
|                             del cssargs['indent'] |                             del cssargs[b'indent'] | ||||||
| 
 | 
 | ||||||
|                     cssline = sclass + ' { ' |                     cssline = sclass + ' { ' | ||||||
|                     for key in iter(cssargs): |                     for key in iter(cssargs): | ||||||
|  |  | ||||||
|  | @ -173,7 +173,7 @@ def decryptRecord(data,PID): | ||||||
| def decryptDkeyRecord(data,PID): | def decryptDkeyRecord(data,PID): | ||||||
|     record = decryptRecord(data,PID) |     record = decryptRecord(data,PID) | ||||||
|     fields = unpack('3sB8sB8s3s',record) |     fields = unpack('3sB8sB8s3s',record) | ||||||
|     if fields[0] != 'PID' or fields[5] != 'pid' : |     if fields[0] != b'PID' or fields[5] != b'pid' : | ||||||
|         raise DrmException("Didn't find PID magic numbers in record") |         raise DrmException("Didn't find PID magic numbers in record") | ||||||
|     elif fields[1] != 8 or fields[3] != 8 : |     elif fields[1] != 8 or fields[3] != 8 : | ||||||
|         raise DrmException("Record didn't contain correct length fields") |         raise DrmException("Record didn't contain correct length fields") | ||||||
|  | @ -183,11 +183,11 @@ def decryptDkeyRecord(data,PID): | ||||||
| 
 | 
 | ||||||
| # Decrypt all dkey records (contain the book PID) | # Decrypt all dkey records (contain the book PID) | ||||||
| def decryptDkeyRecords(data,PID): | def decryptDkeyRecords(data,PID): | ||||||
|     nbKeyRecords = ord(data[0]) |     nbKeyRecords = data[0] | ||||||
|     records = [] |     records = [] | ||||||
|     data = data[1:] |     data = data[1:] | ||||||
|     for i in range (0,nbKeyRecords): |     for i in range (0,nbKeyRecords): | ||||||
|         length = ord(data[0]) |         length = data[0] | ||||||
|         try: |         try: | ||||||
|             key = decryptDkeyRecord(data[1:length+1],PID) |             key = decryptDkeyRecord(data[1:length+1],PID) | ||||||
|             records.append(key) |             records.append(key) | ||||||
|  | @ -209,7 +209,7 @@ class TopazBook: | ||||||
|         self.bookMetadata = {} |         self.bookMetadata = {} | ||||||
|         self.bookKey = None |         self.bookKey = None | ||||||
|         magic = unpack('4s',self.fo.read(4))[0] |         magic = unpack('4s',self.fo.read(4))[0] | ||||||
|         if magic != 'TPZ0': |         if magic != b'TPZ0': | ||||||
|             raise DrmException("Parse Error : Invalid Header, not a Topaz file") |             raise DrmException("Parse Error : Invalid Header, not a Topaz file") | ||||||
|         self.parseTopazHeaders() |         self.parseTopazHeaders() | ||||||
|         self.parseMetadata() |         self.parseMetadata() | ||||||
|  | @ -244,9 +244,9 @@ class TopazBook: | ||||||
| 
 | 
 | ||||||
|     def parseMetadata(self): |     def parseMetadata(self): | ||||||
|         # Parse the metadata record from the book payload and return a list of [key,values] |         # Parse the metadata record from the book payload and return a list of [key,values] | ||||||
|         self.fo.seek(self.bookPayloadOffset + self.bookHeaderRecords['metadata'][0][0]) |         self.fo.seek(self.bookPayloadOffset + self.bookHeaderRecords[b'metadata'][0][0]) | ||||||
|         tag = bookReadString(self.fo) |         tag = bookReadString(self.fo) | ||||||
|         if tag != 'metadata' : |         if tag != b'metadata' : | ||||||
|             raise DrmException("Parse Error : Record Names Don't Match") |             raise DrmException("Parse Error : Record Names Don't Match") | ||||||
|         flags = ord(self.fo.read(1)) |         flags = ord(self.fo.read(1)) | ||||||
|         nbRecords = ord(self.fo.read(1)) |         nbRecords = ord(self.fo.read(1)) | ||||||
|  | @ -260,18 +260,18 @@ class TopazBook: | ||||||
|         return self.bookMetadata |         return self.bookMetadata | ||||||
| 
 | 
 | ||||||
def getPIDMetaInfo(self):
    """Return the ``b'keys'`` metadata entry and the values it names.

    The ``b'keys'`` entry of ``self.bookMetadata`` is read as a
    comma-separated list of metadata keys. The values stored under those
    keys are concatenated in list order; a key with no entry contributes
    nothing.

    Returns:
        A ``(keysRecord, keysRecordRecord)`` tuple of byte strings. Both are
        ``b''`` when the metadata has no ``b'keys'`` entry or it is empty.
    """
    metadata = self.bookMetadata
    keysRecord = metadata.get(b'keys', b'')
    if keysRecord == b'':
        # No key list recorded, so there is nothing to concatenate
        return keysRecord, b''
    # Join once instead of growing an immutable bytes object key by key
    keysRecordRecord = b''.join(
        metadata.get(keyval, b'') for keyval in keysRecord.split(b',')
    )
    return keysRecord, keysRecordRecord
| 
 | 
 | ||||||
|     def getBookTitle(self): |     def getBookTitle(self): | ||||||
|         title = '' |         title = b'' | ||||||
|         if 'Title' in self.bookMetadata: |         if b'Title' in self.bookMetadata: | ||||||
|             title = self.bookMetadata['Title'] |             title = self.bookMetadata[b'Title'] | ||||||
|         return title.decode('utf-8') |         return title.decode('utf-8') | ||||||
| 
 | 
 | ||||||
|     def setBookKey(self, key): |     def setBookKey(self, key): | ||||||
|  | @ -323,7 +323,7 @@ class TopazBook: | ||||||
|         raw = 0 |         raw = 0 | ||||||
|         fixedimage=True |         fixedimage=True | ||||||
|         try: |         try: | ||||||
|             keydata = self.getBookPayloadRecord('dkey', 0) |             keydata = self.getBookPayloadRecord(b'dkey', 0) | ||||||
|         except DrmException as e: |         except DrmException as e: | ||||||
|             print("no dkey record found, book may not be encrypted") |             print("no dkey record found, book may not be encrypted") | ||||||
|             print("attempting to extrct files without a book key") |             print("attempting to extrct files without a book key") | ||||||
|  | @ -354,7 +354,7 @@ class TopazBook: | ||||||
|                 pass |                 pass | ||||||
|             else: |             else: | ||||||
|                 bookKey = bookKeys[0] |                 bookKey = bookKeys[0] | ||||||
|                 print("Book Key Found! ({0})".format(bookKey.encode('hex'))) |                 print("Book Key Found! ({0})".format(bookKey.hex())) | ||||||
|                 break |                 break | ||||||
| 
 | 
 | ||||||
|         if not bookKey: |         if not bookKey: | ||||||
|  | @ -396,26 +396,26 @@ class TopazBook: | ||||||
|         outdir = self.outdir |         outdir = self.outdir | ||||||
|         for headerRecord in self.bookHeaderRecords: |         for headerRecord in self.bookHeaderRecords: | ||||||
|             name = headerRecord |             name = headerRecord | ||||||
|             if name != 'dkey': |             if name != b'dkey': | ||||||
|                 ext = ".dat" |                 ext = ".dat" | ||||||
|                 if name == 'img': ext = ".jpg" |                 if name == b'img': ext = ".jpg" | ||||||
|                 if name == 'color' : ext = ".jpg" |                 if name == b'color' : ext = ".jpg" | ||||||
|                 print("Processing Section: {0}\n. . .".format(name), end=' ') |                 print("Processing Section: {0}\n. . .".format(name.decode('utf-8')), end=' ') | ||||||
|                 for index in range (0,len(self.bookHeaderRecords[name])) : |                 for index in range (0,len(self.bookHeaderRecords[name])) : | ||||||
|                     fname = "{0}{1:04d}{2}".format(name,index,ext) |                     fname = "{0}{1:04d}{2}".format(name.decode('utf-8'),index,ext) | ||||||
|                     destdir = outdir |                     destdir = outdir | ||||||
|                     if name == 'img': |                     if name == b'img': | ||||||
|                         destdir =  os.path.join(outdir,"img") |                         destdir =  os.path.join(outdir,"img") | ||||||
|                     if name == 'color': |                     if name == b'color': | ||||||
|                         destdir =  os.path.join(outdir,"color_img") |                         destdir =  os.path.join(outdir,"color_img") | ||||||
|                     if name == 'page': |                     if name == b'page': | ||||||
|                         destdir =  os.path.join(outdir,"page") |                         destdir =  os.path.join(outdir,"page") | ||||||
|                     if name == 'glyphs': |                     if name == b'glyphs': | ||||||
|                         destdir =  os.path.join(outdir,"glyphs") |                         destdir =  os.path.join(outdir,"glyphs") | ||||||
|                     outputFile = os.path.join(destdir,fname) |                     outputFile = os.path.join(destdir,fname) | ||||||
|                     print(".", end=' ') |                     print(".", end=' ') | ||||||
|                     record = self.getBookPayloadRecord(name,index) |                     record = self.getBookPayloadRecord(name,index) | ||||||
|                     if record != '': |                     if record != b'': | ||||||
|                         open(outputFile, 'wb').write(record) |                         open(outputFile, 'wb').write(record) | ||||||
|                 print(" ") |                 print(" ") | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
		Loading…
	
	Add table
		
		Reference in a new issue
	
	 Apprentice Harper
						Apprentice Harper