The attached patches fix two issues with a file created by "Microsoft
Office Word 2007".
If an xref section started with "0 0\ntrailer", pdfsizeopt would raise an
exception around line 3096 instead of continuing to the real xref.
$ python pdfsizeopt.py ColumnSelections.pdf
info: loading PDF from: ColumnSelections.pdf
info: loaded PDF of 1058676 bytes
warning: problem with xref table, finding objs anyway: xref subsection
syntax error at 1058495
Traceback (most recent call last):
File "pdfsizeopt.py", line 6158, in <module>
main(sys.argv)
File "pdfsizeopt.py", line 6134, in main
).Load(file_name)
File "pdfsizeopt.py", line 2997, in Load
data, do_ignore_generation_numbers=self.do_ignore_generation_numbers)
File "pdfsizeopt.py", line 3199, in ParseWithoutXref
assert prev_obj_num not in obj_starts, 'duplicate obj %d' % prev_obj_num
TypeError: int argument required
@@ -3092,6 +3092,12 @@
# For testing whitespace before trailer: enc.pdf
match = re.match(
r'(\d+)\s+([1-9]\d*)\s+|[\0\t\n\r\f ]*(xref|trailer)\s',
xref_head)
+ old_object = False
+ if not match:
+ # skip over old objects
+ old_object = re.match(
+ r'(\d+)\s+(0)\s+|[\0\t\n\r\f ]*(xref|trailer)\s', xref_head)
+ match = old_object
if not match:
raise PdfXrefError('xref subsection syntax error at %d' % xref_ofs)
if match.group(3) is not None:
@@ -3122,7 +3128,7 @@
obj_ofs = 0
else:
raise PdfXrefError('duplicate obj %s' % obj_num)
- if obj_ofs != 0:
+ if (obj_ofs != 0) and (not old_object):
# for testing: obj 10 in pdfsizeopt_charts.pdf has offset 0:
# "0000000000 00000 n \n"
if obj_ofs in obj_starts_rev:
pngout returns 2 instead of 0 if it cannot compress the file further (this
shows up as status=0x200 in the log below). This is just a warning, and
pdfsizeopt can still continue.
$ python pdfsizeopt.py ColumnSelections.pdf
...
info: executing image optimizer jbig2: jbig2 -p pso.conv-16.sam2p-pr.png
>pso.conv-16.jbig2
info: executing image optimizer pngout: pngout pso.conv-16.parse.png
pso.conv-16.pngout.png
In: 87 bytes pso.conv-16.parse.png /c3 /f0 /d1
Out: 91 bytes pso.conv-16.pngout.png /c3 /f0 /d1, 1 colors
Unable to compress further: copying original file
info: pngout failed, status=0x200
Traceback (most recent call last):
File "pdfsizeopt.py", line 6167, in <module>
main(sys.argv)
File "pdfsizeopt.py", line 6153, in main
pdf.OptimizeImages(use_pngout=use_pngout, use_jbig2=use_jbig2)
File "pdfsizeopt.py", line 5082, in OptimizeImages
cmd_name='pngout'))
File "pdfsizeopt.py", line 4551, in ConvertImage
assert 0, '%s failed (status)' % cmd_name
AssertionError: pngout failed (status)
@@ -4536,7 +4543,10 @@
print >>sys.stderr, ('info: executing image optimizer %s: %s' %
(cmd_name, cmd))
status = os.system(cmd)
- if status:
+ if status == 0x200:
+ # pngout returns 2 if it can not compress the file further, but this
is not an error
+ print >>sys.stderr, 'info: %s returned, status=0x%x, continuing' %
(cmd_name, status)
+ elif status:
print >>sys.stderr, 'info: %s failed, status=0x%x' % (cmd_name, status)
assert 0, '%s failed (status)' % cmd_name
assert os.path.exists(targetfn), (
William
williambader@hotmail.com
1.0 MB Download