My favorites | Sign in
Project Home Downloads Wiki Issues Source
READ-ONLY: This project has been archived. For more information see this post.
Search
for
  Advanced search   Search tips   Subscriptions

Issue 33 attachment: pdfsizeopt.pat (9.6 KB)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
--- pdfsizeopt/pdfsizeopt.py- 2010-03-24 21:06:15.000000000 +0000
+++ pdfsizeopt/pdfsizeopt.py 2010-03-30 01:55:38.000000000 +0100
@@ -248,7 +248,7 @@
"""Matches an `obj' definition, xref or startxref."""

NONNEGATIVE_INT_RE = re.compile(r'(-?\d+)')
- """Matches and captures a nonneagative integer."""
+ """Matches and captures a nonnegative integer."""

PDF_STARTXREF_EOF_RE = re.compile(
r'[>\0\t\n\r\f ]startxref\s+(\d+)(?:\s+%%EOF\s*)?\Z')
@@ -395,7 +395,7 @@
scanner = self.LENGTH_OF_STREAM_RE.scanner(head)
match = scanner.search()
if not match:
- # We happlily accept the invalid PDF obj
+ # We happily accept the invalid PDF obj
# `<</Foo[/Length 42]>>stream...endstream' above. This is OK, since
# we don't implement a validating PDF parser.
raise PdfTokenParseError(
@@ -1217,7 +1217,7 @@
hex <...>.
Returns:
The most compact PDF token sequence form of data: without superfluous
- whitespce; with '(' string literals. It may contain \n only in
+ whitespace; with '(' string literals. It may contain \n only in
string literals.
Raises:
PdfTokenParseError
@@ -1563,7 +1563,7 @@
not self.stream is not None or
self.Get('Subtype') != '/Form' or
self.Get('FormType', 1) != 1 or
- # !! get rid of these checks one we can decompress anything
+ # !! get rid of these checks once we can decompress anything
self.Get('Filter') not in (None, '/FlateDecode') or
self.Get('DecodeParms') is not None or
not str(self.Get('BBox')).startswith('[')): return None
@@ -1682,7 +1682,7 @@
start: Offset in data to start the parsing at.
end_ofs_out: None or a list for the first output byte
(which is unparsed) offset to be appended. Terminating whitespace is
- not included, except for a single withespace is only after
+ not included, except that a single whitespace is included only after
do_terminate_obj.
do_terminate_obj: Boolean indicating whether look for and include the
`stream' or `endobj' (or any other non-literal name)
@@ -1933,7 +1933,7 @@
if i < data_size:
i += 1 # Don't increase it further.
else:
- raise PdfTokenParseError('syntax error, expecing PDF token, got %r' %
+ raise PdfTokenParseError('syntax error, expecting PDF token, got %r' %
data[i])

assert i <= data_size
@@ -2211,7 +2211,7 @@
(((-operand - 108) >> 8) + 251, (-operand - 108) & 255))
assert 251 <= ord(output[-1][0]) <= 254
elif -32768 <= operand <= 32767:
- output.append(chr(28) + struct.pack('>H', operand))
+ output.append(chr(28) + struct.pack('>H', operand & 0xFFFF))
elif ~0x7fffffff <= operand <= 0x7fffffff:
output.append(chr(29) + struct.pack('>L', operand))
else:
@@ -2508,7 +2508,7 @@
assert 0, 'cannot convert to PDF color space'

def GetPdfImageData(self):
- """Return a dictinary useful as a PDF image."""
+ """Return a dictionary useful as a PDF image."""
assert self.CanBePdfImage() # asserts not interlaced
pdf_image_data = {
'Width': self.width,
@@ -2600,7 +2600,7 @@
pdf_obj.stream = pdf_image_data['.stream']

def CompressToZipPng(self):
- """Compress self.idat to self.compresson = 'zip-png'."""
+ """Compress self.idat to self.compression = 'zip-png'."""
assert self
if self.compression == 'zip-png':
# For testing: ./pdfsizeopt.py --use-jbig2=false --use-pngout=false pts2ep.pdf
@@ -2981,7 +2981,7 @@
self.file_size = len(data)
match = PdfObj.PDF_VERSION_HEADER_RE.match(data)
if not match:
- raise PdfTokenParseError('uncrecognized PDF signature %r' % data[0: 16])
+ raise PdfTokenParseError('unrecognized PDF signature %r' % data[0: 16])
self.version = match.group(1)
self.objs = {}
self.trailer = None
@@ -3092,6 +3092,12 @@
# For testing whitespace before trailer: enc.pdf
match = re.match(
r'(\d+)\s+([1-9]\d*)\s+|[\0\t\n\r\f ]*(xref|trailer)\s', xref_head)
+ old_object = False
+ if not match:
+ # skip over old objects
+ old_object = re.match(
+ r'(\d+)\s+(0)\s+|[\0\t\n\r\f ]*(xref|trailer)\s', xref_head)
+ match = old_object
if not match:
raise PdfXrefError('xref subsection syntax error at %d' % xref_ofs)
if match.group(3) is not None:
@@ -3122,7 +3128,7 @@
obj_ofs = 0
else:
raise PdfXrefError('duplicate obj %s' % obj_num)
- if obj_ofs != 0:
+ if (obj_ofs != 0) and (not old_object):
# for testing: obj 10 in pdfsizeopt_charts.pdf has offset 0:
# "0000000000 00000 n \n"
if obj_ofs in obj_starts_rev:
@@ -3311,7 +3317,7 @@
print >>sys.stderr, 'info: generated %s bytes (%s)' % (
GetOutputSize(), FormatPercent(GetOutputSize(), self.file_size))

- # TODO(pts): Don't keep enverything in memory.
+ # TODO(pts): Don't keep everything in memory.
f.write(''.join(output))
finally:
f.close()
@@ -3528,7 +3534,7 @@
% We want to make sure that:
%
% S1. All glyphs in /CharStrings are part of the /Encoding array. This is
- % needed for Ghostsccript 8.54, which would sometimes generate two (or
+ % needed for Ghostscript 8.54, which would sometimes generate two (or
% more?) PDF font objects if not all glyphs are encoded.
%
% S2. All non-/.notdef elements of the /Encoding array remain unchanged.
@@ -3861,6 +3867,7 @@
% <streamdict> <decompressed-file>
systemdict /FontDirectory get {pop undefinefont} forall
dup /MY exch LoadCff
+ dup type /dicttype eq { pop } if % if LoadCff leaves a dictionary, pop it
closefile % is this needed?
% <streamdict>
pop
@@ -4536,7 +4543,10 @@
print >>sys.stderr, ('info: executing image optimizer %s: %s' %
(cmd_name, cmd))
status = os.system(cmd)
- if status:
+ if status == 0x200:
+ # pngout exits with code 2 (os.system status 0x200 on Unix) if it cannot compress the file further; this is not an error
+ print >>sys.stderr, 'info: %s returned, status=0x%x, continuing' % (cmd_name, status)
+ elif status:
print >>sys.stderr, 'info: %s failed, status=0x%x' % (cmd_name, status)
assert 0, '%s failed (status)' % cmd_name
assert os.path.exists(targetfn), (
@@ -5018,7 +5028,7 @@
obj_images.append(self.ConvertImage(
sourcefn=rendered_image_file_name,
targetfn='pso.conv-%d.sam2p-np.pdf' % obj_num,
- # We specify -s here to explicitly exclue SF_Opaque for single-color
+ # We specify -s here to explicitly exclude SF_Opaque for single-color
# images.
# !! do we need /ImageMask parsing if we exclude SF_Mask here as well?
# Original sam2p order: Opaque:Transparent:Gray1:Indexed1:Mask:Gray2:Indexed2:Rgb1:Gray4:Indexed4:Rgb2:Gray8:Indexed8:Rgb4:Rgb8:Transparent2:Transparent4:Transparent8
@@ -5184,7 +5194,7 @@
A new dict mapping object numbers to PdfObj instances.
"""
# List of list of desc ([obj_num, head_minus, stream, refs_to,
- # inrefs_count]). Each list of eqclasses is an eqivalence class of
+ # inrefs_count]). Each list of eqclasses is an equivalence class of
# object descs.
eqclasses = []
# Maps object numbers to an element of eqclasses.
@@ -5213,8 +5223,14 @@
form = (head_minus, stream)
form_desc = by_form.get(form)
if form_desc is not None:
- form_desc.append(desc)
- eqclass_of[obj_num] = form_desc
+ if (desc[2] is None and
+ objs[obj_num].head.startswith('<<') and
+ objs[obj_num].Get('Type') == "/Page"):
+ eqclasses.append([desc])
+ eqclass_of[obj_num] = by_form[form] = eqclasses[-1]
+ else:
+ form_desc.append(desc)
+ eqclass_of[obj_num] = form_desc
else:
eqclasses.append([desc])
eqclass_of[obj_num] = by_form[form] = eqclasses[-1]
@@ -5415,10 +5431,10 @@

match = PdfObj.PDF_VERSION_HEADER_RE.match(data)
if not match:
- raise PdfTokenParseError('uncrecognized PDF signature %r' % data[0: 16])
+ raise PdfTokenParseError('unrecognized PDF signature %r' % data[0: 16])
version = match.group(1)

- # We set startxref ofs if avaialable. It is not an error not to have it
+ # We set startxref ofs if available. It is not an error not to have it
# (e.g. with a broken PDF with xref + trailer).
trailer_ofs = None
i = data.rfind('startxref')
@@ -5596,7 +5612,7 @@
raise PdfTokenParseError('invalid ref %r' % (ref_data,))
if int(match.group(2)) != 0:
raise PdfTokenParseError(
- 'invalid ref geneartion in %r' % (ref_data,))
+ 'invalid ref generation in %r' % (ref_data,))
set_obj.add(int(match.group(1)))

pdf = PdfData()
@@ -5952,7 +5968,10 @@
assert 0, 'Multivalent.jar not found, see above'
assert ':' not in multivalent_jar # $CLASSPATH separator

- multivalent_cmd = 'java -cp %s tool.pdf.Compress %s' % (
+ # -nocore14 removes embedded copies of the 14 core fonts
+ # -noalt strips alternate images
+ # -nopagepiece strips page piece data (stored by applications but not needed for viewing)
+ multivalent_cmd = 'java -cp %s tool.pdf.Compress -nocore14 -noalt -nopagepiece %s' % (
ShellQuoteFileName(multivalent_jar),
ShellQuoteFileName(in_pdf_tmp_file_name))
print >>sys.stderr, (
Powered by Google Project Hosting