My favorites | Sign in
Project Home Downloads Wiki Issues Source
READ-ONLY: This project has been archived. For more information see this post.
Search
for
  Advanced search   Search tips   Subscriptions

Issue 36 attachment: pdfsizeopt.pat (13.1 KB)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
--- pdfsizeopt/pdfsizeopt.py- 2010-03-24 21:06:15.000000000 +0000
+++ pdfsizeopt/pdfsizeopt.py 2010-05-09 03:52:06.000000000 +0100
@@ -248,7 +248,7 @@
"""Matches an `obj' definition, xref or startxref."""

NONNEGATIVE_INT_RE = re.compile(r'(-?\d+)')
- """Matches and captures a nonneagative integer."""
+ """Matches and captures a nonnegative integer."""

PDF_STARTXREF_EOF_RE = re.compile(
r'[>\0\t\n\r\f ]startxref\s+(\d+)(?:\s+%%EOF\s*)?\Z')
@@ -395,7 +395,7 @@
scanner = self.LENGTH_OF_STREAM_RE.scanner(head)
match = scanner.search()
if not match:
- # We happlily accept the invalid PDF obj
+ # We happily accept the invalid PDF obj
# `<</Foo[/Length 42]>>stream...endstream' above. This is OK, since
# we don't implement a validating PDF parser.
raise PdfTokenParseError(
@@ -1217,7 +1217,7 @@
hex <...>.
Returns:
The most compact PDF token sequence form of data: without superfluous
- whitespce; with '(' string literals. It may contain \n only in
+ whitespace; with '(' string literals. It may contain \n only in
string literals.
Raises:
PdfTokenParseError
@@ -1563,7 +1563,7 @@
not self.stream is not None or
self.Get('Subtype') != '/Form' or
self.Get('FormType', 1) != 1 or
- # !! get rid of these checks one we can decompress anything
+ # !! get rid of these checks once we can decompress anything
self.Get('Filter') not in (None, '/FlateDecode') or
self.Get('DecodeParms') is not None or
not str(self.Get('BBox')).startswith('[')): return None
@@ -1682,7 +1682,7 @@
start: Offset in data to start the parsing at.
end_ofs_out: None or a list for the first output byte
(which is unparsed) offset to be appended. Terminating whitespace is
- not included, except for a single withespace is only after
+ not included, except for a single whitespace, included only after
do_terminate_obj.
do_terminate_obj: Boolean indicating whether look for and include the
`stream' or `endobj' (or any other non-literal name)
@@ -1933,7 +1933,7 @@
if i < data_size:
i += 1 # Don't increase it further.
else:
- raise PdfTokenParseError('syntax error, expecing PDF token, got %r' %
+ raise PdfTokenParseError('syntax error, expecting PDF token, got %r' %
data[i])

assert i <= data_size
@@ -2211,7 +2211,7 @@
(((-operand - 108) >> 8) + 251, (-operand - 108) & 255))
assert 251 <= ord(output[-1][0]) <= 254
elif -32768 <= operand <= 32767:
- output.append(chr(28) + struct.pack('>H', operand))
+ output.append(chr(28) + struct.pack('>H', operand & 0xFFFF))
elif ~0x7fffffff <= operand <= 0x7fffffff:
output.append(chr(29) + struct.pack('>L', operand))
else:
@@ -2508,7 +2508,7 @@
assert 0, 'cannot convert to PDF color space'

def GetPdfImageData(self):
- """Return a dictinary useful as a PDF image."""
+ """Return a dictionary useful as a PDF image."""
assert self.CanBePdfImage() # asserts not interlaced
pdf_image_data = {
'Width': self.width,
@@ -2592,15 +2592,17 @@
else:
pdf_obj.Set('BitsPerComponent', pdf_image_data['BitsPerComponent'])
pdf_obj.Set('ColorSpace', pdf_image_data['ColorSpace'])
- pdf_obj.Set('Decode', pdf_image_data.get('Decode'))
+ if pdf_obj.Get('Decode') == None:
+ # Update Decode only if it is currently not set
+ pdf_obj.Set('Decode', pdf_image_data.get('Decode'))
pdf_obj.Set('Filter', pdf_image_data['Filter'])
pdf_obj.Set('DecodeParms', pdf_image_data.get('DecodeParms'))
pdf_obj.Set('Length', len(pdf_image_data['.stream']))
- # Don't pdf_obj.Set('Decode', ...): it is goot as is.
+ # Don't pdf_obj.Set('Decode', ...): it is good as is.
pdf_obj.stream = pdf_image_data['.stream']

def CompressToZipPng(self):
- """Compress self.idat to self.compresson = 'zip-png'."""
+ """Compress self.idat to self.compression = 'zip-png'."""
assert self
if self.compression == 'zip-png':
# For testing: ./pdfsizeopt.py --use-jbig2=false --use-pngout=false pts2ep.pdf
@@ -2981,7 +2983,7 @@
self.file_size = len(data)
match = PdfObj.PDF_VERSION_HEADER_RE.match(data)
if not match:
- raise PdfTokenParseError('uncrecognized PDF signature %r' % data[0: 16])
+ raise PdfTokenParseError('unrecognized PDF signature %r' % data[0: 16])
self.version = match.group(1)
self.objs = {}
self.trailer = None
@@ -3092,6 +3094,12 @@
# For testing whitespace before trailer: enc.pdf
match = re.match(
r'(\d+)\s+([1-9]\d*)\s+|[\0\t\n\r\f ]*(xref|trailer)\s', xref_head)
+ old_object = False
+ if not match:
+ # skip over old objects
+ old_object = re.match(
+ r'(\d+)\s+(0)\s+|[\0\t\n\r\f ]*(xref|trailer)\s', xref_head)
+ match = old_object
if not match:
raise PdfXrefError('xref subsection syntax error at %d' % xref_ofs)
if match.group(3) is not None:
@@ -3122,7 +3130,7 @@
obj_ofs = 0
else:
raise PdfXrefError('duplicate obj %s' % obj_num)
- if obj_ofs != 0:
+ if (obj_ofs != 0) and (not old_object):
# for testing: obj 10 in pdfsizeopt_charts.pdf has offset 0:
# "0000000000 00000 n \n"
if obj_ofs in obj_starts_rev:
@@ -3311,7 +3319,7 @@
print >>sys.stderr, 'info: generated %s bytes (%s)' % (
GetOutputSize(), FormatPercent(GetOutputSize(), self.file_size))

- # TODO(pts): Don't keep enverything in memory.
+ # TODO(pts): Don't keep everything in memory.
f.write(''.join(output))
finally:
f.close()
@@ -3417,6 +3425,16 @@
% don't redefine `stream'
} bind def

+% Avoid errors if any extra indirect references are included
+/R where {
+ pop
+}
+{
+ /R { % <objnumber> <gennumber> R <indirectobject> (but just <objnumber> for now)
+ pop
+ } bind def
+} ifelse
+
% Sort an array, from Ghostscript's prfont.ps.
/Sort { % <array> <lt-proc> Sort <array>
% Heapsort (algorithm 5.2.3H, Knuth vol. 2, p. 146),
@@ -3528,7 +3546,7 @@
% We want to make sure that:
%
% S1. All glyphs in /CharStrings are part of the /Encoding array. This is
- % needed for Ghostsccript 8.54, which would sometimes generate two (or
+ % needed for Ghostscript 8.54, which would sometimes generate two (or
% more?) PDF font objects if not all glyphs are encoded.
%
% S2. All non-/.notdef elements of the /Encoding array remain unchanged.
@@ -3861,6 +3879,7 @@
% <streamdict> <decompressed-file>
systemdict /FontDirectory get {pop undefinefont} forall
dup /MY exch LoadCff
+ dup type /dicttype eq { pop } if % if LoadCff leaves a dictionary, pop it
closefile % is this needed?
% <streamdict>
pop
@@ -4277,6 +4296,33 @@
assert obj.stream is None
parsed_font = parsed_fonts[obj_num]
parsed_font['FontName'] = obj.Get('FontName')
+ if parsed_font['FontType'] != 2:
+ print >>sys.stderr, 'info: font %s is not Type 2, can not merge.' % parsed_font['FontName']
+ continue
+ if 'CharStrings' not in parsed_font:
+ print >>sys.stderr, 'info: font %s does not have CharStrings, can not merge.' % parsed_font['FontName']
+ continue
+ if 'FontMatrix' not in parsed_font:
+ print >>sys.stderr, 'info: font %s has no FontMatrix, can not merge.' % parsed_font['FontName']
+ continue
+ if 'Private' not in parsed_font:
+ print >>sys.stderr, 'info: can font %s has no Private data, can not merge.' % parsed_font['FontName']
+ continue
+ if 'PaintType' not in parsed_font:
+ print >>sys.stderr, 'info: font %s has no PaintType, can not merge.' % parsed_font['FontName']
+ continue
+ if 'FontInfo' not in parsed_font:
+ print >>sys.stderr, 'info: font %s has no FontInfo, can not merge.' % parsed_font['FontName']
+ continue
+ if 'CharStrings' not in parsed_font:
+ print >>sys.stderr, 'info: font %s has no CharStrings, can not merge.' % parsed_font['FontName']
+ continue
+ if 'Subrs' in parsed_font:
+ print >>sys.stderr, 'info: font %s has Subrs, can not merge.' % parsed_font['FontName']
+ continue
+ if 'Subrs' in parsed_font['Private']:
+ print >>sys.stderr, 'info: font %s has Private Subrs, can not merge.' % parsed_font['FontName']
+ continue
assert parsed_font['FontType'] == 2
assert 'CharStrings' in parsed_font
assert 'FontMatrix' in parsed_font
@@ -4536,7 +4582,10 @@
print >>sys.stderr, ('info: executing image optimizer %s: %s' %
(cmd_name, cmd))
status = os.system(cmd)
- if status:
+ if status == 0x200:
+ # pngout exits with code 2 (os.system() status 0x200 on Unix) if it can not compress the file further, but this is not an error
+ print >>sys.stderr, 'info: %s returned, status=0x%x, continuing' % (cmd_name, status)
+ elif status:
print >>sys.stderr, 'info: %s failed, status=0x%x' % (cmd_name, status)
assert 0, '%s failed (status)' % cmd_name
assert os.path.exists(targetfn), (
@@ -4596,8 +4645,8 @@
image_obj.Get('ImageMask', False) is True)
#if image_obj.Get('Filter') == '/FlateDecode':
# If we do a zlib.decompress(stream) now, it will succeed even if stream
- # has trailing garbage. But zlib.decompress(steram[:-1]) would fail. In
- # Python, there is no way the get te real end on the compressed zlib
+ # has trailing garbage. But zlib.decompress(stream[:-1]) would fail. In
+ # Python, there is no way to get the real end of the compressed zlib
# stream (see also http://www.faqs.org/rfcs/rfc1950.html and
# http://www.faqs.org/rfcs/rfc1951.html). We may just check the last
# 4 bytes (adler32).
@@ -5018,7 +5067,7 @@
obj_images.append(self.ConvertImage(
sourcefn=rendered_image_file_name,
targetfn='pso.conv-%d.sam2p-np.pdf' % obj_num,
- # We specify -s here to explicitly exclue SF_Opaque for single-color
+ # We specify -s here to explicitly exclude SF_Opaque for single-color
# images.
# !! do we need /ImageMask parsing if we exclude SF_Mask here as well?
# Original sam2p order: Opaque:Transparent:Gray1:Indexed1:Mask:Gray2:Indexed2:Rgb1:Gray4:Indexed4:Rgb2:Gray8:Indexed8:Rgb4:Rgb8:Transparent2:Transparent4:Transparent8
@@ -5184,7 +5233,7 @@
A new dict mapping object numbers to PdfObj instances.
"""
# List of list of desc ([obj_num, head_minus, stream, refs_to,
- # inrefs_count]). Each list of eqclasses is an eqivalence class of
+ # inrefs_count]). Each list of eqclasses is an equivalence class of
# object descs.
eqclasses = []
# Maps object numbers to an element of eqclasses.
@@ -5213,8 +5262,14 @@
form = (head_minus, stream)
form_desc = by_form.get(form)
if form_desc is not None:
- form_desc.append(desc)
- eqclass_of[obj_num] = form_desc
+ if (desc[2] is None and
+ objs[obj_num].head.startswith('<<') and
+ objs[obj_num].Get('Type') == "/Page"):
+ eqclasses.append([desc])
+ eqclass_of[obj_num] = by_form[form] = eqclasses[-1]
+ else:
+ form_desc.append(desc)
+ eqclass_of[obj_num] = form_desc
else:
eqclasses.append([desc])
eqclass_of[obj_num] = by_form[form] = eqclasses[-1]
@@ -5415,10 +5470,10 @@

match = PdfObj.PDF_VERSION_HEADER_RE.match(data)
if not match:
- raise PdfTokenParseError('uncrecognized PDF signature %r' % data[0: 16])
+ raise PdfTokenParseError('unrecognized PDF signature %r' % data[0: 16])
version = match.group(1)

- # We set startxref ofs if avaialable. It is not an error not to have it
+ # We set startxref ofs if available. It is not an error not to have it
# (e.g. with a broken PDF with xref + trailer).
trailer_ofs = None
i = data.rfind('startxref')
@@ -5596,7 +5651,7 @@
raise PdfTokenParseError('invalid ref %r' % (ref_data,))
if int(match.group(2)) != 0:
raise PdfTokenParseError(
- 'invalid ref geneartion in %r' % (ref_data,))
+ 'invalid ref generation in %r' % (ref_data,))
set_obj.add(int(match.group(1)))

pdf = PdfData()
@@ -5952,7 +6007,10 @@
assert 0, 'Multivalent.jar not found, see above'
assert ':' not in multivalent_jar # $CLASSPATH separator

- multivalent_cmd = 'java -cp %s tool.pdf.Compress %s' % (
+ # -nocore14 removes embedded copies of the 14 core fonts
+ # -noalt strips alternate images
+ # -nopagepiece strips page piece data (stored by applications but not needed for viewing)
+ multivalent_cmd = 'java -cp %s tool.pdf.Compress -nocore14 -noalt -nopagepiece %s' % (
ShellQuoteFileName(multivalent_jar),
ShellQuoteFileName(in_pdf_tmp_file_name))
print >>sys.stderr, (
Powered by Google Project Hosting