forked from johnwhitington/camlpdf
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpdfread.ml
1948 lines (1852 loc) · 73 KB
/
pdfread.ml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
(* This module can read PDF files into the format given by the [Pdf] module *)
open Pdfutil
open Pdfio
open Pdfgenlex
let read_debug = ref false
let error_on_malformed = ref false
let debug_always_treat_malformed = ref false
(* Predicate on newline characters (carriage return and linefeed). *)
let is_newline = function
| '\010' | '\013' -> true
| _ -> false
let b = Buffer.create 30
let input_line i =
Buffer.clear b;
let goteol = ref false
and finished = ref false in
while not !finished do
match i.input_byte () with
| x when x = Pdfio.no_more -> set finished
| x ->
let c = Char.unsafe_chr x in
if is_newline c then set goteol
else if !goteol then (rewind i; set finished)
else Buffer.add_char b c
done;
Buffer.contents b
(* Read back until a predicate is fulfilled. *)
let rec read_back_until p i =
if
(notpred p)
(match read_char_back i with Some x -> x | None -> raise End_of_file)
then read_back_until p i
(* Go back one line. In other words, find the second EOL character group
seeking back in the file, and seek to the character after it. A blank line
after a line with a single EOL character will be treated as being part of that
EOL. *)
let backline i =
read_back_until is_newline i;
read_back_until (notpred is_newline) i;
read_back_until is_newline i;
nudge i
(* Read the major and minor version numbers *)
let get8chars i =
let c1 = i.input_char () in
let c2 = i.input_char () in
let c3 = i.input_char () in
let c4 = i.input_char () in
let c5 = i.input_char () in
let c6 = i.input_char () in
let c7 = i.input_char () in
let c8 = i.input_char () in
try map unopt [c1; c2; c3; c4; c5; c6; c7; c8] with _ -> []
let rec read_header_inner pos i =
try
if pos > 1024 then raise End_of_file else
i.Pdfio.seek_in pos;
match get8chars i with
| '%'::'P'::'D'::'F'::'-'::major::'.'::minor ->
let minorchars = takewhile isdigit minor in
if minorchars = []
then
raise
(Pdf.PDFError (Pdf.input_pdferror i "Malformed PDF header"))
else
begin
if !read_debug then Pdfe.log (Printf.sprintf "setting offset to %i\n" pos);
i.set_offset pos;
int_of_string (string_of_char major), int_of_string (implode minorchars)
end
| _ ->
read_header_inner (pos + 1) i
with
End_of_file | Failure _ (*"int_of_string"*) -> (2, 0)
let read_header =
read_header_inner 0
(* Find the EOF marker, and move position to its first character. We allow 1024
bytes from end-of-file for compatibility with Acrobat. *)
let find_eof i =
let fail () =
raise (Pdf.PDFError (Pdf.input_pdferror i "Could not find EOF marker"))
in
let pos = ref (i.in_channel_length - 4) in
try
let notfound = ref true
in let tries = ref !pos in (* unlike Acrobat, we check whole file *)
while !notfound do
pos := !pos - 1;
i.seek_in !pos;
if !tries < 0 then fail () else decr tries;
let a = i.input_byte () in
let b = i.input_byte () in
let c = i.input_byte () in
let d = i.input_byte () in
let e = i.input_byte () in
if a = int_of_char '%' && b = int_of_char '%'
&& c = int_of_char 'E' && d = int_of_char 'O'
&& e = int_of_char 'F'
then clear notfound
done;
i.seek_in !pos;
with
_ -> fail ()
(* String of lexeme. *)
let string_of_lexeme = function
| LexNull -> "null"
| LexBool b -> Pdfwrite.string_of_pdf (Pdf.Boolean b)
| LexInt i -> Pdfwrite.string_of_pdf (Pdf.Integer i)
| LexReal f -> Pdfwrite.string_of_pdf (Pdf.Real f)
| LexString s -> Pdfwrite.string_of_pdf (Pdf.String s)
| LexName s -> s
| LexLeftSquare -> "["
| LexRightSquare -> "]"
| LexLeftDict -> "<<"
| LexRightDict -> ">>"
| LexStream _ -> "LexStream"
| LexEndStream -> "EndStream"
| LexObj -> "obj"
| LexEndObj -> "endobj"
| LexR -> "R"
| LexComment s -> "\n%" ^ s ^ "\n"
| StopLexing -> "StopLexing"
| LexNone -> "LexNone"
let print_lexeme l =
Printf.printf "%s " (string_of_lexeme l)
(* Predicate on whitespace and delimiters. *)
let is_whitespace_or_delimiter c =
Pdf.is_whitespace c || Pdf.is_delimiter c
(* Return the list of characters between and including the current position and
before the next character satisfying a given predicate, leaving the position at
the character following the last one returned. Can raise [EndOfInput]. If [eoi]
is true, end of input is considered a delimiter, and the characters up to it are
returned if it is reached. *)
let getuntil eoi f i =
let rec getuntil_inner r eoi f i =
match i.input_byte () with
| x when x = Pdfio.no_more ->
if eoi then rev r else raise End_of_file
| x ->
let chr = char_of_int x in
if f chr
then (rewind i; rev r)
else getuntil_inner (chr::r) eoi f i
in
getuntil_inner [] eoi f i
let b = Buffer.create 30
let getuntil_string eoi f i =
let rec getuntil_inner_string b eoi f i =
match i.input_byte () with
| x when x = Pdfio.no_more ->
if eoi then Buffer.contents b else raise End_of_file
| x ->
let chr = char_of_int x in
if f chr
then (rewind i; Buffer.contents b)
else getuntil_inner_string (Buffer.add_char b chr; b) eoi f i
in
Buffer.clear b;
getuntil_inner_string b eoi f i
(* The same, but don't return anything. *)
let rec ignoreuntil eoi f i =
match i.input_byte () with
| x when x = Pdfio.no_more -> if eoi then () else raise End_of_file
| x -> if f (Char.unsafe_chr x) then rewind i else ignoreuntil eoi f i
(* Ignore until the next whitespace *)
let ignoreuntilwhite =
ignoreuntil true Pdf.is_whitespace
(* Position on the next non-whitespace character. *)
let dropwhite i =
ignoreuntil true Pdf.is_not_whitespace i
(* The same, but stop at array, dictionary endings etc. *)
let getuntil_white_or_delimiter =
getuntil true is_whitespace_or_delimiter
let getuntil_white_or_delimiter_string =
getuntil_string true is_whitespace_or_delimiter
(* Lexing *)
(* Each of the following functions lexes a particular object, leaving the
channel position at the character after the end of the lexeme. Upon entry, the
file position is on the first character of the potential lexeme. *)
(* Lex a bool. *)
let lex_bool i =
match getuntil_white_or_delimiter i with
| ['t'; 'r'; 'u'; 'e'] -> LexBool true
| ['f'; 'a'; 'l'; 's'; 'e'] -> LexBool false
| _ -> LexNone
(* Lex an int or float. See PDF manual for details of policy. *)
let lex_number i =
let pos = i.pos_in () in
try
match Pdfgenlex.lex_single i with
| Pdfgenlex.LexInt i -> LexInt i
| Pdfgenlex.LexReal f -> LexReal f
| _ -> LexNone
with
| Failure x when x = "hd" -> LexNone
| Pdf.PDFError _ (* can't cope with floats with leading point. *)
| Failure _ (*"int_of_string"*) ->
LexReal
(float_of_string
(i.seek_in pos; (getuntil_white_or_delimiter_string i)))
(* [float_of_string] never fails. *)
(* Lex a name. *)
let b = Buffer.create 30
let lex_name i =
Buffer.clear b;
nudge i;
Buffer.add_char b '/';
let fini = ref false in
while not !fini do
match i.input_byte () with
| x when x = Pdfio.no_more -> set fini
| x ->
let c = Char.unsafe_chr x in
if is_whitespace_or_delimiter c then
begin rewind i; set fini end
else if c = '#' then
begin
let a = i.input_byte () in
let a2 = i.input_byte () in
if a <> Pdfio.no_more && a2 <> Pdfio.no_more then
Buffer.add_char b
(char_of_int
(int_of_string
("0x" ^ string_of_char (char_of_int a) ^
string_of_char (char_of_int a2))))
end
else Buffer.add_char b c
done;
LexName (Buffer.contents b)
(* Lex a comment. We throw away everything from here until a new line. In the
case of a CRLF, only the CR is consumed, but the LF will be consumed before the
next token is read anyway, so this is fine. *)
let lex_comment i =
ignoreuntil false is_newline i;
LexComment ""
(* Lex a string. A string is between parenthesis. Unbalanced parenthesis in the
string must be escaped, but balanced ones need not be. We convert escaped
characters to the characters themselves. A newline sequence following a
backslash represents a newline. The string is returned without its enclosing
parameters. *)
(* PDF strings can contain characters as a backslash followed by up to three
octal characters. If there are fewer than three, the next character in the file
cannot be a digit (The format is ambiguous as to whether this means an octal
digit --- we play safe and allow non-octal digits). This replaces these
sequences of characters by a single character as used by OCaml in its native
strings.
Beware malformed strings. For instance, Reader accepts ((\\(ISA)) *)
(* Build a character from a list of octal digits. *)
let mkchar l =
try
char_of_int (int_of_string ("0o" ^ implode l))
with
_ -> raise (Pdf.PDFError ("mkchar"))
let str = Buffer.create 16
(* Main function. *)
let lex_string i =
try
Buffer.clear str;
let paren = ref 1
in let c = char_of_int (i.input_byte ()) in
assert (c = '(');
while !paren > 0 do
let c = char_of_int (i.input_byte ()) in
match c with
| '(' ->
incr paren; Buffer.add_char str c;
| ')' ->
decr paren; if !paren > 0 then Buffer.add_char str c;
| '\\' ->
let c' = char_of_int (i.input_byte ()) in
(match c' with
| 'n' -> Buffer.add_char str '\n'
| 'r' -> Buffer.add_char str '\r'
| 't' -> Buffer.add_char str '\t'
| 'b' -> Buffer.add_char str '\b'
| 'f' -> Buffer.add_char str '\012'
| '\r' ->
if char_of_int (i.input_byte ()) <> '\n' then
rewind i
| '\n' -> ()
| x when x >= '0' && x <= '7' ->
(* Replace octal character sequences with the real character. *)
let o2 = char_of_int (i.input_byte ()) in
(match o2 with
| y when y >= '0' && y <= '7' ->
let o3 = char_of_int (i.input_byte ()) in
(match o3 with
| z when z >= '0' && z <= '7' ->
Buffer.add_char str (mkchar [c'; o2; o3])
| _ ->
rewind i;
Buffer.add_char str (mkchar [c'; o2]))
| _ ->
rewind i;
Buffer.add_char str (mkchar [c']))
| _ -> (* including ['('], [')'], ['\\'], and all the others *)
Buffer.add_char str c' )
| _ ->
Buffer.add_char str c
done;
LexString (Buffer.contents str)
with
| Failure _ (*"unopt"*) ->
raise (Pdf.PDFError (Pdf.input_pdferror i "lex_string failure"))
(* Lex a hexadecimal string. *)
let lex_hexstring i =
let mkchar a b =
try
char_of_int (int_of_string ("0x" ^ implode [a; b]))
with
_ -> raise (Pdf.PDFError (Pdf.input_pdferror i "Lexing Hexstring"))
in
let str = Buffer.create 16 in
try
let _ = i.input_byte () (*r skip start marker *)
in let finished = ref false in
let addchar = Buffer.add_char str in
let rec input_next_char () =
let x = i.input_byte () in
if x = -1 then raise End_of_file else
let c = char_of_int x in
if Pdf.is_whitespace c then input_next_char () else c
in
while not !finished do
let c = input_next_char () in
let c' = input_next_char () in
match c, c' with
| '>', _ -> rewind i; set finished
| a, '>' -> addchar (mkchar a '0'); set finished
| a, b -> addchar (mkchar a b)
done;
LexString (Buffer.contents str)
with
| End_of_file -> LexString (Buffer.contents str)
| Failure _ (*"unopt"*) ->
raise (Pdf.PDFError (Pdf.input_pdferror i "lex_hexstring"))
(* Lex a keyword. *)
let lex_keyword i =
match Pdfgenlex.lex_single i with
| Pdfgenlex.LexName "obj" -> LexObj
| Pdfgenlex.LexName "endobj" -> LexEndObj
| Pdfgenlex.LexName "R" -> LexR
| Pdfgenlex.LexName "null" -> LexNull
| Pdfgenlex.LexName "endstream" -> LexEndStream
| Pdfgenlex.LexName x ->
(* Some malformed files have endobj run together with the next object number *)
begin match explode x with
| 'e'::'n'::'d'::'o'::'b'::'j'::l -> iter (fun _ -> rewind i) l; LexEndObj
| _ -> LexNone
end
| l ->
Pdfe.log (Printf.sprintf "failed to lex keyword: %s\n" (Pdfgenlex.string_of_token l));
LexNone
(* Read some chars from a file. Leaves position as-is, except in the case of
reaching past the end of a file, in which case an exception is raised. *)
let read_chunk n i =
try
let orig_pos = i.pos_in () in
let s = String.init n (fun _ -> unopt (i.input_char ())) in
i.seek_in orig_pos;
s
with
_ -> raise (Failure "read_chunk")
(* Lex a stream, given its length (previously extracted by parsing the stream
dictionary). If [opt] is [true] the stream is actually read, if [false] a
[ToGet] tuple is created. The channel is positioned on the first character of
the stream keyword. *)
let is_malformed i =
try
match read_chunk 9 i with
| "endstream" -> false
| _ ->
match read_chunk 10 i with
| "\nendstream" | "\rendstream" -> false
| _ ->
match read_chunk 11 i with
| "\r\nendstream" -> false
| x -> true
with
_ -> true (* Beyond end of file - so, read_chunk would have failed... *)
let skip_stream_beginning i =
ignoreuntilwhite i; (* consume the 'stream' keyword *)
(* Ignore any white other than CR or LF. For malformed files which don't have
* CR or LF immediately following stream keyword. *)
ignoreuntil
true (function ' ' | '\000' | '\012' | '\009' -> false | _ -> true) i;
(* Skip either CRLF or LF. (See PDF specification for why). *)
match char_of_int (i.input_byte ()) with
| '\013' ->
begin match char_of_int (i.input_byte ()) with
| '\010' -> () (* It was CRLF *)
| _ -> rewind i (* No padding, happens to be CR *)
end
| '\010' -> () (* Just LF *)
| _ -> rewind i (* No padding. *)
(* Return position of first character for endstream sequence, or last character
of file if none exists. *)
let rec find_endstream i =
let rec match_chunk s n i =
n = String.length s
||
match i.input_char () with
| None -> false
| Some c when c = s.[n] -> match_chunk s (n + 1) i
| Some _ -> false
in
let match_chunk s i =
let pos = i.pos_in () in
let r = match_chunk s 0 i in
i.seek_in pos;
r
in
if
match_chunk "endstream" i || match_chunk "\nendstream" i ||
match_chunk "\r\nendstream" i || match_chunk "\rendstream" i
then
i.pos_in ()
else
(if i.pos_in () = i.in_channel_length
then i.pos_in ()
else (nudge i; find_endstream i))
let lex_malformed_stream_data i =
try
skip_stream_beginning i;
let curr = i.pos_in () in
let pos = find_endstream i in (* returns first char of endstream *)
i.seek_in curr;
let arr = mkbytes (pos - curr) in
for x = 0 to bytes_size arr - 1 do
bset_unsafe arr x (i.input_byte ())
done;
LexStream (Pdf.Got arr)
with
e ->
raise
(Pdf.PDFError ("Bad read malformed stream - " ^ Printexc.to_string e))
let lex_stream_data i l opt =
let original_pos = i.pos_in () in
try
skip_stream_beginning i;
let pos = i.pos_in () in
if opt then
let arr = mkbytes l in
if l > 0 then
setinit i arr 0 l;
i.seek_in (pos + l);
if is_malformed i
then (i.seek_in original_pos; lex_malformed_stream_data i)
else (i.seek_in (pos + l); LexStream (Pdf.Got arr))
else
begin
i.seek_in (pos + l);
if is_malformed i
then (i.seek_in original_pos; lex_malformed_stream_data i)
else (i.seek_in (pos + l); LexStream (Pdf.ToGet (Pdf.toget i pos l)))
end
with
_ -> raise (Pdf.PDFError (Pdf.input_pdferror i "lex_stream_data"))
(* Lex a stream. This involves parsing the stream dictionary to get the
length. [i] is at the start of the stream data, suitable for input to
[lex_stream_data]. We extract the dictionary by going through
[previous_lexemes], the reverse-order list of the lexemes already read. *)
let lex_stream i p previous_lexemes lexobj opt =
let fail () =
raise (Pdf.PDFError (Pdf.input_pdferror i "Failure lexing stream dict"))
in
let dictlexemes =
takewhile_reverse (function LexObj -> false | _ -> true) previous_lexemes
in
match p dictlexemes with
| _, Pdf.Dictionary a ->
let pos = i.pos_in () in
let rec findlength = function
| Pdf.Integer l -> Some l
| Pdf.Indirect k -> findlength (snd (p (lexobj k)))
| _ -> None
in
begin match lookup "/Length" a with
| None -> lex_malformed_stream_data i
| Some v ->
try
match findlength v with
| None -> lex_malformed_stream_data i
| Some l -> lex_stream_data i l opt
with
_ ->
(* When reading malformed files, /Length could be indirect,
and therefore not available. Treat as if it were available,
but incorrect. *)
i.seek_in pos;
lex_malformed_stream_data i
end
| _ -> fail ()
(* Find the next lexeme in the channel and return it. The latest-first lexeme
list [previous_lexemes] contains all things thus-far lexed. [dictlevel] is a
number representing the dictionary and/or array nesting level. If [endonstream]
is true, lexing ends upon encountering a [LexStream] lexeme. *)
let lex_next dictlevel arraylevel endonstream i previous_lexemes p opt lexobj =
try
dropwhite i;
let raw = i.input_byte () in
if raw = Pdfio.no_more then StopLexing else
let chr1 = Char.unsafe_chr raw in
rewind i;
match chr1 with
| '%' -> lex_comment i
| 't' | 'f' -> lex_bool i
| '/' -> lex_name i
| x when (x >= '0' && x <= '9') || x = '+' || x = '-' || x = '.' ->
lex_number i
| '[' -> nudge i; incr arraylevel; LexLeftSquare
| ']' -> nudge i; decr arraylevel; LexRightSquare
| '(' -> lex_string i
| '<' ->
let _ = char_of_int (i.input_byte ()) in
let chr2 = char_of_int (i.input_byte ()) in
rewind i; rewind i;
begin match chr2 with
| '<' -> nudge i; nudge i; incr dictlevel; LexLeftDict
| _ -> lex_hexstring i
end
| '>' ->
let _ = i.input_byte () in
let chr2 = char_of_int (i.input_byte ()) in
rewind i; rewind i;
begin match chr2 with
| '>' -> nudge i; nudge i; decr dictlevel; LexRightDict
| c -> LexNone
end
| 'R' -> nudge i; LexR
| 's' ->
(* Disambiguate "startxref" and "stream" on the third character. *)
let _ = i.input_byte () in
let _ = i.input_byte () in
let chr3 = char_of_int (i.input_byte ()) in
rewind i; rewind i; rewind i;
begin match chr3 with
| 'a' -> StopLexing (* startxref *)
| _ -> (* stream *)
if endonstream
then StopLexing
else lex_stream i p previous_lexemes lexobj opt
end
| x when x >= 'a' && x <= 'z' -> lex_keyword i
| 'I' -> StopLexing (* We've hit an ID marker in an inline image *)
| c -> (*Pdfe.log (Printf.sprintf "lexnone with character %C\n" c);*) LexNone
with
e ->
Pdfe.log (Printf.sprintf "Recovering from Lex error: %s\n" (Printexc.to_string e));
StopLexing
(* Lex just a dictionary, consuming only the tokens to the end of it. This is
used in the [PDFPages] module to read dictionaries in graphics streams. *)
let lex_dictionary minus_one i =
let rec lex_dictionary_getlexemes i lexemes dictlevel arraylevel =
let lex_dictionary_next i dictlevel arraylevel =
let dummyparse = fun _ -> 0, Pdf.Null
in let dummylexobj = fun _ -> [] in
(*Printf.eprintf "lex_dictionary_next: dictlevel = %i\n" !dictlevel;*)
let r =
lex_next dictlevel arraylevel false i [] dummyparse false dummylexobj
in
(*Printf.eprintf "Read lexeme %S\n" (string_of_lexeme r);
Printf.eprintf "after lex_dictionary_next: dictlevel = %i\n" !dictlevel;*)
r
in
match lex_dictionary_next i dictlevel arraylevel with
| LexRightDict when !dictlevel = (if minus_one then -1 else 0) && !arraylevel = 0 ->
rev (LexRightDict::lexemes)
| StopLexing ->
rev lexemes
| LexNone ->
raise
(Pdf.PDFError (Pdf.input_pdferror i "Could not read dictionary"))
| a ->
lex_dictionary_getlexemes i (a::lexemes) dictlevel arraylevel
in
lex_dictionary_getlexemes i [] (ref 0) (ref 0)
(* Calculate a list of lexemes from input [i], using parser [p] to lex
streams. Can raise [PDFError]. *)
let lex_object_at oneonly i opt p lexobj =
let dictlevel = ref 0
in let arraylevel = ref 0 in
let rec lex_object_at i lexemes =
let lexeme = lex_next dictlevel arraylevel false i lexemes p opt lexobj in
match lexeme with
| LexEndObj -> rev (lexeme::lexemes)
| StopLexing -> rev lexemes
| LexComment _ -> lex_object_at i (lexeme::lexemes)
| LexRightSquare | LexRightDict ->
if oneonly && !dictlevel = 0 && !arraylevel = 0
then
begin
let pos = i.pos_in () in
match
lex_next
dictlevel arraylevel false i (lexeme::lexemes) p opt lexobj
with
| LexStream s ->
begin match
lex_next
dictlevel arraylevel false i
(LexStream s::lexeme::lexemes) p opt lexobj
with
| LexEndStream ->
begin match
lex_next dictlevel arraylevel false i
(LexEndStream::LexStream s::lexeme::lexemes)
p opt lexobj
with
| LexEndObj ->
rev
(LexEndObj::LexEndStream::LexStream s::lexeme::lexemes)
| _ ->
rev
(LexEndObj::LexEndStream::LexStream s::lexeme::lexemes)
end
| _ ->
rev (LexEndObj::LexEndStream::LexStream s::lexeme::lexemes)
end
| _ -> i.seek_in pos; rev (lexeme::lexemes)
end
else lex_object_at i (lexeme::lexemes)
| LexNone ->
raise (Pdf.PDFError (Pdf.input_pdferror i "Could not read object"))
| LexInt i1 ->
(* Check for the case of "x y obj", which in the case of oneonly
should be returned as the one object. If i is followed by something
other than an integer and 'obj', we must rewind and just return the
integer *)
if oneonly && !dictlevel = 0 && !arraylevel = 0 then
let pos = i.pos_in () in
begin match
lex_next dictlevel arraylevel false i lexemes p opt lexobj
with
| LexInt i2 ->
begin match
lex_next dictlevel arraylevel false i lexemes p opt lexobj
with
| LexObj ->
lex_object_at i (LexObj::LexInt i2::LexInt i1::lexemes)
| _ ->
i.seek_in pos;
rev (LexInt i1::lexemes)
end
| _ ->
i.seek_in pos;
rev (LexInt i1::lexemes)
end
else
lex_object_at i (LexInt i1::lexemes)
| a ->
(* If oneonly, then can return if not in an array or dictionary and if
this lexeme was an atom. *)
let isatom = function
| LexBool _ | LexReal _ | LexString _ | LexName _ | LexNull -> true
| _ -> false
in
if oneonly && isatom a && !dictlevel = 0 && !arraylevel = 0
then rev (a::lexemes)
else lex_object_at i (a::lexemes)
in
lex_object_at i []
(* Type of sanitized cross-reference entries. They are either plain offsets, or
an object stream an index into it. *)
type xref =
| XRefPlain of int * int (* offset, generation. *)
| XRefStream of int * int (* object number of stream, index. *)
(*let string_of_xref = function
| XRefPlain (p, i) -> Printf.sprintf "XRefPlain (%i, %i)" p i
| XRefStream (o, i) -> Printf.sprintf "XrefStream %i, index %i" o i*)
let xrefs_table_create () = Hashtbl.create 1001
let xrefs_table_add_if_not_present table k v =
try ignore (Hashtbl.find table k) with
Not_found -> Hashtbl.add table k v
let xrefs_table_find table k =
try Some (Hashtbl.find table k) with
Not_found -> None
let xrefs_table_iter = Hashtbl.iter
(* [p] is the parser. Since this will be called from within functions it also
calls, we must store and retrieve the current file position on entry and exit.
*)
let rec lex_object i xrefs p opt n =
let current_pos = i.pos_in () in
let xref =
match xrefs_table_find xrefs n with
| Some x -> x
| None ->
raise
(Pdf.PDFError (Pdf.input_pdferror i "Object not in xref table"))
in
match xref with
| XRefStream (objstm, index) ->
assert false (*r lex object only used on XRefPlain entries *)
| XRefPlain (o, _) ->
i.seek_in o;
let result =
lex_object_at false i opt p (lex_object i xrefs p opt)
in
i.seek_in current_pos;
result
(* Parsing proceeds as a series of operations over lists of lexemes or parsed
objects. Parsing ends when the list is a singleton and its element is an
well-formed object. *)
type partial_parse_element =
| Lexeme of Pdfgenlex.t
| Parsed of Pdf.pdfobject
(*let print_parseme = function
| Parsed p ->
flprint "PARSED:";
print_string (Pdfwrite.string_of_pdf p);
flprint "\n"
| Lexeme l ->
flprint "LEXEME:";
print_lexeme l;
flprint "\n"*)
(* Parse stage one. *)
let parse_R ts =
let rec parse_R_inner r = function
| [] -> rev r
| LexInt o::LexInt _::LexR::rest ->
parse_R_inner (Parsed (Pdf.Indirect o)::r) rest
| LexComment _::t -> parse_R_inner r t
| LexNull::t -> parse_R_inner (Parsed Pdf.Null::r) t
| LexBool b::t -> parse_R_inner (Parsed (Pdf.Boolean b)::r) t
| LexInt i::t -> parse_R_inner (Parsed (Pdf.Integer i)::r) t
| LexReal f::t -> parse_R_inner (Parsed (Pdf.Real f)::r) t
| LexString s::t -> parse_R_inner (Parsed (Pdf.String s)::r) t
| LexName n::t -> parse_R_inner (Parsed (Pdf.Name n)::r) t
| h::t -> parse_R_inner (Lexeme h::r) t
in
parse_R_inner [] ts
let process_parse_dictionary elts =
let rec mkpairs pairs = function
| [] -> pairs
| Parsed v::Parsed (Pdf.Name k)::t -> mkpairs ((k, v)::pairs) t
| _ -> raise (Pdf.PDFError "parse_dictionary")
in
try
Parsed (Pdf.Dictionary (mkpairs [] elts))
with
Pdf.PDFError "parse_dictionary" ->
Pdfe.log
"Malformed file: odd length dictionary. Carrying on...\n";
Parsed (Pdf.Dictionary [])
let process_parse_array elts =
let arry =
rev_map
(function
| Parsed x -> x
| _ -> raise (Pdf.PDFError "parse_array"))
elts
in
Parsed (Pdf.Array arry)
(* Read everything to the close of the dictionary *)
let rec parse_dictionary sofar = function
| [] ->
process_parse_dictionary sofar, []
| Lexeme LexLeftDict::t ->
let dict, rest = parse_dictionary [] t in
parse_dictionary (dict::sofar) rest
| Lexeme LexLeftSquare::t ->
let arr, rest = parse_array [] t in
parse_dictionary (arr::sofar) rest
| Lexeme LexRightDict::t ->
process_parse_dictionary sofar, t
| h::t ->
parse_dictionary (h::sofar) t
(* Read everything to the close of the array *)
and parse_array sofar = function
| [] ->
process_parse_array sofar, []
| Lexeme LexLeftDict::t ->
let dict, rest = parse_dictionary [] t in
parse_array (dict::sofar) rest
| Lexeme LexLeftSquare::t ->
let arr, rest = parse_array [] t in
parse_array (arr::sofar) rest
| Lexeme LexRightSquare::t ->
process_parse_array sofar, t
| h::t ->
parse_array (h::sofar) t
(* Main function *)
let rec parse_to_tree sofar = function
| Lexeme LexLeftDict::t ->
let dict, rest = parse_dictionary [] t in
parse_to_tree (dict::sofar) rest
| Lexeme LexLeftSquare::t ->
let arr, rest = parse_array [] t in
parse_to_tree (arr::sofar) rest
| h::t -> parse_to_tree (h::sofar) t
| [] -> rev sofar
let parse_finish_read_stream o d obj s =
(* Fix up length, if necessary *)
let l =
match s with Pdf.Got b -> bytes_size b | Pdf.ToGet t -> Pdf.length_of_toget t
and lold =
try
begin match
lookup_failnull "/Length" d with Pdf.Integer l -> l | _ -> -1
end
with
Not_found -> -1
in
if lold <> l
then
(o,
Pdf.Stream
{contents = Pdf.add_dict_entry obj "/Length" (Pdf.Integer l), s})
else
(o, Pdf.Stream {contents = obj, s})
let parse_finish ?(failure_is_ok = false) q =
match q with
| [Parsed (Pdf.Integer o); Parsed (Pdf.Integer _);
Lexeme LexObj; Parsed obj; Lexeme LexEndObj] ->
o, obj
| Parsed (Pdf.Integer o)::
Parsed (Pdf.Integer _)::
Lexeme LexObj::
Parsed (Pdf.Dictionary d as obj)::
Lexeme (LexStream s)::
Lexeme LexEndStream::_ ->
parse_finish_read_stream o d obj s
| Parsed (Pdf.Dictionary d as obj)::
Lexeme (LexStream s)::
Lexeme LexEndStream::_ ->
(* Malformed file: stream object in object stream! *)
parse_finish_read_stream 0 d obj s
| Parsed (Pdf.Integer o)::Parsed (Pdf.Integer _)::
Lexeme LexObj::Parsed obj::_ ->
o, obj
| [Parsed d] ->
0, d
| [Parsed (Pdf.Integer o); Parsed (Pdf.Integer _);
Lexeme LexObj; Lexeme LexEndObj] ->
o, Pdf.Null
| l ->
Pdfe.log "Unable to parse object:\n";
iter
(function
Parsed p -> Pdfe.log (Printf.sprintf "%S\n" (Pdfwrite.string_of_pdf p))
| Lexeme l -> Pdfe.log (Printf.sprintf "%S\n" (string_of_lexeme l)))
l;
raise (Pdf.PDFError "Could not extract object")
(* Parse some lexemes *)
let parse ?(failure_is_ok = false) lexemes =
try
parse_finish
~failure_is_ok:failure_is_ok (parse_to_tree [] (parse_R lexemes))
with
Pdf.PDFError _ as e ->
if failure_is_ok then (max_int, Pdf.Null) else raise e
let parse_single_object s =
snd (parse (lex_object_at true (Pdfio.input_of_string s) true (fun _ -> (0, Pdf.Null)) (fun _ -> [])))
(* Given an object stream pdfobject and a list of object indexes to extract,
return an [(int * Pdf.objectdata) list] representing those object number,
lexeme pairs - assuming they exist! *)
let lex_stream_object
i xrefs parse opt obj indexes user_pw owner_pw partial_pdf gen
=
if !read_debug then
begin
Pdfe.log (Printf.sprintf "lexing object stream at %i\n" (i.Pdfio.pos_in ()));
Pdfe.log (Printf.sprintf "lexing object stream %i\nTo find the indexes:\n" obj);
iter (fun i -> Pdfe.log (Printf.sprintf "%i " i)) indexes; Pdfe.log "\n"
end;
let _, stmobj = parse (lex_object i xrefs parse opt obj) in
match stmobj with
| Pdf.Stream {contents = d, stream} ->
let n =
match Pdf.lookup_direct partial_pdf "/N" d with
| Some (Pdf.Integer n) -> n
| _ ->
raise (Pdf.PDFError (Pdf.input_pdferror i "malformed /N"))
in let first =
match Pdf.lookup_direct partial_pdf "/First" d with
| Some (Pdf.Integer n) -> n
| _ ->
raise (Pdf.PDFError (Pdf.input_pdferror i "malformed /First"))
in
(* Decrypt if necessary *)
let stmobj =
Pdfcrypt.decrypt_single_stream
user_pw owner_pw partial_pdf obj gen stmobj
in
Pdfcodec.decode_pdfstream partial_pdf stmobj;
begin match stmobj with
| Pdf.Stream {contents = _, Pdf.Got raw} ->
let i = input_of_bytes raw in
begin try
(* Read index. *)
let rawnums = ref [] in
for x = 1 to n * 2 do
dropwhite i;
rawnums =|
match lex_number i with
| LexInt i -> i
| k ->
raise
(Pdf.PDFError
(Pdf.input_pdferror i "objstm offset problem"))
done;
rawnums := rev !rawnums;
(* Read each object *)
let pairs = pairs_of_list !rawnums
and objects = ref []
and index = ref 0
and indexes = ref (sort compare_i indexes) in
iter
(fun (objnum, offset) ->
begin match !indexes with
| [] -> ()
| x::xs when x <> !index -> ()
| _::xs ->
indexes := xs;
i.seek_in (offset + first);
let lexemes =
lex_object_at
true i opt parse
(lex_object i xrefs parse opt)
in
let obj =
(ref
(Pdf.ParsedAlreadyDecrypted
(snd (parse lexemes))), 0)
in
objects =| (objnum, obj)
end;
incr index)
pairs;
Pdf.addobj_given_num partial_pdf (obj, Pdf.Null);
!objects
with
End_of_file ->
raise
(Pdf.PDFError