File: pdftext.mli

package info (click to toggle)
camlpdf 0.5-1
  • links: PTS, VCS
  • area: non-free
  • in suites: squeeze, wheezy
  • size: 1,516 kB
  • ctags: 2,689
  • sloc: ml: 18,229; ansic: 139; makefile: 139
file content (128 lines) | stat: -rw-r--r-- 3,359 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
(** Parsing fonts and extracting text. Note: text extraction is incomplete. *)

(** {2 Data Types } *)

(** Data carried by a Type 3 font; this is the payload of the [Type3]
    constructor of [simple_fonttype].

    - [fontbbox]: four floats giving the font bounding box (presumably the
      /FontBBox entry of the font dictionary - TODO confirm the ordering).
    - [fontmatrix]: a [Transform.transform_matrix] (presumably the
      /FontMatrix entry - TODO confirm).
    - [charprocs]: association list pairing a string with a PDF object
      (presumably glyph name and its /CharProcs content stream - TODO
      confirm).
    - [type3_resources]: a PDF object (presumably the /Resources entry of the
      font - TODO confirm).

    NOTE(review): the type name is misspelt ([glpyhs] rather than [glyphs]).
    It is part of the public interface, so it is left unchanged. *)
type type3_glpyhs =
  {fontbbox : float * float * float * float;
   fontmatrix : Transform.transform_matrix;
   charprocs : (string * Pdf.pdfobject) list;
   type3_resources : Pdf.pdfobject}

(** The kind of a simple font, as stored in the [fonttype] field of
    [simple_font].

    Only [Type3] carries data (a [type3_glpyhs] record); the other
    constructors are plain tags. The constructor names presumably correspond
    to the /Subtype values of a PDF font dictionary - TODO confirm. *)
type simple_fonttype =
  | Type1
  | MMType1
  | Type3 of type3_glpyhs
  | Truetype

(** Font metrics, represented as an array of floats.

    NOTE(review): presumably glyph widths indexed by character code - the
    indexing scheme and units are not visible in this interface; confirm
    against the implementation. *)
type fontmetrics = float array

(** Reference to a font file, as stored in the [fontfile] field of
    [fontdescriptor].

    Each constructor carries an [int]. NOTE(review): presumably this is the
    object number of the embedded font program stream, and the three
    constructors correspond to the /FontFile, /FontFile2 and /FontFile3 keys
    of a PDF font descriptor - confirm against the implementation. *)
type fontfile =
  | FontFile of int
  | FontFile2 of int
  | FontFile3 of int

(** Font descriptor: a handful of font-wide measurements plus an optional
    font file reference.

    - [ascent], [descent], [leading], [avgwidth], [maxwidth]: floats
      (presumably the like-named entries of a PDF font descriptor dictionary;
      units are not visible here - TODO confirm).
    - [fontfile]: [Some] font file reference, or [None] (presumably when the
      descriptor names no font file - TODO confirm). *)
type fontdescriptor =
  {ascent : float;
   descent : float;
   leading : float;
   avgwidth : float;
   maxwidth : float;
   fontfile : fontfile option}

(** A list of (string, int) pairs, used as the second component of
    [CustomEncoding] in [encoding].

    NOTE(review): presumably each pair is (glyph name, character code), taken
    from the /Differences array of a PDF encoding dictionary - confirm against
    the implementation. *)
type differences = (string * int) list

(** The encoding of a font. This type is recursive: two of its constructors
    wrap another [encoding].

    - [ImplicitInFontFile], [StandardEncoding], [MacRomanEncoding],
      [WinAnsiEncoding], [MacExpertEncoding]: plain tags with no payload.
    - [CustomEncoding (base, diffs)]: an encoding paired with a [differences]
      list (presumably [base] overridden by the entries of [diffs] - TODO
      confirm).
    - [FillUndefinedWithStandard e]: wraps the encoding [e]. Going by the
      name, codes left undefined by [e] presumably fall back to the standard
      encoding - TODO confirm against the implementation. *)
type encoding =
  | ImplicitInFontFile
  | StandardEncoding
  | MacRomanEncoding
  | WinAnsiEncoding
  | MacExpertEncoding
  | CustomEncoding of encoding * differences
  | FillUndefinedWithStandard of encoding

(** A simple font; this is the payload of the [SimpleFont] constructor of
    [font].

    - [fonttype]: which kind of simple font this is.
    - [basefont]: a string (presumably the /BaseFont name - TODO confirm).
    - [fontmetrics]: optional metrics array.
    - [fontdescriptor]: optional font descriptor.
    - [encoding]: the encoding of the font. *)
type simple_font =
  {fonttype : simple_fonttype;
   basefont : string;
   fontmetrics : fontmetrics option;
   fontdescriptor : fontdescriptor option;
   encoding : encoding}

(** The standard fonts: fourteen payload-free constructors covering the
    Times, Helvetica and Courier families (four variants each) plus [Symbol]
    and [ZapfDingbats].

    Values of this type appear in the [StandardFont] constructor of [font]
    and can be converted with [string_of_standard_font] and
    [standard_font_of_name]. *)
type standard_font =
  | TimesRoman
  | TimesBold
  | TimesItalic
  | TimesBoldItalic
  | Helvetica
  | HelveticaBold
  | HelveticaOblique
  | HelveticaBoldOblique
  | Courier
  | CourierBold
  | CourierOblique
  | CourierBoldOblique
  | Symbol
  | ZapfDingbats

(** [string_of_standard_font f] returns a string for the standard font [f].

    NOTE(review): presumably the PDF base font name of [f]; the exact
    spelling is not visible in this interface - confirm against the
    implementation. *)
val string_of_standard_font : standard_font -> string

(** [standard_font_of_name name] looks up a standard font from a string,
    returning an option.

    NOTE(review): presumably [None] means [name] is not a recognised standard
    font name, and presumably this accepts the strings produced by
    [string_of_standard_font] - confirm against the implementation. *)
val standard_font_of_name : string -> standard_font option

(** CID system information, stored in the [cid_system_info] field of
    [composite_CIDfont].

    - [registry]: a string.
    - [ordering]: a string.
    - [supplement]: an integer.

    NOTE(review): presumably mirrors the /Registry, /Ordering and /Supplement
    entries of a PDF /CIDSystemInfo dictionary - confirm. *)
type cid_system_info =
  {registry : string;
   ordering : string;
   supplement : int}

(** A CID font; this is the second component of the [CIDKeyedFont]
    constructor of [font].

    - [cid_system_info]: the CID system information record.
    - [cid_basefont]: a string (presumably the /BaseFont name - TODO confirm).
    - [cid_fontdescriptor]: the font descriptor. Unlike in [simple_font],
      this field is not optional.
    - [cid_widths]: list of (int, float) pairs (presumably CID and glyph
      width - TODO confirm).
    - [cid_default_width]: an integer (presumably the width used for CIDs
      absent from [cid_widths] - TODO confirm). Note that this is an [int]
      whereas the widths in [cid_widths] are floats. *)
type composite_CIDfont =
  {cid_system_info : cid_system_info;
   cid_basefont : string;
   cid_fontdescriptor : fontdescriptor;
   cid_widths : (int * float) list;
   cid_default_width : int}
  
(** The encoding of a CID-keyed font; this is the third component of the
    [CIDKeyedFont] constructor of [font].

    - [Predefined name]: carries a string (presumably the name of a
      predefined CMap - TODO confirm).
    - [CMap n]: an indirect reference to a CMap stream. *)
type cmap_encoding =
  | Predefined of string
  | CMap of int (* indirect reference to CMap stream *)

(** A font, as returned by [read_font].

    - [StandardFont (f, enc)]: one of the standard fonts together with its
      encoding.
    - [SimpleFont f]: a simple font described by a [simple_font] record.
    - [CIDKeyedFont (s, cidfont, cmap)]: a string, a CID font and its CMap
      encoding (the string is presumably the base font name of the composite
      font - TODO confirm). *)
type font =
  | StandardFont of standard_font * encoding
  | SimpleFont of simple_font
  | CIDKeyedFont of string * composite_CIDfont * cmap_encoding

(** {2 Reading a Font} *)

(** [read_font pdf obj] reads a font from the document [pdf] and the font
    object [obj].

    NOTE(review): the behaviour when [obj] is not a well-formed font is not
    visible in this interface (the return type is a plain [font], so an
    exception is likely) - confirm against the implementation. *)
val read_font : Pdf.pdfdoc -> Pdf.pdfobject -> font

(** {2 Text Extraction} *)

(** The type of text extractors. It is abstract: values are built with
    [text_extractor_of_font] and consumed by [codepoints_of_text],
    [utf16be_of_text] and [latin1_string_of_text]. *)
type text_extractor

(** [text_extractor_of_font pdf fontobj] builds a text extractor from the
    document [pdf] and the font object [fontobj]. The arguments have the same
    types as those of [read_font]. *)
val text_extractor_of_font : Pdf.pdfdoc -> Pdf.pdfobject -> text_extractor

(** [codepoints_of_text extractor s] returns the list of Unicode code points
    for the string [s] under the given extractor. The string [s] is, for
    example, the operand of a [Pdfpages.Op_Tj] or [Op_TJ] operator. *)
val codepoints_of_text : text_extractor -> string -> int list

(** [utf16be_of_text extractor s] is like [codepoints_of_text], but returns
    the result as a UTF16BE string instead of a list of code points.

    NOTE(review): whether the result starts with a byte order mark is not
    visible in this interface - confirm against the implementation. *)
val utf16be_of_text : text_extractor -> string -> string

(** [utf16be_of_codepoints codepoints] returns a UTF16BE string built from a
    list of code points.

    NOTE(review): whether the result starts with a byte order mark is not
    visible in this interface - confirm against the implementation. *)
val utf16be_of_codepoints : int list -> string

(** [latin1_string_of_text extractor s] is like [codepoints_of_text], but
    returns the result as a Latin1 string. The conversion is lossy.

    NOTE(review): how code points with no Latin1 equivalent are handled
    (dropped or replaced) is not visible in this interface - confirm against
    the implementation. *)
val latin1_string_of_text : text_extractor -> string -> string

(** [decode_char encoding c] decodes the single character code [c] of a
    standard font under [encoding], returning a character.

    NOTE(review): the character set of the result, and the handling of codes
    undefined in [encoding], are not visible in this interface - confirm
    against the implementation. *)
val decode_char : encoding -> char -> char

(** [decode_type3_char encoding c] decodes the single character code [c] of a
    Type 3 font under [encoding], returning a glyph name.

    NOTE(review): the result for codes with no glyph name in [encoding] is
    not visible in this interface - confirm against the implementation. *)
val decode_type3_char : encoding -> char -> string

(** [codepoints_of_textstring s] reads the code points in the PDF text string
    [s], which may be either Unicode or PDFDocEncoding.

    Unlike [codepoints_of_text], this takes no text extractor.
    NOTE(review): how the two forms are told apart is not visible in this
    interface - confirm against the implementation. *)
val codepoints_of_textstring : string -> int list

(** [string_of_font f] returns a string describing the font [f], intended for
    debugging. The format of the string is not specified by this
    interface. *)
val string_of_font : font -> string