(** Parsing fonts and extracting text from content streams and PDF strings *)
(** {2 Data Types } *)
(** Data carried by a Type 3 font; this record is the payload of the [Type3]
constructor of [simple_fonttype].
- [fontbbox]: the font bounding box, as four floats.
- [fontmatrix]: the font matrix, as a [Pdftransform.transform_matrix].
- [charprocs]: an association list from a name to a PDF object (presumably
  glyph name to glyph procedure -- confirm against the implementation).
- [type3_resources]: the resources of the font, as a PDF object.

NOTE(review): the type name is misspelt (glpyhs for glyphs). It is part of the
public interface, so it is left unchanged here. *)
type type3_glpyhs =
{fontbbox : float * float * float * float;
fontmatrix : Pdftransform.transform_matrix;
charprocs : (string * Pdf.pdfobject) list;
type3_resources : Pdf.pdfobject}
(** The kind of a simple font.
- [Type1], [MMType1] and [Truetype] carry no data.
- [Type3] carries the [type3_glpyhs] record describing the font. *)
type simple_fonttype =
| Type1
| MMType1
| Type3 of type3_glpyhs
| Truetype
(** A reference to a font file, as used by the [fontfile] field of
[fontdescriptor]. Each constructor carries a single [int].

NOTE(review): the constructor names presumably mirror the /FontFile,
/FontFile2 and /FontFile3 font descriptor entries, and the [int] is presumably
the object number of the font file stream -- confirm against the
implementation. *)
type fontfile =
| FontFile of int
| FontFile2 of int
| FontFile3 of int
(** A font descriptor.
- [ascent], [descent], [avgwidth], [maxwidth], [italicangle], [capheight],
  [xheight], [stemv]: numeric metrics, each held as a float.
- [flags]: the font flags, held as a single [int].
- [fontbbox]: the font bounding box, as four floats.
- [fontfile]: the font file reference, if there is one.
- [charset]: an optional list of strings (presumably glyph names -- confirm).
- [tounicode]: an optional table with [int] keys and [string] values
  (presumably character code to unicode text -- confirm). *)
type fontdescriptor =
{ascent : float;
descent : float;
avgwidth : float;
maxwidth : float;
flags : int;
fontbbox: float * float * float * float;
italicangle : float;
capheight : float;
xheight : float;
stemv : float;
fontfile : fontfile option;
charset : string list option;
tounicode : (int, string) Hashtbl.t option}
(** A list of [(string * int)] pairs, used as the second component of
[CustomEncoding]. Presumably each pair is a glyph name and the character code
it is assigned to -- confirm against the implementation. *)
type differences = (string * int) list
(** Font encodings.
- [ImplicitInFontFile]: by its name, the encoding built into the font file
  rather than one given explicitly.
- [StandardEncoding], [MacRomanEncoding], [WinAnsiEncoding],
  [MacExpertEncoding]: named encodings, carrying no data.
- [CustomEncoding (base, differences)]: an encoding [base] together with a
  list of [differences].
- [FillUndefinedWithStandard e]: wraps another encoding [e]. Presumably codes
  left undefined by [e] are taken from the standard encoding -- confirm
  against the implementation. *)
type encoding =
| ImplicitInFontFile
| StandardEncoding
| MacRomanEncoding
| WinAnsiEncoding
| MacExpertEncoding
| CustomEncoding of encoding * differences
| FillUndefinedWithStandard of encoding
(** Font metrics: a float array holding the widths of glyphs 0..255. *)
type fontmetrics = float array
(** A simple (non-composite) font.
- [fonttype]: which kind of simple font this is.
- [basefont]: the base font name, as a string.
- [firstchar], [lastchar]: two character codes, as ints (presumably the first
  and last codes covered by [widths] -- confirm).
- [widths]: an array of integer widths.
- [fontdescriptor]: the font descriptor, if there is one.
- [fontmetrics]: the font metrics, if there are any.
- [encoding]: the encoding of the font. *)
type simple_font =
{fonttype : simple_fonttype;
basefont : string;
firstchar : int;
lastchar : int;
widths : int array;
fontdescriptor : fontdescriptor option;
fontmetrics : fontmetrics option;
encoding : encoding}
(** The fourteen standard fonts: four faces each of Times, Helvetica and
Courier, plus Symbol and ZapfDingbats. See [string_of_standard_font] and
[standard_font_of_name] for conversion to and from names. *)
type standard_font =
| TimesRoman
| TimesBold
| TimesItalic
| TimesBoldItalic
| Helvetica
| HelveticaBold
| HelveticaOblique
| HelveticaBoldOblique
| Courier
| CourierBold
| CourierOblique
| CourierBoldOblique
| Symbol
| ZapfDingbats
(** CID system information, used by the [cid_system_info] field of
[composite_CIDfont]: a [registry] string, an [ordering] string and an integer
[supplement]. Presumably these mirror the entries of a /CIDSystemInfo
dictionary -- confirm against the implementation. *)
type cid_system_info =
{registry : string;
ordering : string;
supplement : int}
(** A CID font, as carried by the [CIDKeyedFont] constructor of [font].
- [cid_system_info]: the CID system information.
- [cid_basefont]: the base font name, as a string.
- [cid_fontdescriptor]: the font descriptor. Unlike [simple_font], this field
  is not optional.
- [cid_widths]: a list of [(int * float)] pairs (presumably a CID and its
  width -- confirm).
- [cid_default_width]: the default width, as an [int]. *)
type composite_CIDfont =
{cid_system_info : cid_system_info;
cid_basefont : string;
cid_fontdescriptor : fontdescriptor;
cid_widths : (int * float) list;
cid_default_width : int}
(** The encoding of a CID-keyed font.
- [Predefined name]: a predefined CMap, identified by a string.
- [CMap i]: an indirect reference to a CMap stream, as an [int]. *)
type cmap_encoding =
| Predefined of string
| CMap of int (* indirect reference to CMap stream *)
(** The main font type.
- [StandardFont (f, enc)]: one of the standard fonts together with an encoding.
- [SimpleFont f]: a simple font.
- [CIDKeyedFont (s, cidfont, enc)]: a string, a CID font and its CMap encoding
  (the string is presumably the base font name -- confirm against the
  implementation). *)
type font =
| StandardFont of standard_font * encoding
| SimpleFont of simple_font
| CIDKeyedFont of string * composite_CIDfont * cmap_encoding
(** {2 String representations of fonts } *)
(** [string_of_standard_font f] returns the name of the standard font [f] as a
string, such as "Times-Bold" for [Pdftext.TimesBold]. *)
val string_of_standard_font : standard_font -> string
(** [standard_font_of_name name] parses a string such as "/Times-Bold" or
"/TimesNewRoman,Bold" to [Pdftext.TimesBold] etc. The result is an option;
presumably [None] means the name matches no standard font -- confirm against
the implementation. *)
val standard_font_of_name : string -> standard_font option
(** [string_of_font f] builds a debug string for the whole font datatype. It is
intended for debugging output rather than for parsing back. *)
val string_of_font : font -> string
(** {2 Reading a Font} *)
(** [read_font pdf obj] reads a font from the document [pdf], given the font
object [obj]. *)
val read_font : Pdf.t -> Pdf.pdfobject -> font
(** {2 Writing a Font} *)
(** [write_font ?objnum pdf font] writes [font] to the document [pdf],
returning the object number for the main font dictionary.
NOTE(review): the optional [objnum] is presumably the object number to use for
the main font dictionary instead of a fresh one -- confirm against the
implementation. *)
val write_font : ?objnum:int -> Pdf.t -> font -> int
(** {2 Utility functions} *)
(** Is a PDF string UTF16BE (i.e. does it have a byte order marker at the
beginning)? *)
val is_unicode : string -> bool
(** [is_identity_h font] tests whether [font] is Identity H (presumably, whether
it is a CID-keyed font using the predefined Identity-H CMap -- confirm against
the implementation). *)
val is_identity_h : font -> bool
(** [codepoints_of_utf8 s] is the list of unicode codepoints for the UTF8
string [s]. It is the counterpart of [utf8_of_codepoints]. *)
val codepoints_of_utf8 : string -> int list
(** [utf8_of_codepoints l] is the UTF8 string for the list of unicode
codepoints [l]. It is the counterpart of [codepoints_of_utf8]. *)
val utf8_of_codepoints : int list -> string
(** [codepoints_of_utf16be s] is the list of unicode codepoints for the UTF16BE
string [s]. *)
val codepoints_of_utf16be : string -> int list
(** [utf16be_of_codepoints l] is the UTF16BE string for the list of unicode
codepoints [l]. The result includes the byte order marker (BOM). *)
val utf16be_of_codepoints : int list -> string
(** {2 Text from strings outside page content} *)
(** [utf8_of_pdfdocstring s] takes a PDF string [s] (which will be either
PDFDocEncoding or UTF16BE) and returns a string representing the same unicode
codepoints in UTF8. *)
val utf8_of_pdfdocstring : string -> string
(** [pdfdocstring_of_utf8 s] takes a UTF8 string [s] and converts it to
PDFDocEncoding (if no unicode-only characters are used) or UTF16BE (if they
are). *)
val pdfdocstring_of_utf8 : string -> string
(** [pdfdocstring_of_codepoints l] builds a PDF string from the list of unicode
codepoints [l], in PDFDocEncoding (if no unicode-only characters are used) or
UTF16BE (if they are). *)
val pdfdocstring_of_codepoints : int list -> string
(** [codepoints_of_pdfdocstring s] produces the list of unicode codepoints for
a PDF document string [s], which is in either PDFDocEncoding or UTF16BE. *)
val codepoints_of_pdfdocstring : string -> int list
(** [simplify_utf16be s] remakes the UTF16BE string [s] into a PDFDocEncoding
string if all of its characters are in PDFDocEncoding. Presumably [s] is
returned unchanged otherwise -- confirm against the implementation. *)
val simplify_utf16be : string -> string
(** {2 Text from strings inside page content} *)
(** The type of text extractors. The type is abstract: values are built with
[text_extractor_of_font] or [text_extractor_of_font_real], and used with
[codepoints_of_text] and [glyphnames_of_text]. *)
type text_extractor
(** [text_extractor_of_font pdf obj] builds a text extractor from the document
[pdf] and the font object [obj]. Use [text_extractor_of_font_real] when the
font has already been read into a [font] value. *)
val text_extractor_of_font : Pdf.t -> Pdf.pdfobject -> text_extractor
(** [text_extractor_of_font_real font] builds a text extractor from an
already-read [font] value. Unlike [text_extractor_of_font], it takes no
document argument. *)
val text_extractor_of_font_real : font -> text_extractor
(** [codepoints_of_text extractor s] returns the list of unicode codepoints for
the string [s] under the given extractor. The string is, for example, the
operand of a [Pdfpages.Op_Tj] or [Op_TJ] operator. *)
val codepoints_of_text : text_extractor -> string -> int list
(** [glyphnames_of_text extractor s] returns the list of glyph names for the
string [s] under the given extractor. *)
val glyphnames_of_text : text_extractor -> string -> string list
(** {2 Building text for strings inside page content} *)
(** [charcode_extractor_of_font ?debug pdf obj] builds, from the document [pdf]
and the font object [obj], a function from a unicode codepoint to its character
code. That function returns the character code if the codepoint exists in the
encoding and font object, and [None] if it does not. If [debug] is set (default
false) missing characters are reported to stderr. *)
val charcode_extractor_of_font : ?debug:bool -> Pdf.t -> Pdf.pdfobject -> (int -> int option)
(** [charcode_extractor_of_font_real ?debug font] builds, from an already-read
[font] value, a function from a unicode codepoint to its character code. That
function returns the character code if the codepoint exists in the encoding and
font, and [None] if it does not. If [debug] is set (default false) missing
characters are reported to stderr. *)
val charcode_extractor_of_font_real : ?debug:bool -> font -> (int -> int option)
(** [table_of_encoding enc] is a table of all the entries in the encoding
[enc], with [int] keys and [string] values (presumably character code to glyph
name -- confirm against the implementation). *)
val table_of_encoding : encoding -> (int, string) Hashtbl.t
(** [reverse_table_of_encoding enc] is the reverse table of all the entries in
the encoding [enc]: [string] keys and [int] values, the opposite direction to
[table_of_encoding]. *)
val reverse_table_of_encoding : encoding -> (string, int) Hashtbl.t
|