1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67
|
.. testsetup::
import sys
from pdfminer.high_level import extract_text_to_fp, extract_text
.. _tutorial_highlevel:
Extract text from a PDF using Python
************************************
The high-level API can be used to do common tasks.
The simplest way to extract text from a PDF is to use
:ref:`api_extract_text`:
.. doctest::
>>> text = extract_text('samples/simple1.pdf')
>>> print(repr(text))
'Hello \n\nWorld\n\nHello \n\nWorld\n\nH e l l o \n\nW o r l d\n\nH e l l o \n\nW o r l d\n\n\x0c'
>>> print(text)
... # doctest: +NORMALIZE_WHITESPACE
Hello
<BLANKLINE>
World
<BLANKLINE>
Hello
<BLANKLINE>
World
<BLANKLINE>
H e l l o
<BLANKLINE>
W o r l d
<BLANKLINE>
H e l l o
<BLANKLINE>
W o r l d
<BLANKLINE>
To read text from a PDF and print it on the command line:
.. doctest::
>>> if sys.version_info > (3, 0):
... from io import StringIO
... else:
... from io import BytesIO as StringIO
>>> output_string = StringIO()
>>> with open('samples/simple1.pdf', 'rb') as fin:
... extract_text_to_fp(fin, output_string)
>>> print(output_string.getvalue().strip())
Hello WorldHello WorldHello WorldHello World
Or to convert it to HTML and use layout analysis:
.. doctest::
>>> if sys.version_info > (3, 0):
... from io import StringIO
... else:
... from io import BytesIO as StringIO
>>> from pdfminer.layout import LAParams
>>> output_string = StringIO()
>>> with open('samples/simple1.pdf', 'rb') as fin:
... extract_text_to_fp(fin, output_string, laparams=LAParams(),
... output_type='html', codec=None)
|