File: pdftohtml.cc

package info (click to toggle)
poppler 26.01.0-2
links: PTS, VCS
area: main
in suites: experimental
size: 18,984 kB
sloc: cpp: 166,738; ansic: 34,768; python: 367; sh: 82; makefile: 38
file content (457 lines) | stat: -rw-r--r-- 17,883 bytes
//========================================================================
//
// pdftohtml.cc
//
//
// Copyright 1999-2000 G. Ovtcharov
//========================================================================

//========================================================================
//
// Modified under the Poppler project - http://poppler.freedesktop.org
//
// All changes made under the Poppler project to this file are licensed
// under GPL version 2 or later
//
// Copyright (C) 2007-2008, 2010, 2012, 2015-2020, 2022, 2024, 2025 Albert Astals Cid <aacid@kde.org>
// Copyright (C) 2010 Hib Eris <hib@hiberis.nl>
// Copyright (C) 2010 Mike Slegeir <tehpola@yahoo.com>
// Copyright (C) 2010, 2013 Suzuki Toshiya <mpsuzuki@hiroshima-u.ac.jp>
// Copyright (C) 2010 OSSD CDAC Mumbai by Leena Chourey (leenac@cdacmumbai.in) and Onkar Potdar (onkar@cdacmumbai.in)
// Copyright (C) 2011 Steven Murdoch <Steven.Murdoch@cl.cam.ac.uk>
// Copyright (C) 2012 Igor Slepchin <igor.redhat@gmail.com>
// Copyright (C) 2012 Ihar Filipau <thephilips@gmail.com>
// Copyright (C) 2012 Luis Parravicini <lparravi@gmail.com>
// Copyright (C) 2014 Pino Toscano <pino@kde.org>
// Copyright (C) 2015 William Bader <williambader@hotmail.com>
// Copyright (C) 2017, 2021 Adrian Johnson <ajohnson@redneon.com>
// Copyright (C) 2018 Klarälvdalens Datakonsult AB, a KDAB Group company, <info@kdab.com>. Work sponsored by the LiMux project of the city of Munich
// Copyright (C) 2018 Thibaut Brard <thibaut.brard@gmail.com>
// Copyright (C) 2018 Adam Reichold <adam.reichold@t-online.de>
// Copyright (C) 2019, 2021, 2024 Oliver Sander <oliver.sander@tu-dresden.de>
// Copyright (C) 2021 Hubert Figuiere <hub@figuiere.net>
// Copyright (C) 2024, 2025 g10 Code GmbH, Author: Sune Stolborg Vuorela <sune@vuorela.dk>
//
// To see a description of the changes please see the Changelog file that
// came with your tarball or type make ChangeLog if you are building from git
//
//========================================================================

#include "config.h"
#include <poppler-config.h>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <ctime>
#include "parseargs.h"
#include "goo/GooString.h"
#include "goo/gbase64.h"
#include "goo/gbasename.h"
#include "Object.h"
#include "Stream.h"
#include "Dict.h"
#include "XRef.h"
#include "Catalog.h"
#include "Page.h"
#include "Outline.h"
#include "PDFDoc.h"
#include "PDFDocFactory.h"
#include "HtmlOutputDev.h"
#include "SplashOutputDev.h"
#include "splash/SplashBitmap.h"
#include "GlobalParams.h"
#include "PDFDocEncoding.h"
#include "Error.h"
#include "DateInfo.h"
#include "goo/gfile.h"
#include "Win32Console.h"
#include "InMemoryFile.h"
#include "UTF.h"

// Command line option state. These variables are filled in by parseArgs()
// through the argDesc table and then adjusted by main(). The ones that are
// not declared static have external linkage - presumably so that the HTML
// output code in other translation units can read them (TODO confirm).

// First page to convert (-f); main() raises values below 1 to 1.
static int firstPage = 1;
// Last page to convert (-l); main() replaces values below 1 or beyond the
// page count with the number of pages in the document.
static int lastPage = 0;
// Not settable from the command line; main() overwrites it from
// complexMode/singleHtml before handing it to HtmlOutputDev.
static bool rawOrder = true;
// Cleared by main() when -q is given.
bool printCommands = true;
// Set by -h, -?, -help and --help.
static bool printHelp = false;
// -p: exchange .pdf links by .html
bool printHtml = false;
// -c: generate complex document
bool complexMode = false;
bool singleHtml = false; // -s: generate single document that includes all pages
// -dataurls: use data URLs instead of external images in HTML
bool dataUrls = false;
// -i: ignore images
bool ignore = false;
// -fmt: image file format; main() selects JPEG only for "jpg", PNG otherwise.
static char extension[5] = "png";
// -zoom: zoom factor; main() clamps it to the range [0.5, 3.0].
static double scale = 1.5;
// -noframes: generate no frames
bool noframes = false;
// -stdout: use standard output
bool stout = false;
// -xml: output for XML post-processing
bool xml = false;
// -noroundcoord: do not round coordinates (with XML output only)
bool noRoundedCoordinates = false;
// -q: don't print any messages or errors
static bool errQuiet = false;
// -nodrm: override document DRM settings
static bool noDrm = false;
double wordBreakThreshold = 10; // 10%, below converted into a coefficient - 0.1

// -hidden: output hidden text
bool showHidden = false;
// -nomerge: do not merge paragraphs
bool noMerge = false;
// -fontfullname: outputs font full name
bool fontFullName = false;
// -opw / -upw: passwords for encrypted files (at most 32 characters plus
// the terminating NUL).
static char ownerPassword[33] = "";
static char userPassword[33] = "";
// -v: print copyright and version info
static bool printVersion = false;

// Helpers defined after main() that read entries of the document
// information dictionary.
static std::unique_ptr<GooString> getInfoString(Dict *infoDict, const char *key);
static std::optional<std::string> getInfoDate(Dict *infoDict, const char *key);

// -enc: output text encoding name; left empty when the option is not given.
static char textEncName[128] = "";

// Command line option table handed to parseArgs() and printUsage() in main().
// Each entry holds: option name, argument kind, address of the variable that
// receives the value, buffer size (only given for argString entries, 0
// otherwise) and the help text. The table is terminated by an empty entry.
// The -dataurls option is only available when HAVE_IN_MEMORY_FILE is defined.
static const ArgDesc argDesc[] = { { "-f", argInt, &firstPage, 0, "first page to convert" },
                                   { "-l", argInt, &lastPage, 0, "last page to convert" },
                                   /*{"-raw",    argFlag,     &rawOrder,      0,
                                     "keep strings in content stream order"},*/
                                   { "-q", argFlag, &errQuiet, 0, "don't print any messages or errors" },
                                   { "-h", argFlag, &printHelp, 0, "print usage information" },
                                   { "-?", argFlag, &printHelp, 0, "print usage information" },
                                   { "-help", argFlag, &printHelp, 0, "print usage information" },
                                   { "--help", argFlag, &printHelp, 0, "print usage information" },
                                   { "-p", argFlag, &printHtml, 0, "exchange .pdf links by .html" },
                                   { "-c", argFlag, &complexMode, 0, "generate complex document" },
                                   { "-s", argFlag, &singleHtml, 0, "generate single document that includes all pages" },
#ifdef HAVE_IN_MEMORY_FILE
                                   { "-dataurls", argFlag, &dataUrls, 0, "use data URLs instead of external images in HTML" },
#endif
                                   { "-i", argFlag, &ignore, 0, "ignore images" },
                                   { "-noframes", argFlag, &noframes, 0, "generate no frames" },
                                   { "-stdout", argFlag, &stout, 0, "use standard output" },
                                   { "-zoom", argFP, &scale, 0, "zoom the pdf document (default 1.5)" },
                                   { "-xml", argFlag, &xml, 0, "output for XML post-processing" },
                                   { "-noroundcoord", argFlag, &noRoundedCoordinates, 0, "do not round coordinates (with XML output only)" },
                                   { "-hidden", argFlag, &showHidden, 0, "output hidden text" },
                                   { "-nomerge", argFlag, &noMerge, 0, "do not merge paragraphs" },
                                   { "-enc", argString, textEncName, sizeof(textEncName), "output text encoding name" },
                                   { "-fmt", argString, extension, sizeof(extension), "image file format for Splash output (png or jpg)" },
                                   { "-v", argFlag, &printVersion, 0, "print copyright and version info" },
                                   { "-opw", argString, ownerPassword, sizeof(ownerPassword), "owner password (for encrypted files)" },
                                   { "-upw", argString, userPassword, sizeof(userPassword), "user password (for encrypted files)" },
                                   { "-nodrm", argFlag, &noDrm, 0, "override document DRM settings" },
                                   { "-wbt", argFP, &wordBreakThreshold, 0, "word break threshold (default 10 percent)" },
                                   { "-fontfullname", argFlag, &fontFullName, 0, "outputs font full name" },
                                   {} };

// Splash output device whose text-related callbacks are all overridden with
// empty implementations. main() uses it to render the per-page bitmaps that
// are registered with HtmlOutputDev as background images.
class SplashOutputDevNoText : public SplashOutputDev
{
public:
    // Forwards every argument unchanged to the SplashOutputDev constructor.
    SplashOutputDevNoText(SplashColorMode colorModeA, int bitmapRowPadA, bool reverseVideoA, SplashColorPtr paperColorA, bool bitmapTopDownA = true)
        : SplashOutputDev(colorModeA, bitmapRowPadA, reverseVideoA, paperColorA, bitmapTopDownA)
    {
    }

    // Defined out of line, right after the class.
    ~SplashOutputDevNoText() override;

    // ----- Type 3 font handling: both queries answer false, the end hook is empty
    bool interpretType3Chars() override { return false; }
    bool beginType3Char(GfxState * /*state*/, double /*x*/, double /*y*/, double /*dx*/, double /*dy*/, CharCode /*code*/, const Unicode * /*u*/, int /*uLen*/) override { return false; }
    void endType3Char(GfxState * /*state*/) override { }

    // ----- text objects and individual characters: deliberately no-ops
    void beginTextObject(GfxState * /*state*/) override { }
    void endTextObject(GfxState * /*state*/) override { }
    void drawChar(GfxState * /*state*/, double /*x*/, double /*y*/, double /*dx*/, double /*dy*/, double /*originX*/, double /*originY*/, CharCode /*code*/, int /*nBytes*/, const Unicode * /*u*/, int /*uLen*/) override { }
};

SplashOutputDevNoText::~SplashOutputDevNoText() = default;

int main(int argc, char *argv[])
{
    std::unique_ptr<PDFDoc> doc;
    GooString *fileName = nullptr;
    std::unique_ptr<GooString> docTitle;
    std::unique_ptr<GooString> author;
    std::unique_ptr<GooString> keywords;
    std::unique_ptr<GooString> subject;
    std::optional<std::string> date;
    std::unique_ptr<GooString> htmlFileName;
    HtmlOutputDev *htmlOut = nullptr;
    SplashOutputDev *splashOut = nullptr;
    bool doOutline;
    bool ok;
    std::optional<GooString> ownerPW, userPW;
    Object info;
    int exit_status = EXIT_FAILURE;

    Win32Console win32Console(&argc, &argv);
    // parse args
    ok = parseArgs(argDesc, &argc, argv);
    if (!ok || argc < 2 || argc > 3 || printHelp || printVersion) {
        fprintf(stderr, "pdftohtml version %s\n", PACKAGE_VERSION);
        fprintf(stderr, "%s\n", popplerCopyright);
        fprintf(stderr, "%s\n", "Copyright 1999-2003 Gueorgui Ovtcharov and Rainer Dorsch");
        fprintf(stderr, "%s\n\n", xpdfCopyright);
        if (!printVersion) {
            printUsage("pdftohtml", "<PDF-file> [<html-file> <xml-file>]", argDesc);
        }
        exit(printHelp || printVersion ? 0 : 1);
    }

    // init error file
    // errorInit();

    // read config file
    globalParams = std::make_unique<GlobalParams>();

    if (errQuiet) {
        globalParams->setErrQuiet(errQuiet);
        printCommands = false; // I'm not 100% what is the difference between them
    }

    if (textEncName[0]) {
        globalParams->setTextEncoding(textEncName);
        if (!globalParams->getTextEncoding()) {
            goto error;
        }
    }

    // convert from user-friendly percents into a coefficient
    wordBreakThreshold /= 100.0;

    // open PDF file
    if (ownerPassword[0]) {
        ownerPW = GooString(ownerPassword);
    }
    if (userPassword[0]) {
        userPW = GooString(userPassword);
    }

    fileName = new GooString(argv[1]);

    if (fileName->cmp("-") == 0) {
        delete fileName;
        fileName = new GooString("fd://0");
    }

    doc = PDFDocFactory().createPDFDoc(*fileName, ownerPW, userPW);

    if (!doc->isOk()) {
        goto error;
    }

    // check for copy permission
    if (!doc->okToCopy()) {
        if (!noDrm) {
            error(errNotAllowed, -1, "Copying of text from this document is not allowed.");
            goto error;
        }
        fprintf(stderr, "Document has copy-protection bit set.\n");
    }

    // construct text file name
    if (argc == 3) {
        auto tmp = std::make_unique<GooString>(argv[2]);
        if (!xml) {
            if (tmp->size() >= 5) {
                const char *p = tmp->c_str() + tmp->size() - 5;
                if (!strcmp(p, ".html") || !strcmp(p, ".HTML")) {
                    htmlFileName = std::make_unique<GooString>(tmp->c_str(), tmp->size() - 5);
                }
            }
        } else {
            if (tmp->size() >= 4) {
                const char *p = tmp->c_str() + tmp->size() - 4;
                if (!strcmp(p, ".xml") || !strcmp(p, ".XML")) {
                    htmlFileName = std::make_unique<GooString>(tmp->c_str(), tmp->size() - 4);
                }
            }
        }
        if (!htmlFileName) {
            htmlFileName = std::move(tmp);
        }
    } else if (fileName->cmp("fd://0") == 0) {
        error(errCommandLine, -1, "You have to provide an output filename when reading from stdin.");
        goto error;
    } else {
        const char *p = fileName->c_str() + fileName->size() - 4;
        if (!strcmp(p, ".pdf") || !strcmp(p, ".PDF")) {
            htmlFileName = std::make_unique<GooString>(fileName->c_str(), fileName->size() - 4);
        } else {
            htmlFileName = fileName->copy();
        }
        //   htmlFileName->append(".html");
    }

    if (scale > 3.0) {
        scale = 3.0;
    }
    if (scale < 0.5) {
        scale = 0.5;
    }

    if (complexMode) {
        // noframes=false;
        stout = false;
    }

    if (stout) {
        noframes = true;
        complexMode = false;
    }

    if (xml) {
        complexMode = true;
        singleHtml = false;
        noframes = true;
        noMerge = true;
    }

    // get page range
    if (firstPage < 1) {
        firstPage = 1;
    }
    if (lastPage < 1 || lastPage > doc->getNumPages()) {
        lastPage = doc->getNumPages();
    }
    if (lastPage < firstPage) {
        error(errCommandLine, -1, "Wrong page range given: the first page ({0:d}) can not be after the last page ({1:d}).", firstPage, lastPage);
        goto error;
    }

    info = doc->getDocInfo();
    if (info.isDict()) {
        docTitle = getInfoString(info.getDict(), "Title");
        author = getInfoString(info.getDict(), "Author");
        keywords = getInfoString(info.getDict(), "Keywords");
        subject = getInfoString(info.getDict(), "Subject");
        date = getInfoDate(info.getDict(), "ModDate");
        if (!date) {
            date = getInfoDate(info.getDict(), "CreationDate");
        }
    }
    if (!docTitle) {
        docTitle = htmlFileName->copy();
    }

    if (!singleHtml) {
        rawOrder = complexMode; // todo: figure out what exactly rawOrder do :)
    } else {
        rawOrder = singleHtml;
    }

    doOutline = doc->getOutline()->getItems() != nullptr;
    // write text file
    htmlOut = new HtmlOutputDev(doc->getCatalog(), htmlFileName->c_str(), docTitle->c_str(), author ? author->c_str() : nullptr, keywords ? keywords->c_str() : nullptr, subject ? subject->c_str() : nullptr, date ? date->c_str() : nullptr,
                                rawOrder, firstPage, doOutline);

    if ((complexMode || singleHtml) && !xml && !ignore) {
        // White paper color
        SplashColor color;
        color[0] = color[1] = color[2] = 255;
        // If the user specified "jpg" use JPEG, otherwise PNG
        SplashImageFileFormat format = strcmp(extension, "jpg") ? splashFormatPng : splashFormatJpeg;

        splashOut = new SplashOutputDevNoText(splashModeRGB8, 4, false, color);
        splashOut->startDoc(doc.get());

        for (int pg = firstPage; pg <= lastPage; ++pg) {
            InMemoryFile imf;
            doc->displayPage(splashOut, pg, 72 * scale, 72 * scale, 0, true, false, false);
            SplashBitmap *bitmap = splashOut->getBitmap();

            const std::string imgFileName = GooString::format("{0:s}{1:03d}.{2:s}", htmlFileName->c_str(), pg, extension);
            auto f1 = dataUrls ? imf.open("wb") : fopen(imgFileName.c_str(), "wb");
            if (!f1) {
                fprintf(stderr, "Could not open %s\n", imgFileName.c_str());
                continue;
            }
            bitmap->writeImgFile(format, f1, 72 * scale, 72 * scale);
            fclose(f1);
            if (dataUrls) {
                htmlOut->addBackgroundImage(std::string((format == splashFormatJpeg) ? "data:image/jpeg;base64," : "data:image/png;base64,") + gbase64Encode(imf.getBuffer()));
            } else {
                htmlOut->addBackgroundImage(gbasename(imgFileName.c_str()));
            }
        }

        delete splashOut;
    }

    if (htmlOut->isOk()) {
        doc->displayPages(htmlOut, firstPage, lastPage, 72 * scale, 72 * scale, 0, true, false, false);
        htmlOut->dumpDocOutline(doc.get());
    }

    delete htmlOut;

    exit_status = EXIT_SUCCESS;

    // clean up
error:
    delete fileName;

    return exit_status;
}

/**
 * Looks up @p key in the document information dictionary and returns its
 * value HTML-escaped and converted by HtmlFont::HtmlFilter().
 *
 * The raw PDF string is decoded in one of two ways:
 *  - if it starts with a Unicode byte order mark, the bytes following the
 *    mark are read as big-endian 16-bit units, one Unicode value per unit
 *    (surrogate pairs are not combined);
 *  - otherwise every byte is mapped through the pdfDocEncoding table.
 *
 * @param infoDict document information dictionary to read from
 * @param key      name of the entry, e.g. "Title"
 * @return the filtered string, or a null pointer when the entry is missing
 *         or is not a string
 */
static std::unique_ptr<GooString> getInfoString(Dict *infoDict, const char *key)
{
    Object obj = infoDict->lookup(key);
    if (!obj.isString()) {
        return nullptr;
    }

    // Raw value as read from PDF (may be in pdfDocEncoding or UCS2)
    const GooString *rawString = obj.getString();
    // Is rawString UCS2 (as opposed to pdfDocEncoding)
    const bool isUnicode = hasUnicodeByteOrderMark(rawString->toStr());
    // The two-byte order mark itself is not part of the text.
    const int unicodeLength = isUnicode ? static_cast<int>((rawString->size() - 2) / 2) : static_cast<int>(rawString->size());

    // Value converted to unicode; owned by a smart pointer so that the buffer
    // is released on every way out of this function.
    auto unicodeString = std::make_unique<Unicode[]>(unicodeLength);

    if (isUnicode) {
        for (int i = 0; i < unicodeLength; i++) {
            // skip the byte order mark: unit i starts at byte (i + 1) * 2
            const int pos = (i + 1) * 2;
            unicodeString[i] = ((rawString->getChar(pos) & 0xff) << 8) | (rawString->getChar(pos + 1) & 0xff);
        }
    } else {
        for (int i = 0; i < unicodeLength; i++) {
            unicodeString[i] = pdfDocEncoding[rawString->getChar(i) & 0xff];
        }
    }

    // HTML escape and encode unicode
    return HtmlFont::HtmlFilter(unicodeString.get(), unicodeLength);
}

/**
 * Looks up the date entry @p key in the document information dictionary and
 * formats it as "YYYY-MM-DDTHH:MM:SS+00:00".
 *
 * The time zone parsed from the PDF date is currently ignored; the "+00:00"
 * suffix is a fixed part of the format string.
 *
 * @param infoDict document information dictionary to read from
 * @param key      name of the entry, e.g. "ModDate"
 * @return the formatted date; the unmodified raw string when it cannot be
 *         parsed or formatted; an empty optional when the entry is missing
 *         or is not a string
 */
static std::optional<std::string> getInfoDate(Dict *infoDict, const char *key)
{
    Object obj = infoDict->lookup(key);
    if (!obj.isString()) {
        return {};
    }

    const GooString *s = obj.getString();

    int year, mon, day, hour, min, sec, tz_hour, tz_minute;
    char tz;
    // TODO do something with the timezone info
    if (!parseDateString(s, &year, &mon, &day, &hour, &min, &sec, &tz, &tz_hour, &tz_minute)) {
        return s->toStr();
    }

    // Value-initialize so that members not assigned below (including any
    // platform-specific extras) never hold indeterminate values.
    struct tm tmStruct = {};
    tmStruct.tm_year = year - 1900;
    tmStruct.tm_mon = mon - 1;
    tmStruct.tm_mday = day;
    tmStruct.tm_hour = hour;
    tmStruct.tm_min = min;
    tmStruct.tm_sec = sec;
    tmStruct.tm_wday = -1;
    tmStruct.tm_yday = -1;
    tmStruct.tm_isdst = -1;
    mktime(&tmStruct); // compute the tm_wday and tm_yday fields

    char buf[256];
    if (strftime(buf, sizeof(buf), "%Y-%m-%dT%H:%M:%S+00:00", &tmStruct)) {
        return std::string(buf);
    }
    // formatting failed: fall back to the raw PDF date string
    return s->toStr();
}