summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorJohannes Schauer <josch@debian.org>2015-07-27 16:07:02 +0200
committerJohannes Schauer <josch@debian.org>2015-07-27 16:07:02 +0200
commit385b4eca34c290f112d90e74925ba1963a4e0a94 (patch)
tree5b23566049318adbdd0d26c82735fa9b4072aae5 /src
Import pdf2htmlex_0.14.6+ds.orig.tar.gz
[dgit import orig pdf2htmlex_0.14.6+ds.orig.tar.gz]
Diffstat (limited to 'src')
-rw-r--r--src/ArgParser.cc176
-rw-r--r--src/ArgParser.h219
-rw-r--r--src/BackgroundRenderer/BackgroundRenderer.cc130
-rw-r--r--src/BackgroundRenderer/BackgroundRenderer.h52
-rw-r--r--src/BackgroundRenderer/CairoBackgroundRenderer.cc311
-rw-r--r--src/BackgroundRenderer/CairoBackgroundRenderer.h75
-rw-r--r--src/BackgroundRenderer/SplashBackgroundRenderer.cc261
-rw-r--r--src/BackgroundRenderer/SplashBackgroundRenderer.h65
-rw-r--r--src/Base64Stream.cc42
-rw-r--r--src/Base64Stream.h34
-rw-r--r--src/Color.cc51
-rw-r--r--src/Color.h38
-rw-r--r--src/CoveredTextDetector.cc51
-rw-r--r--src/CoveredTextDetector.h61
-rw-r--r--src/DrawingTracer.cc400
-rw-r--r--src/DrawingTracer.h79
-rw-r--r--src/HTMLRenderer/HTMLRenderer.h348
-rw-r--r--src/HTMLRenderer/draw.cc65
-rw-r--r--src/HTMLRenderer/font.cc1089
-rw-r--r--src/HTMLRenderer/form.cc76
-rw-r--r--src/HTMLRenderer/general.cc592
-rw-r--r--src/HTMLRenderer/image.cc83
-rw-r--r--src/HTMLRenderer/link.cc309
-rw-r--r--src/HTMLRenderer/outline.cc74
-rw-r--r--src/HTMLRenderer/state.cc541
-rw-r--r--src/HTMLRenderer/text.cc166
-rw-r--r--src/HTMLState.h82
-rw-r--r--src/HTMLTextLine.cc734
-rw-r--r--src/HTMLTextLine.h134
-rw-r--r--src/HTMLTextPage.cc147
-rw-r--r--src/HTMLTextPage.h66
-rw-r--r--src/Param.h87
-rw-r--r--src/Preprocessor.cc107
-rw-r--r--src/Preprocessor.h66
-rw-r--r--src/StateManager.h430
-rw-r--r--src/StringFormatter.cc30
-rw-r--r--src/StringFormatter.h43
-rw-r--r--src/TmpFiles.cc77
-rw-r--r--src/TmpFiles.h28
-rw-r--r--src/css_class_names.cmakelists.txt39
-rw-r--r--src/pdf2htmlEX-config.h.in24
-rw-r--r--src/pdf2htmlEX.cc445
-rw-r--r--src/util/const.cc53
-rw-r--r--src/util/const.h46
-rw-r--r--src/util/css_const.h.in67
-rw-r--r--src/util/encoding.cc182
-rw-r--r--src/util/encoding.h41
-rw-r--r--src/util/ffw.c485
-rw-r--r--src/util/ffw.h74
-rw-r--r--src/util/math.cc90
-rw-r--r--src/util/math.h59
-rw-r--r--src/util/mingw.cc64
-rw-r--r--src/util/mingw.h29
-rw-r--r--src/util/misc.cc66
-rw-r--r--src/util/misc.h39
-rw-r--r--src/util/namespace.h21
-rw-r--r--src/util/path.cc141
-rw-r--r--src/util/path.h33
-rw-r--r--src/util/unicode.cc70
-rw-r--r--src/util/unicode.h84
60 files changed, 9571 insertions, 0 deletions
diff --git a/src/ArgParser.cc b/src/ArgParser.cc
new file mode 100644
index 0000000..19dcf32
--- /dev/null
+++ b/src/ArgParser.cc
@@ -0,0 +1,176 @@
+/*
+ * A wrapper of getopt
+ *
+ * by WangLu
+ * 2012.09.10
+ */
+
+#include <iostream>
+#include <unordered_map>
+#include <cassert>
+
+#include <getopt.h>
+
+#include "ArgParser.h"
+
+namespace pdf2htmlEX {
+
+using std::ostream;
+using std::cerr;
+using std::endl;
+using std::string;
+using std::vector;
+using std::unordered_map;
+using std::make_pair;
+using std::ostringstream;
+
+/*
+ * Read a single-character argument.
+ *
+ * Stores the first byte of arg in *location and returns true only when arg
+ * is exactly one character long. An empty string is rejected without
+ * looking at the byte behind its terminator.
+ */
+bool read_value(const char * arg, char * location)
+{
+    *location = arg[0];
+    // Test arg[0] first: for "" the byte at arg[1] lies past the end of the string.
+    return (arg[0] != 0) && (arg[1] == 0);
+}
+
+// A string argument is taken verbatim, so reading it always succeeds.
+bool read_value(const char * arg, std::string * location)
+{
+    location->assign(arg);
+    return true;
+}
+
+// Strings are printed wrapped in double quotes.
+void dump_value(std::ostream & out, const std::string & v)
+{
+    const char quote = '"';
+    out << quote;
+    out << v;
+    out << quote;
+}
+
+/*
+ * Register an argument that is handled by a callback.
+ *
+ * optname:     "<long name>[,<short char>]"; nullptr or "" registers a
+ *              positional argument instead, and description is dropped.
+ * description: help text; nullptr is treated as "".
+ * callback:    stored in the entry, which calls it with the argument text
+ *              when the entry is parsed.
+ * need_arg:    whether the option takes an argument.
+ *
+ * Returns *this so that calls can be chained.
+ */
+ArgParser & ArgParser::add(const char * optname, const char * description, ArgParserCallBack callback, bool need_arg)
+{
+    // Each entry is wrapped in a unique_ptr before it reaches the vector:
+    // emplace_back(raw pointer) would leak the entry if growing the vector throws.
+    // ArgEntry does not accept nullptr as optname nor description
+    if((!optname) || (!optname[0]))
+    {
+        // when optname is nullptr or "", it's optional, and description is dropped
+        optional_arg_entries.push_back(std::unique_ptr<ArgEntryBase>(new ArgEntry<string, string>("", "", callback, need_arg)));
+    }
+    else
+    {
+        arg_entries.push_back(std::unique_ptr<ArgEntryBase>(new ArgEntry<string, string>(optname, (description ? description : ""), callback, need_arg)));
+    }
+
+    return *this;
+}
+
+/*
+ * Parse the command line with getopt_long.
+ *
+ * Every registered entry contributes its short option (if any) to optstring
+ * and its long option (if any) to longopts; opt_map maps the value returned
+ * by getopt_long back to the entry. Arguments left over after option parsing
+ * are handed, in order, to the positional entries.
+ *
+ * Throws "" (a const char *) when getopt_long returns '?'; the parse() of an
+ * entry may throw as well.
+ */
+void ArgParser::parse(int argc, char ** argv) const
+{
+    //prepare optstring and longopts
+    vector<char> optstring;
+    optstring.reserve(2*arg_entries.size() + 1);
+    vector<struct option> longopts;
+    longopts.reserve(arg_entries.size() + 1);
+
+    // value returned by getopt_long -> entry handling it
+    unordered_map<int, const ArgEntryBase*> opt_map;
+
+    for(auto iter = arg_entries.begin(); iter != arg_entries.end(); ++iter)
+    {
+        const auto * p = iter->get();
+        if(p->shortname != 0)
+        {
+            optstring.push_back(p->shortname);
+            if(p->need_arg)
+                optstring.push_back(':');
+
+            int v = p->shortname;
+            if(!(opt_map.insert(make_pair(v, p)).second))
+            {
+                // print the option character itself rather than its integer code
+                cerr << "Warning: duplicated shortname: " << p->shortname << endl;
+            }
+        }
+
+        if(p->name != "")
+        {
+            // long options get values starting at 256, offset by the entry's index
+            int v = (256 + (iter - arg_entries.begin()));
+            longopts.resize(longopts.size() + 1);
+            {
+                auto & cur = longopts.back();
+                cur.name = p->name.c_str();
+                cur.has_arg = ((p->need_arg) ? required_argument : no_argument);
+                cur.flag = nullptr;
+                cur.val = v;
+            }
+            if(!(opt_map.insert(make_pair(v, p)).second))
+            {
+                cerr << "Warning: duplicated long name: " << (p->name) << endl;
+            }
+        }
+    }
+
+    // terminate both tables: a NUL for optstring, an all-zero record for longopts
+    optstring.push_back(0);
+    longopts.resize(longopts.size() + 1);
+    {
+        auto & cur = longopts.back();
+        cur.name = 0;
+        cur.has_arg = 0;
+        cur.flag = 0;
+        cur.val = 0;
+    }
+
+    {
+        opterr = 1;
+        int r;
+        int idx;
+        while(true)
+        {
+            r = getopt_long(argc, argv, &optstring.front(), &longopts.front(), &idx);
+            if(r == -1)
+                break;
+            assert(r != ':');
+            if(r == '?')
+            {
+                throw "";
+            }
+
+            auto iter = opt_map.find(r);
+            assert(iter != opt_map.end());
+            iter->second->parse(optarg);
+        }
+    }
+
+    {
+        // remaining arguments go to the positional entries, in registration order
+        auto iter = optional_arg_entries.begin();
+        while((optind < argc) && (iter != optional_arg_entries.end()))
+        {
+            (*(iter++))->parse(argv[optind++]);
+        }
+    }
+}
+
+// Print the usage line of every named option; positional entries are not listed.
+void ArgParser::show_usage(ostream & out) const
+{
+    for(auto iter = arg_entries.begin(); iter != arg_entries.end(); ++iter)
+        (*iter)->show_usage(out);
+}
+
+// Type names printed inside "<...>" by ArgEntry::show_usage(); any other type uses the generic "unknown".
+template<> const char * ArgParser::get_type_name<int> (void) { return "int"; }
+template<> const char * ArgParser::get_type_name<double> (void) { return "fp"; }
+template<> const char * ArgParser::get_type_name<string> (void) { return "string"; }
+
+/*
+ * Split "<long name>[,<short char>]" into name and shortname.
+ * If there is a comma that is not followed by exactly one character, a
+ * warning is printed and name is left as given.
+ */
+ArgParser::ArgEntryBase::ArgEntryBase(const char * name, const char * description, bool need_arg)
+    : shortname(0), name(name), description(description), need_arg(need_arg)
+{
+    const size_t comma = this->name.rfind(',');
+    if(comma == string::npos)
+        return;
+
+    const bool one_char_after_comma = (comma + 2 == this->name.size());
+    if(!one_char_after_comma)
+    {
+        cerr << "Warning: argument '" << this->name << "' cannot be parsed as a short option" << endl;
+        return;
+    }
+
+    shortname = this->name[comma + 1];
+    this->name.erase(comma);
+}
+
+const int ArgParser::arg_col_width = 31;
+
+} // namespace pdf2htmlEX
diff --git a/src/ArgParser.h b/src/ArgParser.h
new file mode 100644
index 0000000..c0f8cde
--- /dev/null
+++ b/src/ArgParser.h
@@ -0,0 +1,219 @@
+/*
+ * A wrapper of getopt
+ *
+ * by WangLu
+ * 2012.09.10
+ */
+
+
+#ifndef ARGPARSER_H__
+#define ARGPARSER_H__
+
+#include <string>
+#include <vector>
+#include <ostream>
+#include <sstream>
+#include <memory>
+
+// nullptr is a keyword since C++11, so "#ifndef nullptr" alone cannot detect it
+// and would redefine the keyword. Only provide the fallback for pre-C++11 compilers.
+#if !defined(nullptr) && (__cplusplus < 201103L)
+#define nullptr (NULL)
+#endif
+
+namespace pdf2htmlEX {
+
+//helper
+// Generic reader: extract a T from arg with an istringstream.
+// Succeeds only if the extraction works and consumes the whole string.
+template<class T>
+bool read_value(const char * arg, T * location)
+{
+    std::istringstream input(arg);
+    if(!(input >> (*location)))
+        return false;
+    return input.eof();
+}
+
+extern bool read_value(const char * arg, char * location);
+extern bool read_value(const char * arg, std::string * location);
+
+// Generic printer: write v using its stream insertion operator.
+template<class T>
+void dump_value(std::ostream & out, const T & v)
+{
+    const T & value = v;
+    out << value;
+}
+
+extern void dump_value(std::ostream & out, const std::string & v);
+
+// Command line parser built on getopt_long: options are registered with add(),
+// then parse() dispatches the command line to the registered entries.
+class ArgParser
+{
+public:
+ // Callback type for callback-based entries; receives the argument text passed to the entry's parse().
+ typedef void (*ArgParserCallBack) (const char * arg);
+
+ /*
+ * The 1st overload is for arguments with callbacks (i.e. flags)
+ * The 2nd overload is for arguments linked to variables
+ *
+ * optname:
+ * - if not nullptr, it should be the name of the arg, in the format of "<long name>[,<short char>]", e.g. "help,h"
+ * - if nullptr or "", it denotes an optional (positional) arg, and description will be ignored
+ * description:
+ * - if description is nullptr or "", the argument won't be shown in show_usage()
+ *
+ * location:
+ * - if not nullptr, the argument for this arg is stored there
+ * - if nullptr, this arg does not need arguments
+ *
+ * Both overloads return *this, so calls can be chained.
+ */
+ ArgParser & add(const char * optname, const char * description, ArgParserCallBack callback, bool need_arg = false);
+ template <class T, class Tv>
+ ArgParser & add(const char * optname, T * location, const Tv & default_value, const char * description, bool dont_show_default = false);
+
+ // Parse argc/argv and dispatch to the registered entries.
+ void parse(int argc, char ** argv) const;
+ // Print one usage line per named entry that has a description.
+ void show_usage(std::ostream & out) const;
+
+private:
+ // type names helper
+ template<class>
+ static const char * get_type_name(void) { return "unknown"; }
+
+ // Type-independent part of an entry.
+ struct ArgEntryBase
+ {
+ /* name or description cannot be nullptr */
+ ArgEntryBase(const char * name, const char * description, bool need_arg);
+ virtual ~ArgEntryBase() { }
+ // short option character, 0 if there is none
+ char shortname;
+ // long option name, without the ",<short char>" suffix
+ std::string name;
+ std::string description;
+ // whether the option takes an argument
+ bool need_arg;
+ virtual void parse (const char * arg) const = 0;
+ virtual void show_usage (std::ostream & out) const = 0;
+ };
+
+ // Entry storing a value of type T; Tv is the type of the default value it is constructed from.
+ template <class T, class Tv>
+ struct ArgEntry : public ArgEntryBase
+ {
+ // callback-based entry
+ ArgEntry(const char * name,
+ const char * description,
+ ArgParserCallBack callback,
+ bool need_arg);
+
+ // variable-based entry
+ ArgEntry(const char * name,
+ T * location, const Tv & default_value,
+ const char * description, bool dont_show_default);
+
+ virtual void parse (const char * arg) const;
+ virtual void show_usage (std::ostream & out) const;
+
+ private:
+ // where the parsed value is written; nullptr for callback-based entries
+ T * location;
+ T default_value;
+ // nullptr for variable-based entries
+ ArgParserCallBack callback;
+ bool dont_show_default;
+ };
+
+ // named options, and positional arguments (those added without an optname)
+ std::vector<std::unique_ptr<ArgEntryBase>> arg_entries, optional_arg_entries;
+ // width of the option column printed by show_usage()
+ static const int arg_col_width;
+};
+
+/*
+ * Register an argument whose value is stored in *location.
+ *
+ * optname:           "<long name>[,<short char>]"; nullptr or "" registers a
+ *                    positional argument instead, and description is dropped.
+ * location:          where the parsed value is written; nullptr means the
+ *                    option takes no argument.
+ * default_value:     written to *location on registration (when location is
+ *                    not nullptr) and shown by show_usage().
+ * description:       help text; nullptr is treated as "".
+ * dont_show_default: hide the default value in show_usage().
+ *
+ * Returns *this so that calls can be chained.
+ */
+template<class T, class Tv>
+ArgParser & ArgParser::add(const char * optname, T * location, const Tv & default_value, const char * description, bool dont_show_default)
+{
+    // Each entry is wrapped in a unique_ptr before it reaches the vector:
+    // emplace_back(raw pointer) would leak the entry if growing the vector throws.
+    // ArgEntry does not accept nullptr as optname nor description
+    if((!optname) || (!optname[0]))
+    {
+        // when optname is nullptr or "", it's optional, and description is dropped
+        optional_arg_entries.push_back(std::unique_ptr<ArgEntryBase>(new ArgEntry<T, Tv>("", location, default_value, "", dont_show_default)));
+    }
+    else
+    {
+        arg_entries.push_back(std::unique_ptr<ArgEntryBase>(new ArgEntry<T, Tv>(optname, location, default_value, (description ? description : ""), dont_show_default)));
+    }
+
+    return *this;
+}
+
+// Known types
+template<> const char * ArgParser::get_type_name<int> (void);
+template<> const char * ArgParser::get_type_name<double> (void);
+template<> const char * ArgParser::get_type_name<std::string> (void);
+
+// Callback-based entry: there is no storage location, the default value is
+// value-initialized and never shown in the usage text.
+template<class T, class Tv>
+ArgParser::ArgEntry<T, Tv>::ArgEntry(const char * name,
+                                     const char * description,
+                                     ArgParserCallBack callback,
+                                     bool need_arg)
+    : ArgEntryBase(name, description, need_arg),
+      location(nullptr),
+      default_value(),
+      callback(callback),
+      dont_show_default(true)
+{ }
+
+// Variable-based entry: the option needs an argument exactly when a storage
+// location is given, and that location is initialized with the default value.
+template<class T, class Tv>
+ArgParser::ArgEntry<T, Tv>::ArgEntry(const char * name,
+                                     T * location, const Tv & default_value,
+                                     const char * description, bool dont_show_default)
+    : ArgEntryBase(name, description, (location != nullptr)),
+      location(location),
+      default_value(default_value),
+      callback(nullptr),
+      dont_show_default(dont_show_default)
+{
+    if(!need_arg)
+        return;
+
+    *location = T(default_value);
+}
+
+/*
+ * Handle one occurrence of this entry on the command line.
+ * When the entry needs an argument, arg must be present and, if there is a
+ * storage location, readable as a T; otherwise a std::string is thrown.
+ * Afterwards the callback, if any, is called with arg.
+ */
+template<class T, class Tv>
+void ArgParser::ArgEntry<T, Tv>::parse(const char * arg) const
+{
+    if(need_arg)
+    {
+        if(arg == nullptr)
+            throw std::string("Missing argument of option: --") + name;
+
+        const bool accepted = (location == nullptr) || read_value(arg, location);
+        if(!accepted)
+            throw std::string("Invalid argument: ") + arg;
+    }
+
+    if(callback != nullptr)
+        (*callback)(arg);
+}
+
+/*
+ * Print the usage line of this entry: the option column (short name, long
+ * name, argument type) padded to arg_col_width, then the description and,
+ * unless suppressed, the default value.
+ * Entries with an empty description print nothing.
+ */
+template<class T, class Tv>
+void ArgParser::ArgEntry<T, Tv>::show_usage(std::ostream & out) const
+{
+    if(description.empty())
+        return;
+
+    const bool has_short = (shortname != 0);
+    const bool has_long = (name != "");
+
+    // build the option column separately so that its length is known for padding
+    std::ostringstream column;
+    column << " ";
+    if(has_short)
+        column << "-" << shortname;
+    if(has_long)
+    {
+        if(has_short)
+            column << ",";
+        column << "--" << name;
+    }
+    if(need_arg)
+        column << " <" << get_type_name<T>() << ">";
+
+    const std::string left = column.str();
+    out << left;
+
+    int printed = left.size();
+    while(printed < arg_col_width)
+    {
+        out << ' ';
+        ++printed;
+    }
+
+    out << " " << description;
+
+    const bool show_default = need_arg && !dont_show_default;
+    if(show_default)
+    {
+        out << " (default: ";
+        dump_value(out, default_value);
+        out << ")";
+    }
+
+    out << std::endl;
+}
+
+} // namespace pdf2htmlEX
+
+#endif //ARGPARSER_H__
diff --git a/src/BackgroundRenderer/BackgroundRenderer.cc b/src/BackgroundRenderer/BackgroundRenderer.cc
new file mode 100644
index 0000000..dbd7137
--- /dev/null
+++ b/src/BackgroundRenderer/BackgroundRenderer.cc
@@ -0,0 +1,130 @@
+/*
+ * Background renderer
+ * Render all those things not supported as Image
+ *
+ * Copyright (C) 2013 Lu Wang <coolwanglu@gmail.com>
+ */
+
+#include <poppler-config.h>
+
+#include "HTMLRenderer/HTMLRenderer.h"
+#include "Param.h"
+
+#include "BackgroundRenderer.h"
+#include "SplashBackgroundRenderer.h"
+#if ENABLE_SVG
+#include "CairoBackgroundRenderer.h"
+#endif
+
+namespace pdf2htmlEX {
+
+// Create the renderer for the given background format.
+// Returns nullptr when the format is unknown or its support is not compiled in.
+std::unique_ptr<BackgroundRenderer> BackgroundRenderer::getBackgroundRenderer(const std::string & format, HTMLRenderer * html_renderer, const Param & param)
+{
+    std::unique_ptr<BackgroundRenderer> renderer;
+
+#ifdef ENABLE_LIBPNG
+    if(!renderer && (format == "png"))
+        renderer.reset(new SplashBackgroundRenderer(format, html_renderer, param));
+#endif
+#ifdef ENABLE_LIBJPEG
+    if(!renderer && (format == "jpg"))
+        renderer.reset(new SplashBackgroundRenderer(format, html_renderer, param));
+#endif
+#if ENABLE_SVG
+    if(!renderer && (format == "svg"))
+        renderer.reset(new CairoBackgroundRenderer(html_renderer, param));
+#endif
+
+    return renderer;
+}
+
+// A bitmap fallback is only created for svg backgrounds with a node count limit set;
+// it is constructed with an empty format string.
+std::unique_ptr<BackgroundRenderer> BackgroundRenderer::getFallbackBackgroundRenderer(HTMLRenderer * html_renderer, const Param & param)
+{
+    const bool svg_with_limit = (param.bg_format == "svg") && (param.svg_node_count_limit >= 0);
+    if (!svg_with_limit)
+        return nullptr;
+
+    return std::unique_ptr<BackgroundRenderer>(new SplashBackgroundRenderer("", html_renderer, param));
+}
+
+// Lazily create proof_state (page-sized, RGB fill and stroke color spaces) and
+// remember the current render mode in it.
+void BackgroundRenderer::proof_begin_text_object(GfxState *state, OutputDev * dev)
+{
+    if (!proof_state)
+    {
+        PDFRectangle page_box(0, 0, state->getPageWidth(), state->getPageHeight());
+        GfxState * fresh = new GfxState(state->getHDPI(), state->getVDPI(), &page_box, state->getRotate(), dev->upsideDown());
+        proof_state.reset(fresh);
+        proof_state->setFillColorSpace(new GfxDeviceRGBColorSpace());
+        proof_state->setStrokeColorSpace(new GfxDeviceRGBColorSpace());
+    }
+
+    // Save original render mode in proof_state, and restore in proof_end_text_object()
+    // This is due to poppler's OutputDev::updateRender() actually has no effect, we have to
+    // modify state directly, see proof_begin_string().
+    const int original_render = state->getRender();
+    proof_state->setRender(original_render);
+}
+
+/*
+ * Prepare dev for drawing a string in proof mode: pick fill and stroke colors
+ * from the render mode saved in proof_state, push them (and a line width
+ * derived from the font size) to dev, then force the state to fill & stroke.
+ * Hidden text (render mode 3) is left alone.
+ */
+void BackgroundRenderer::proof_begin_string(GfxState *state, OutputDev * dev)
+{
+    const int render = proof_state->getRender();
+    if (render == 3) // hidden
+        return;
+
+    double lx = state->getFontSize() / 70, ly = lx;
+    tm_transform(state->getTextMat(), lx, ly, true);
+    proof_state->setLineWidth(sqrt(lx * lx + ly * ly));
+
+    static const Color red(1, 0, 0), green(0, 1, 0), blue(0, 0, 1), yellow(1, 1, 0), white(1, 1, 1);
+    Color fc, sc;
+    state->getFillRGB(&fc.rgb);
+    state->getStrokeRGB(&sc.rgb);
+
+    const bool has_fill = (render == 0) || (render == 2);
+    const bool has_stroke = (render == 1) || (render == 2);
+
+    // fill: red, or green when the original fill is already close to red
+    const Color * pfc = &red;
+    if (has_fill && !(fc.distance(red) > 0.4))
+        pfc = &green;
+
+    // stroke: blue, or yellow when the original stroke is already close to blue;
+    // white for fill-only text
+    const Color * psc = &blue;
+    if (has_stroke)
+    {
+        if (!(sc.distance(blue) > 0.4))
+            psc = &yellow;
+    }
+    else if (render == 0)
+    {
+        psc = &white;
+    }
+
+    GfxColor gfc, gsc;
+    pfc->get_gfx_color(gfc);
+    psc->get_gfx_color(gsc);
+    proof_state->setFillColor(&gfc);
+    proof_state->setStrokeColor(&gsc);
+
+    if (state->getFillColorSpace()->getMode() != csDeviceRGB)
+        dev->updateFillColorSpace(proof_state.get());
+    if (state->getStrokeColorSpace()->getMode() != csDeviceRGB)
+        dev->updateStrokeColorSpace(proof_state.get());
+
+    dev->updateLineWidth(proof_state.get());
+    dev->updateFillColor(proof_state.get());
+    dev->updateStrokeColor(proof_state.get());
+
+    state->setRender(2); // fill & stroke
+}
+
+// Undo proof_begin_string(): restore the saved render mode and re-send the
+// real line width, color spaces and colors of state to dev.
+void BackgroundRenderer::proof_end_text_object(GfxState *state, OutputDev * dev)
+{
+    const int saved_render = proof_state->getRender();
+    state->setRender(saved_render);
+
+    dev->updateLineWidth(state);
+    dev->updateFillColorSpace(state);
+    dev->updateStrokeColorSpace(state);
+    dev->updateFillColor(state);
+    dev->updateStrokeColor(state);
+}
+
+void BackgroundRenderer::proof_update_render(GfxState *state, OutputDev * dev)
+{
+    // The render mode may change inside a text object; keep the copy in proof_state current.
+    const int current_render = state->getRender();
+    proof_state->setRender(current_render);
+}
+
+} // namespace pdf2htmlEX
diff --git a/src/BackgroundRenderer/BackgroundRenderer.h b/src/BackgroundRenderer/BackgroundRenderer.h
new file mode 100644
index 0000000..2927484
--- /dev/null
+++ b/src/BackgroundRenderer/BackgroundRenderer.h
@@ -0,0 +1,52 @@
+/*
+ * Background renderer
+ * Render all those things not supported as Image
+ *
+ * Copyright (C) 2012,2013 Lu Wang <coolwanglu@gmail.com>
+ */
+
+
+#ifndef BACKGROUND_RENDERER_H__
+#define BACKGROUND_RENDERER_H__
+
+#include <string>
+#include <memory>
+
+class PDFDoc;
+class GfxState;
+class OutputDev;
+
+namespace pdf2htmlEX {
+
+class Param;
+class HTMLRenderer;
+// Abstract interface of the background renderers, plus shared helpers for proof output.
+class BackgroundRenderer
+{
+public:
+ // Create the renderer for the given background format; returns nullptr upon failure
+ static std::unique_ptr<BackgroundRenderer> getBackgroundRenderer(const std::string & format, HTMLRenderer * html_renderer, const Param & param);
+ // Return a fallback bg renderer according to param.bg_format, or nullptr if none applies.
+ // Currently only svg bg format might need a bitmap fallback.
+ static std::unique_ptr<BackgroundRenderer> getFallbackBackgroundRenderer(HTMLRenderer * html_renderer, const Param & param);
+
+ BackgroundRenderer() {}
+ virtual ~BackgroundRenderer() {}
+
+ virtual void init(PDFDoc * doc) = 0;
+ //return true on success, false otherwise (e.g. need a fallback)
+ virtual bool render_page(PDFDoc * doc, int pageno) = 0;
+ virtual void embed_image(int pageno) = 0;
+
+ // for proof output
+protected:
+ // saves the current render mode in proof_state, creating proof_state on first use
+ void proof_begin_text_object(GfxState * state, OutputDev * dev);
+ // pushes proof colors and line width to dev and forces fill & stroke rendering
+ void proof_begin_string(GfxState * state, OutputDev * dev);
+ // restores the saved render mode and the real colors of state
+ void proof_end_text_object(GfxState * state, OutputDev * dev);
+ // records a render mode change made inside a text object
+ void proof_update_render(GfxState * state, OutputDev * dev);
+private:
+ // created on first use by proof_begin_text_object(); holds the saved render mode and the proof colors
+ std::unique_ptr<GfxState> proof_state;
+};
+
+} // namespace pdf2htmlEX
+
+#endif //BACKGROUND_RENDERER_H__
diff --git a/src/BackgroundRenderer/CairoBackgroundRenderer.cc b/src/BackgroundRenderer/CairoBackgroundRenderer.cc
new file mode 100644
index 0000000..1ce6eac
--- /dev/null
+++ b/src/BackgroundRenderer/CairoBackgroundRenderer.cc
@@ -0,0 +1,311 @@
+/*
+ * CairoBackgroundRenderer.cc
+ *
+ * Copyright (C) 2012,2013 Lu Wang <coolwanglu@gmail.com>
+ */
+
+#include <string>
+#include <fstream>
+
+
+#include "pdf2htmlEX-config.h"
+
+#include "Base64Stream.h"
+
+#if ENABLE_SVG
+
+#include "CairoBackgroundRenderer.h"
+#include "SplashBackgroundRenderer.h"
+
+namespace pdf2htmlEX {
+
+using std::string;
+using std::ifstream;
+using std::ofstream;
+using std::vector;
+using std::unordered_map;
+
+// No cairo surface exists until render_page() creates one.
+CairoBackgroundRenderer::CairoBackgroundRenderer(HTMLRenderer * html_renderer, const Param & param)
+    : CairoOutputDev()
+    , html_renderer(html_renderer)
+    , param(param)
+    , surface(nullptr)
+    , drawn_char_count(0) // render_page() resets it per page; initialize it here too so it is never read uninitialized
+{ }
+
+// Bitmaps whose ref count is still 0 are not used by any kept svg;
+// register their files with tmp_files.
+CairoBackgroundRenderer::~CairoBackgroundRenderer()
+{
+    for(auto iter = bitmaps_ref_count.begin(); iter != bitmaps_ref_count.end(); ++iter)
+    {
+        if (iter->second != 0)
+            continue;
+        html_renderer->tmp_files.add(this->build_bitmap_path(iter->first));
+    }
+}
+
+void CairoBackgroundRenderer::drawChar(GfxState *state, double x, double y,
+        double dx, double dy,
+        double originX, double originY,
+        CharCode code, int nBytes, Unicode *u, int uLen)
+{
+    // draw characters as image when
+    // - in fallback mode
+    // - OR there is special filling method
+    // - OR using a writing mode font
+    // - OR using a Type 3 font while param.process_type3 is not enabled
+    // - OR the text is used as path
+    bool as_image = (param.fallback || param.proof);
+    if(!as_image && (state->getFont()))
+    {
+        as_image = (state->getFont()->getWMode())
+            || ((state->getFont()->getType() == fontType3) && (!param.process_type3))
+            || (state->getRender() >= 4);
+    }
+
+    if(as_image)
+    {
+        CairoOutputDev::drawChar(state,x,y,dx,dy,originX,originY,code,nBytes,u,uLen);
+        // If a char is treated as image, it is not subject to cover test
+        // (see HTMLRenderer::drawString), so don't increase drawn_char_count.
+        return;
+    }
+
+    if(!param.correct_text_visibility)
+        return;
+
+    if (html_renderer->is_char_covered(drawn_char_count))
+        CairoOutputDev::drawChar(state,x,y,dx,dy,originX,originY,code,nBytes,u,uLen);
+    drawn_char_count++;
+}
+
+// With param.proof == 2, run the proof hook before forwarding to CairoOutputDev.
+void CairoBackgroundRenderer::beginTextObject(GfxState *state)
+{
+    const bool proofing = (param.proof == 2);
+    if (proofing)
+    {
+        proof_begin_text_object(state, this);
+    }
+    CairoOutputDev::beginTextObject(state);
+}
+
+// With param.proof == 2, run the proof hook before forwarding to CairoOutputDev.
+void CairoBackgroundRenderer::beginString(GfxState *state, GooString * str)
+{
+    const bool proofing = (param.proof == 2);
+    if (proofing)
+    {
+        proof_begin_string(state, this);
+    }
+    CairoOutputDev::beginString(state, str);
+}
+
+// With param.proof == 2, run the proof hook before forwarding to CairoOutputDev.
+void CairoBackgroundRenderer::endTextObject(GfxState *state)
+{
+    const bool proofing = (param.proof == 2);
+    if (proofing)
+    {
+        proof_end_text_object(state, this);
+    }
+    CairoOutputDev::endTextObject(state);
+}
+
+// With param.proof == 2, run the proof hook before forwarding to CairoOutputDev.
+void CairoBackgroundRenderer::updateRender(GfxState *state)
+{
+    const bool proofing = (param.proof == 2);
+    if (proofing)
+    {
+        proof_update_render(state, this);
+    }
+    CairoOutputDev::updateRender(state);
+}
+
+// Forward the document to startDoc().
+void CairoBackgroundRenderer::init(PDFDoc * doc)
+{
+    this->startDoc(doc);
+}
+
+// Annotation callback passed to displayPage(): pflag points to a bool, and
+// that bool is returned (as GBool) for every annotation.
+static GBool annot_cb(Annot *, void * pflag) {
+    const bool enabled = *((bool*)pflag);
+    if (enabled)
+        return gTrue;
+    return gFalse;
+}
+
+/*
+ * Render page pageno of doc into "<dir>/bg<pageno in hex>.svg", where <dir> is
+ * param.tmp_dir when images are embedded and param.dest_dir otherwise.
+ *
+ * Returns true when the svg is to be used. Returns false when the svg has more
+ * '<' characters than param.svg_node_count_limit allows; the svg file is then
+ * registered with tmp_files.
+ * Throws std::string when cairo reports an error.
+ */
+bool CairoBackgroundRenderer::render_page(PDFDoc * doc, int pageno)
+{
+    drawn_char_count = 0;
+    double page_width;
+    double page_height;
+    if(param.use_cropbox)
+    {
+        page_width = doc->getPageCropWidth(pageno);
+        page_height = doc->getPageCropHeight(pageno);
+    }
+    else
+    {
+        page_width = doc->getPageMediaWidth(pageno);
+        page_height = doc->getPageMediaHeight(pageno);
+    }
+
+    if (doc->getPageRotate(pageno) == 90 || doc->getPageRotate(pageno) == 270)
+        std::swap(page_height, page_width);
+
+    string fn = (char*)html_renderer->str_fmt("%s/bg%x.svg", (param.embed_image ? param.tmp_dir : param.dest_dir).c_str(), pageno);
+    if(param.embed_image)
+        html_renderer->tmp_files.add(fn);
+
+    surface = cairo_svg_surface_create(fn.c_str(), page_width * param.h_dpi / DEFAULT_DPI, page_height * param.v_dpi / DEFAULT_DPI);
+    cairo_svg_surface_restrict_to_version(surface, CAIRO_SVG_VERSION_1_2);
+    cairo_surface_set_fallback_resolution(surface, param.h_dpi, param.v_dpi);
+
+    cairo_t * cr = cairo_create(surface);
+    setCairo(cr);
+
+    bitmaps_in_current_page.clear();
+
+    bool process_annotation = param.process_annotation;
+    doc->displayPage(this, pageno, param.h_dpi, param.v_dpi,
+            0,
+            (!(param.use_cropbox)),
+            false,
+            false,
+            nullptr, nullptr, &annot_cb, &process_annotation);
+
+    setCairo(nullptr);
+
+    {
+        auto status = cairo_status(cr);
+        cairo_destroy(cr);
+        if(status)
+        {
+            // release the surface as well, otherwise it is leaked on this error path
+            cairo_surface_destroy(surface);
+            surface = nullptr;
+            throw string("Cairo error: ") + cairo_status_to_string(status);
+        }
+    }
+
+    cairo_surface_finish(surface);
+    {
+        auto status = cairo_surface_status(surface);
+        cairo_surface_destroy(surface);
+        surface = nullptr;
+        if(status)
+            throw string("Error in cairo: ") + cairo_status_to_string(status);
+    }
+
+    //check node count in the svg file, fall back to bitmap_renderer if necessary.
+    if (param.svg_node_count_limit >= 0)
+    {
+        int n = 0;
+        char c;
+        ifstream svgfile(fn);
+        //count of '<' in the file should be an approximation of node count.
+        while(svgfile >> c)
+        {
+            if (c == '<')
+                ++n;
+            if (n > param.svg_node_count_limit)
+            {
+                html_renderer->tmp_files.add(fn);
+                return false;
+            }
+        }
+    }
+
+    // the svg file is actually used, so add its bitmaps' ref count.
+    for (auto id : bitmaps_in_current_page)
+        ++bitmaps_ref_count[id];
+
+    return true;
+}
+
+/*
+ * Write the element referencing the svg background of page pageno to the
+ * current page stream, either as a base64 data URI (param.embed_image) or as a
+ * relative file name.
+ * Throws std::string when the svg to embed cannot be opened.
+ */
+void CairoBackgroundRenderer::embed_image(int pageno)
+{
+    auto & f_page = *(html_renderer->f_curpage);
+
+    // SVGs introduced by <img> or background-image can't have external resources;
+    // SVGs introduced by <embed> and <object> can, but they are more expensive for browsers.
+    // So we use <img> if the SVG contains no external bitmaps, and use <embed> otherwise.
+    // See also:
+    // https://developer.mozilla.org/en-US/docs/Web/SVG/SVG_as_an_Image
+    // http://stackoverflow.com/questions/4476526/do-i-use-img-object-or-embed-for-svg-files
+    const bool self_contained = param.svg_embed_bitmap || bitmaps_in_current_page.empty();
+    const char * open_tag = self_contained ? "<img" : "<embed";
+    f_page << open_tag;
+
+    f_page << " class=\"" << CSS::FULL_BACKGROUND_IMAGE_CN
+        << "\" alt=\"\" src=\"";
+
+    if(!param.embed_image)
+    {
+        f_page << (char*)html_renderer->str_fmt("bg%x.svg", pageno);
+    }
+    else
+    {
+        auto path = html_renderer->str_fmt("%s/bg%x.svg", param.tmp_dir.c_str(), pageno);
+        ifstream fin((char*)path, ifstream::binary);
+        if(!fin)
+            throw string("Cannot read background image ") + (char*)path;
+        f_page << "data:image/svg+xml;base64," << Base64Stream(fin);
+    }
+
+    f_page << "\"/>";
+}
+
+// Path of the dumped bitmap with the given stream id: "<dest_dir>/o<id>.jpg".
+string CairoBackgroundRenderer::build_bitmap_path(int id)
+{
+    // "o" for "PDF Object"
+    string path(html_renderer->str_fmt("%s/o%d.jpg", param.dest_dir.c_str(), id));
+    return path;
+}
+// Override CairoOutputDev::setMimeData() and dump bitmaps in SVG to external files.
+// With param.svg_embed_bitmap the call is forwarded to CairoOutputDev unchanged.
+// Otherwise only DCT (jpeg) streams that are referenced objects, use DeviceRGB or
+// DeviceGray and carry no /Decode array are handled: the image gets an "o<id>.jpg"
+// URI attached and the stream data is written to build_bitmap_path(id) once per id.
+void CairoBackgroundRenderer::setMimeData(Stream *str, Object *ref, cairo_surface_t *image)
+{
+ if (param.svg_embed_bitmap)
+ {
+ CairoOutputDev::setMimeData(str, ref, image);
+ return;
+ }
+
+ // TODO dump bitmaps in other formats.
+ if (str->getKind() != strDCT)
+ return;
+
+ // TODO inline image?
+ if (ref == nullptr || !ref->isRef())
+ return;
+
+ // We only dump rgb or gray jpeg without /Decode array.
+ //
+ // Although jpeg support CMYK, PDF readers do color conversion incompatibly with most other
+ // programs (including browsers): other programs invert CMYK color if 'Adobe' marker (app14) presents
+ // in a jpeg file; while PDF readers don't, they solely rely on /Decode array to invert color.
+ // It's a bit complicated to decide whether a CMYK jpeg is safe to dump, so we don't dump at all.
+ // See also:
+ // JPEG file embedded in PDF (CMYK) https://forums.adobe.com/thread/975777
+ // http://stackoverflow.com/questions/3123574/how-to-convert-from-cmyk-to-rgb-in-java-correctly
+ //
+ // In PDF, jpeg stream objects can also specify other color spaces like DeviceN and Separation,
+ // It is also not safe to dump them directly.
+ Object obj;
+ str->getDict()->lookup("ColorSpace", &obj);
+ if (!obj.isName() || (strcmp(obj.getName(), "DeviceRGB") && strcmp(obj.getName(), "DeviceGray")) )
+ {
+ obj.free();
+ return;
+ }
+ obj.free();
+ str->getDict()->lookup("Decode", &obj);
+ if (obj.isArray())
+ {
+ obj.free();
+ return;
+ }
+ obj.free();
+
+ // the object number of the stream identifies the bitmap
+ int imgId = ref->getRef().num;
+ // uri is handed to cairo together with free() as its destroy function
+ auto uri = strdup((char*) html_renderer->str_fmt("o%d.jpg", imgId));
+ auto st = cairo_surface_set_mime_data(image, CAIRO_MIME_TYPE_URI,
+ (unsigned char*) uri, strlen(uri), free, uri);
+ if (st)
+ {
+ // the call failed, so uri is released here
+ free(uri);
+ return;
+ }
+ bitmaps_in_current_page.push_back(imgId);
+
+ // an id already present in the map is not dumped again
+ if(bitmaps_ref_count.find(imgId) != bitmaps_ref_count.end())
+ return;
+
+ // usage is counted by render_page() once the svg is known to be kept
+ bitmaps_ref_count[imgId] = 0;
+
+ char *strBuffer;
+ int len;
+ if (getStreamData(str->getNextStream(), &strBuffer, &len))
+ {
+ ofstream imgfile(build_bitmap_path(imgId), ofstream::binary);
+ imgfile.write(strBuffer, len);
+ free(strBuffer);
+ }
+}
+
+} // namespace pdf2htmlEX
+
+#endif // ENABLE_SVG
+
diff --git a/src/BackgroundRenderer/CairoBackgroundRenderer.h b/src/BackgroundRenderer/CairoBackgroundRenderer.h
new file mode 100644
index 0000000..4ed9c86
--- /dev/null
+++ b/src/BackgroundRenderer/CairoBackgroundRenderer.h
@@ -0,0 +1,75 @@
+/*
+ * Cairo Background renderer
+ * Render all those things not supported as Image, with Cairo
+ *
+ * Copyright (C) 2012,2013 Lu Wang <coolwanglu@gmail.com>
+ */
+
+
+#ifndef CAIRO_BACKGROUND_RENDERER_H__
+#define CAIRO_BACKGROUND_RENDERER_H__
+
+#include <CairoOutputDev.h>
+#include <cairo.h>
+#include <cairo-svg.h>
+#include <unordered_map>
+#include <vector>
+#include <string>
+
+#include "pdf2htmlEX-config.h"
+
+#include "Param.h"
+#include "HTMLRenderer/HTMLRenderer.h"
+
+namespace pdf2htmlEX {
+
+// Based on BackgroundRenderer from poppler
+// Renders page backgrounds to svg files through CairoOutputDev.
+class CairoBackgroundRenderer : public BackgroundRenderer, CairoOutputDev
+{
+public:
+ CairoBackgroundRenderer(HTMLRenderer * html_renderer, const Param & param);
+
+ virtual ~CairoBackgroundRenderer();
+
+ virtual void init(PDFDoc * doc);
+ // returns false when the svg exceeds param.svg_node_count_limit
+ virtual bool render_page(PDFDoc * doc, int pageno);
+ virtual void embed_image(int pageno);
+
+ // Does this device use beginType3Char/endType3Char? Otherwise,
+ // text in Type 3 fonts will be drawn with drawChar/drawString.
+ virtual GBool interpretType3Chars() { return !param.process_type3; }
+
+ virtual void drawChar(GfxState *state, double x, double y,
+ double dx, double dy,
+ double originX, double originY,
+ CharCode code, int nBytes, Unicode *u, int uLen);
+
+ //for proof
+ void beginTextObject(GfxState *state);
+ void beginString(GfxState *state, GooString * str);
+ void endTextObject(GfxState *state);
+ void updateRender(GfxState *state);
+
+protected:
+ // dumps eligible jpeg bitmaps to external files unless param.svg_embed_bitmap is set
+ virtual void setMimeData(Stream *str, Object *ref, cairo_surface_t *image);
+
+protected:
+ HTMLRenderer * html_renderer;
+ const Param & param;
+ // surface of the page being rendered; nullptr outside render_page()
+ cairo_surface_t * surface;
+
+private:
+ // convert bitmap stream id to bitmap file name. No pageno prefix,
+ // because a bitmap may be shared by multiple pages.
+ std::string build_bitmap_path(int id);
+ // map<id_of_bitmap_stream, usage_count_in_all_svgs>
+ // note: if a svg bg fallbacks to bitmap bg, its bitmaps are not taken into account.
+ std::unordered_map<int, int> bitmaps_ref_count;
+ // id of bitmaps' stream used by current page
+ std::vector<int> bitmaps_in_current_page;
+ // characters seen by drawChar() on the current page that were subject to the cover test
+ int drawn_char_count;
+};
+
+}
+
+#endif //CAIRO_BACKGROUND_RENDERER_H__
diff --git a/src/BackgroundRenderer/SplashBackgroundRenderer.cc b/src/BackgroundRenderer/SplashBackgroundRenderer.cc
new file mode 100644
index 0000000..55b5322
--- /dev/null
+++ b/src/BackgroundRenderer/SplashBackgroundRenderer.cc
@@ -0,0 +1,261 @@
+/*
+ * SplashBackgroundRenderer.cc
+ *
+ * Copyright (C) 2012,2013 Lu Wang <coolwanglu@gmail.com>
+ */
+
+#include <fstream>
+#include <vector>
+#include <memory>
+
+#include <poppler-config.h>
+#include <PDFDoc.h>
+#include <goo/ImgWriter.h>
+#include <goo/PNGWriter.h>
+#include <goo/JpegWriter.h>
+
+#include "Base64Stream.h"
+#include "util/const.h"
+
+#include "SplashBackgroundRenderer.h"
+
+namespace pdf2htmlEX {
+
+using std::string;
+using std::ifstream;
+using std::vector;
+using std::unique_ptr;
+
+const SplashColor SplashBackgroundRenderer::white = {255,255,255};
+
+// Set up a Splash based background renderer (RGB8 bitmap, white paper colour).
+//   imgFormat     : "png", "jpg", or "" to take the first format that was compiled in
+//   html_renderer : the HTML renderer this object reports to and writes through
+//   param         : conversion options, kept by reference
+// Throws std::string when the requested format is not compiled in.
+SplashBackgroundRenderer::SplashBackgroundRenderer(const string & imgFormat, HTMLRenderer * html_renderer, const Param & param)
+    : SplashOutputDev(splashModeRGB8, 4, gFalse, (SplashColorPtr)(&white))
+    , html_renderer(html_renderer)
+    , param(param)
+    , format(imgFormat)
+    , drawn_char_count(0) // was indeterminate until the first render_page() call
+{
+    bool supported = false;
+#ifdef ENABLE_LIBPNG
+    // PNG is tested first, so it is the default when both writers are available
+    if (format.empty())
+        format = "png";
+    supported = supported || format == "png";
+#endif
+#ifdef ENABLE_LIBJPEG
+    if (format.empty())
+        format = "jpg";
+    supported = supported || format == "jpg";
+#endif
+    if (!supported)
+    {
+        throw string("Image format not supported: ") + format;
+    }
+}
+
+/*
+ * The base class startPage() fills the whole page with the paper colour,
+ * which marks the entire page as modified. The modified region is used
+ * later to decide what part of the bitmap is dumped, so it is cleared
+ * again right after the base call.
+ */
+void SplashBackgroundRenderer::startPage(int pageNum, GfxState *state, XRef *xrefA)
+{
+    // let Splash prepare the page bitmap first ...
+    SplashOutputDev::startPage(pageNum, state, xrefA);
+
+    // ... then forget the region touched by that initial fill
+    clearModRegion();
+}
+
+// Decide whether a glyph has to be painted into the background bitmap.
+// Glyphs that are not painted here are left to the HTML text layer.
+void SplashBackgroundRenderer::drawChar(GfxState *state, double x, double y,
+    double dx, double dy,
+    double originX, double originY,
+    CharCode code, int nBytes, Unicode *u, int uLen)
+{
+    // A glyph is always drawn as image when
+    //  - fallback or proof mode is on, or
+    //  - the current font has a non-zero writing mode, or
+    //  - the font is Type 3 while param.process_type3 is off, or
+    //  - the text render mode is >= 4 (the text is used as path)
+    bool as_image = param.fallback || param.proof;
+    if(!as_image)
+    {
+        auto font = state->getFont();
+        if(font)
+        {
+            as_image = (font->getWMode())
+                || ((font->getType() == fontType3) && (!param.process_type3))
+                || (state->getRender() >= 4);
+        }
+    }
+
+    if(as_image)
+    {
+        // Such chars are not subject to the cover test
+        // (see HTMLRenderer::drawString), so drawn_char_count is left alone.
+        SplashOutputDev::drawChar(state,x,y,dx,dy,originX,originY,code,nBytes,u,uLen);
+        return;
+    }
+
+    if(!param.correct_text_visibility)
+        return;
+
+    // Cover test: only chars reported as covered are painted, but every
+    // tested char advances the counter.
+    if(html_renderer->is_char_covered(drawn_char_count))
+    {
+        SplashOutputDev::drawChar(state,x,y,dx,dy,originX,originY,code,nBytes,u,uLen);
+    }
+    ++drawn_char_count;
+}
+
+// With param.proof == 2, call the proof_begin_text_object hook first;
+// the regular Splash handling always follows.
+void SplashBackgroundRenderer::beginTextObject(GfxState *state)
+{
+    const bool proof_mode = (param.proof == 2);
+    if (proof_mode)
+    {
+        proof_begin_text_object(state, this);
+    }
+    SplashOutputDev::beginTextObject(state);
+}
+
+// With param.proof == 2, call the proof_begin_string hook first;
+// the regular Splash handling always follows.
+void SplashBackgroundRenderer::beginString(GfxState *state, GooString * str)
+{
+    const bool proof_mode = (param.proof == 2);
+    if (proof_mode)
+    {
+        proof_begin_string(state, this);
+    }
+    SplashOutputDev::beginString(state, str);
+}
+
+// With param.proof == 2, call the proof_end_text_object hook first;
+// the regular Splash handling always follows.
+void SplashBackgroundRenderer::endTextObject(GfxState *state)
+{
+    const bool proof_mode = (param.proof == 2);
+    if (proof_mode)
+    {
+        proof_end_text_object(state, this);
+    }
+    SplashOutputDev::endTextObject(state);
+}
+
+// With param.proof == 2, call the proof_update_render hook first;
+// the regular Splash handling always follows.
+void SplashBackgroundRenderer::updateRender(GfxState *state)
+{
+    const bool proof_mode = (param.proof == 2);
+    if (proof_mode)
+    {
+        proof_update_render(state, this);
+    }
+    SplashOutputDev::updateRender(state);
+}
+
+// Bind this output device to the document before any page is rendered.
+void SplashBackgroundRenderer::init(PDFDoc * doc)
+{
+    this->startDoc(doc);
+}
+
+// Annotation filter passed to PDFDoc::displayPage.
+// pflag points at a bool; annotations are accepted exactly when it is true.
+static GBool annot_cb(Annot *, void * pflag)
+{
+    const bool enabled = *static_cast<bool*>(pflag);
+    if(enabled)
+        return gTrue;
+    return gFalse;
+}
+
+// Render page `pageno` of `doc` into the Splash bitmap at param.h_dpi x param.v_dpi.
+// Always returns true.
+bool SplashBackgroundRenderer::render_page(PDFDoc * doc, int pageno)
+{
+    // the cover test in drawChar() counts chars per page
+    drawn_char_count = 0;
+
+    // annot_cb reads this flag through the user-data pointer during displayPage
+    bool process_annotation = param.process_annotation;
+    const bool use_media_box = !(param.use_cropbox);
+
+    doc->displayPage(this, pageno, param.h_dpi, param.v_dpi,
+        0,              // no rotation
+        use_media_box,
+        false, false,
+        nullptr, nullptr, &annot_cb, &process_annotation);
+
+    return true;
+}
+
+// Write the background image of page `pageno` to disk and emit the matching
+// img element into the current page stream. Nothing is emitted when the page
+// bitmap was not modified. With param.embed_image the file goes to the temp
+// directory and is inlined as a base64 data URI; otherwise it is written to
+// the destination directory and referenced by file name.
+// Throws std::string if the dumped file cannot be read back or the format has no MIME type.
+void SplashBackgroundRenderer::embed_image(int pageno)
+{
+    // modified region in bitmap pixels; y is measured from the top
+    // (the CSS bottom offset below is computed as height - 1 - ymax)
+    int xmin, xmax, ymin, ymax;
+    getModRegion(&xmin, &ymin, &xmax, &ymax);
+
+    // an empty region means there is no background to dump
+    if((xmin > xmax) || (ymin > ymax))
+        return;
+
+    {
+        // scope kept from the original code; presumably the formatted buffer has to be
+        // released before str_fmt is used again -- TODO confirm in StringFormatter
+        const string & out_dir = param.embed_image ? param.tmp_dir : param.dest_dir;
+        auto fn = html_renderer->str_fmt("%s/bg%x.%s", out_dir.c_str(), pageno, format.c_str());
+        if(param.embed_image)
+            html_renderer->tmp_files.add((char*)fn);
+
+        dump_image((char*)fn, xmin, ymin, xmax, ymax);
+    }
+
+    // factors from bitmap pixels to the units used by the CSS classes
+    double h_scale = html_renderer->text_zoom_factor() * DEFAULT_DPI / param.h_dpi;
+    double v_scale = html_renderer->text_zoom_factor() * DEFAULT_DPI / param.v_dpi;
+
+    const double css_left = ((double)xmin) * h_scale;
+    const double css_bottom = ((double)getBitmapHeight() - 1 - ymax) * v_scale;
+    const double css_width = ((double)(xmax - xmin + 1)) * h_scale;
+    const double css_height = ((double)(ymax - ymin + 1)) * v_scale;
+
+    auto & f_page = *(html_renderer->f_curpage);
+    auto & all_manager = html_renderer->all_manager;
+
+    f_page << "<img class=\"" << CSS::BACKGROUND_IMAGE_CN
+        << " " << CSS::LEFT_CN << all_manager.left.install(css_left)
+        << " " << CSS::BOTTOM_CN << all_manager.bottom.install(css_bottom)
+        << " " << CSS::WIDTH_CN << all_manager.width.install(css_width)
+        << " " << CSS::HEIGHT_CN << all_manager.height.install(css_height)
+        << "\" alt=\"\" src=\"";
+
+    if(!param.embed_image)
+    {
+        // reference the file next to the HTML output
+        f_page << (char*)html_renderer->str_fmt("bg%x.%s", pageno, format.c_str());
+    }
+    else
+    {
+        // inline the temp file as a data URI
+        auto path = html_renderer->str_fmt("%s/bg%x.%s", param.tmp_dir.c_str(), pageno, format.c_str());
+        ifstream fin((char*)path, ifstream::binary);
+        if(!fin)
+            throw string("Cannot read background image ") + (char*)path;
+
+        auto iter = FORMAT_MIME_TYPE_MAP.find(format);
+        if(iter == FORMAT_MIME_TYPE_MAP.end())
+            throw string("Image format not supported: ") + format;
+
+        string mime_type = iter->second;
+        f_page << "data:" << mime_type << ";base64," << Base64Stream(fin);
+    }
+    f_page << "\"/>";
+}
+
+// Encode the rectangle [x1,x2] x [y1,y2] (inclusive, bitmap pixels) of the rendered
+// page bitmap as `format` ("png" or "jpg") and write it to `filename`.
+// Throws std::string or const char * on failure. Both the output file and the
+// image writer are held by unique_ptr, so nothing leaks when an exception is thrown.
+void SplashBackgroundRenderer::dump_image(const char * filename, int x1, int y1, int x2, int y2)
+{
+    int width = x2 - x1 + 1;
+    int height = y2 - y1 + 1;
+    if((width <= 0) || (height <= 0))
+        throw "Bad metric for background image";
+
+    // Declared before `writer`: on unwinding the writer is destroyed first
+    // and the stream it was writing to is closed afterwards.
+    unique_ptr<FILE, int (*)(FILE *)> file(fopen(filename, "wb"), &fclose);
+    if(!file)
+        throw string("Cannot open file for background image " ) + filename;
+
+    // use unique_ptr to auto delete the object upon exception
+    unique_ptr<ImgWriter> writer;
+
+    if(false) { }
+#ifdef ENABLE_LIBPNG
+    else if(format == "png")
+    {
+        writer = unique_ptr<ImgWriter>(new PNGWriter);
+    }
+#endif
+#ifdef ENABLE_LIBJPEG
+    else if(format == "jpg")
+    {
+        writer = unique_ptr<ImgWriter>(new JpegWriter);
+    }
+#endif
+    else
+    {
+        throw string("Image format not supported: ") + format;
+    }
+
+    if(!writer->init(file.get(), width, height, param.h_dpi, param.v_dpi))
+        throw "Cannot initialize image writer";
+
+    auto * bitmap = getBitmap();
+    assert(bitmap->getMode() == splashModeRGB8);
+
+    SplashColorPtr data = bitmap->getDataPtr();
+    int row_size = bitmap->getRowSize();
+
+    // one pointer per output row, each pointing at column x1 (3 bytes per RGB8 pixel)
+    vector<unsigned char*> pointers;
+    pointers.reserve(height);
+    SplashColorPtr p = data + y1 * row_size + x1 * 3;
+    for(int i = 0; i < height; ++i)
+    {
+        pointers.push_back(p);
+        p += row_size;
+    }
+
+    if(!writer->writePointers(pointers.data(), height))
+    {
+        throw "Cannot write background image";
+    }
+
+    if(!writer->close())
+    {
+        throw "Cannot finish background image";
+    }
+
+    // Close explicitly (ownership is released first) so that a failed final
+    // flush is reported instead of leaving a truncated image unnoticed.
+    if(fclose(file.release()) != 0)
+    {
+        throw "Cannot close background image file";
+    }
+}
+
+} // namespace pdf2htmlEX
diff --git a/src/BackgroundRenderer/SplashBackgroundRenderer.h b/src/BackgroundRenderer/SplashBackgroundRenderer.h
new file mode 100644
index 0000000..067de28
--- /dev/null
+++ b/src/BackgroundRenderer/SplashBackgroundRenderer.h
@@ -0,0 +1,65 @@
+/*
+ * Splash Background renderer
+ * Render all those things not supported as Image, with Splash
+ *
+ * Copyright (C) 2012,2013 Lu Wang <coolwanglu@gmail.com>
+ */
+
+
+#ifndef SPLASH_BACKGROUND_RENDERER_H__
+#define SPLASH_BACKGROUND_RENDERER_H__
+
+#include <string>
+
+#include <splash/SplashBitmap.h>
+#include <SplashOutputDev.h>
+
+#include "pdf2htmlEX-config.h"
+
+#include "Param.h"
+#include "HTMLRenderer/HTMLRenderer.h"
+
+namespace pdf2htmlEX {
+
+/* Based on BackgroundRenderer from poppler.
+ * Renders a page with Splash and emits the modified part of the page bitmap
+ * as a PNG or JPEG background image, either as a file next to the output or
+ * inlined as a base64 data URI (see embed_image in the .cc).
+ * Only BackgroundRenderer is a public base; SplashOutputDev follows the
+ * class default access and is therefore a private base.
+ */
+class SplashBackgroundRenderer : public BackgroundRenderer, SplashOutputDev
+{
+public:
+ static const SplashColor white;
+ //format: "png" or "jpg", or "" for a default format
+ SplashBackgroundRenderer(const std::string & format, HTMLRenderer * html_renderer, const Param & param); // throws std::string if the format is not compiled in
+
+ virtual ~SplashBackgroundRenderer() { }
+
+ virtual void init(PDFDoc * doc); // forwards to startDoc(doc)
+ virtual bool render_page(PDFDoc * doc, int pageno); // resets drawn_char_count and displays the page; always returns true
+ virtual void embed_image(int pageno); // dumps the modified region and writes the img element; emits nothing for an empty region
+
+ // Does this device use beginType3Char/endType3Char? Otherwise,
+ // text in Type 3 fonts will be drawn with drawChar/drawString.
+ virtual GBool interpretType3Chars() { return !param.process_type3; } // i.e. true only while param.process_type3 is off
+
+ virtual void startPage(int pageNum, GfxState *state, XRef *xrefA); // base startPage followed by clearModRegion()
+
+ virtual void drawChar(GfxState *state, double x, double y,
+ double dx, double dy,
+ double originX, double originY,
+ CharCode code, int nBytes, Unicode *u, int uLen);
+
+ //for proof
+ void beginTextObject(GfxState *state);
+ void beginString(GfxState *state, GooString * str);
+ void endTextObject(GfxState *state);
+ void updateRender(GfxState *state);
+
+protected:
+ void dump_image(const char * filename, int x1, int y1, int x2, int y2); // encodes bitmap rectangle [x1,x2] x [y1,y2] (inclusive) into filename
+ HTMLRenderer * html_renderer;
+ const Param & param; // held by reference, so the Param object must outlive this renderer
+ std::string format; // "png" or "jpg" once the constructor has succeeded
+ int drawn_char_count; // chars passed to the cover test on the current page; reset in render_page()
+};
+
+} // namespace pdf2htmlEX
+
+#endif // SPLASH_BACKGROUND_RENDERER_H__
diff --git a/src/Base64Stream.cc b/src/Base64Stream.cc
new file mode 100644
index 0000000..5d02aae
--- /dev/null
+++ b/src/Base64Stream.cc
@@ -0,0 +1,42 @@
+#include "Base64Stream.h"
+
+namespace pdf2htmlEX {
+
+using std::ostream;
+
+// Read the wrapped input stream until read() fails and write its base64 text to `out`.
+// Input is consumed in groups of 3 bytes, each producing 4 symbols; a final
+// group of 1 or 2 bytes is zero-filled and padded with '='.
+// Returns `out`.
+ostream & Base64Stream::dumpto(ostream & out)
+{
+    unsigned char chunk[3];
+
+    // complete groups
+    while(in->read((char*)chunk, 3))
+    {
+        const int s0 = (chunk[0] & 0xfc)>>2;
+        const int s1 = ((chunk[0] & 0x03)<<4) | ((chunk[1] & 0xf0)>>4);
+        const int s2 = ((chunk[1] & 0x0f)<<2) | ((chunk[2] & 0xc0)>>6);
+        const int s3 = (chunk[2] & 0x3f);
+        out << base64_encoding[s0]
+            << base64_encoding[s1]
+            << base64_encoding[s2]
+            << base64_encoding[s3];
+    }
+
+    // the failed read may still have delivered 1 or 2 bytes
+    auto leftover = in->gcount();
+    if(leftover <= 0)
+        return out;
+
+    // zero the missing bytes so the shared symbol uses only real input bits
+    for(int i = leftover; i < 3; ++i)
+        chunk[i] = 0;
+
+    out << base64_encoding[(chunk[0] & 0xfc)>>2]
+        << base64_encoding[((chunk[0] & 0x03)<<4) | ((chunk[1] & 0xf0)>>4)];
+
+    // third symbol exists only when two bytes were left; otherwise it is padding
+    if(leftover > 1)
+        out << base64_encoding[(chunk[1] & 0x0f)<<2];
+    else
+        out << '=';
+
+    // the fourth position of a partial group is always padding
+    out << '=';
+
+    return out;
+}
+
+const char * Base64Stream::base64_encoding = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
+
+} //namespace pdf2htmlEX
diff --git a/src/Base64Stream.h b/src/Base64Stream.h
new file mode 100644
index 0000000..759515f
--- /dev/null
+++ b/src/Base64Stream.h
@@ -0,0 +1,34 @@
+/*
+ * Base64 Encoding
+ *
+ * by WangLu
+ * 2012.11.29
+ */
+
+#ifndef BASE64STREAM_H__
+#define BASE64STREAM_H__
+
+#include <iostream>
+
+namespace pdf2htmlEX {
+
+class Base64Stream // wraps an input stream; writing the wrapper with << emits the base64 encoding of that input
+{
+public:
+ Base64Stream(std::istream & in) : in(&in) { } // stores only the address, so the stream must outlive this object
+
+ std::ostream & dumpto(std::ostream & out); // reads the input until read() fails, writes its base64 text to out, returns out
+
+private:
+ std::istream * in; // never deleted by this class
+ static const char * base64_encoding; // the 64 symbol alphabet, indexed by 6-bit value
+};
+
+inline
+std::ostream & operator << (std::ostream & out, Base64Stream bs) // bs is taken by value; a copy only copies the stream pointer
+{
+ return bs.dumpto(out); // consumes the wrapped input stream
+}
+
+} //namespace pdf2htmlEX
+#endif //BASE64STREAM_H__
diff --git a/src/Color.cc b/src/Color.cc
new file mode 100644
index 0000000..6a344e5
--- /dev/null
+++ b/src/Color.cc
@@ -0,0 +1,51 @@
+#include <cmath>
+
+#include "Color.h"
+
+#include "util/misc.h"
+
+namespace pdf2htmlEX {
+
+using std::ostream;
+
+// Default colour: not transparent, all RGB components zero.
+// The members are initialised directly instead of running memset over `this`:
+// the result is the same for both members, and the code no longer relies on
+// <cstring> being pulled in by another header.
+Color::Color()
+    : transparent(false)
+    , rgb() // value-initialisation zeroes r, g and b
+{
+}
+
+// Build a colour from components given as multiples of gfxColorComp1
+// (each value is scaled by gfxColorComp1 and truncated to GfxColorComp).
+Color::Color(double r, double g, double b, bool transparent)
+    :transparent(transparent)
+{
+    const double scaled_r = r * gfxColorComp1;
+    const double scaled_g = g * gfxColorComp1;
+    const double scaled_b = b * gfxColorComp1;
+
+    rgb.r = static_cast<GfxColorComp>(scaled_r);
+    rgb.g = static_cast<GfxColorComp>(scaled_g);
+    rgb.b = static_cast<GfxColorComp>(scaled_b);
+}
+
+Color::Color(const GfxRGB& rgb) // non-transparent colour that copies its components from rgb
+ :transparent(false), rgb(rgb) { }
+
+// Print a colour: the word "transparent" for transparent colours,
+// otherwise whatever operator<< produces for the GfxRGB value.
+ostream & operator << (ostream & out, const Color & color)
+{
+    if(!color.transparent)
+    {
+        out << color.rgb;
+        return out;
+    }
+
+    out << "transparent";
+    return out;
+}
+
+// Copy r, g, b into the first three slots of gc.c; other slots are not touched.
+void Color::get_gfx_color(GfxColor & gc) const
+{
+    const GfxColorComp components[3] = { rgb.r, rgb.g, rgb.b };
+    for(int i = 0; i < 3; ++i)
+        gc.c[i] = components[i];
+}
+
+// Root mean square of the per-channel differences, with each channel
+// normalised by gfxColorComp1. The transparent flag is not looked at.
+double Color::distance(const Color & other) const
+{
+    const double dr = static_cast<double>(rgb.r) - other.rgb.r;
+    const double dg = static_cast<double>(rgb.g) - other.rgb.g;
+    const double db = static_cast<double>(rgb.b) - other.rgb.b;
+
+    const double squared_sum = (dr * dr + dg * dg + db * db);
+    return sqrt(squared_sum / (3.0 * gfxColorComp1 * gfxColorComp1));
+}
+
+} // namespace pdf2htmlEX
diff --git a/src/Color.h b/src/Color.h
new file mode 100644
index 0000000..a2d2415
--- /dev/null
+++ b/src/Color.h
@@ -0,0 +1,38 @@
+/*
+ * Header file for Color
+ * Copyright (C) 2013 Lu Wang <coolwanglu@gmail.com>
+ */
+
+#ifndef COLOR_H__
+#define COLOR_H__
+
+#include <ostream>
+
+#include <GfxState.h>
+
+namespace pdf2htmlEX {
+
+struct Color // an RGB value plus a flag for the special value "transparent"
+{
+ bool transparent;
+ GfxRGB rgb;
+ Color(); // all members zero: not transparent, rgb = (0,0,0)
+ Color(double r, double g, double b, bool transparent = false); // r, g, b are multiplied by gfxColorComp1
+ Color(const GfxRGB& rgb); // non-transparent colour with the given components
+ bool operator == (const Color & c) const {
+ if(transparent != c.transparent)
+ return false;
+ if(transparent) // two transparent colours are equal whatever their rgb holds
+ return true;
+ return ((rgb.r == c.rgb.r) && (rgb.g == c.rgb.g) && (rgb.b == c.rgb.b));
+ }
+ void get_gfx_color(GfxColor & gc) const; // writes r, g, b into gc.c[0..2]
+ // Color distance, [0,1].
+ double distance(const Color & other) const; // RMS of the channel differences; the transparent flag is ignored
+};
+
+std::ostream & operator << (std::ostream & out, const Color & color);
+
+} // namespace pdf2htmlEX
+
+#endif // COLOR_H__
diff --git a/src/CoveredTextDetector.cc b/src/CoveredTextDetector.cc
new file mode 100644
index 0000000..e109b3f
--- /dev/null
+++ b/src/CoveredTextDetector.cc
@@ -0,0 +1,51 @@
+/*
+ * CoveredTextDetector.cc
+ *
+ * Created on: 2014-6-14
+ * Author: duanyao
+ */
+
+#include "CoveredTextDetector.h"
+
+#include "util/math.h"
+
+namespace pdf2htmlEX {
+
+// Forget every recorded char: the covered flags as well as the stored boxes.
+void CoveredTextDetector::reset()
+{
+    chars_covered.clear();
+    char_bboxes.clear();
+}
+
+// Record one more char: mark it as not covered and append its
+// four box coordinates (bbox[0..3]) to the flat coordinate list.
+void CoveredTextDetector::add_char_bbox(double * bbox)
+{
+    chars_covered.push_back(false);
+    for (int k = 0; k < 4; ++k)
+        char_bboxes.push_back(bbox[k]);
+}
+
+// Record a clipped char. It is stored like any other char but flagged as
+// covered immediately. If `patially` is set, its box is additionally handled
+// like a non-char graphics drawn at this char's position, so that earlier
+// chars intersecting it become covered as well.
+void CoveredTextDetector::add_char_bbox_clipped(double * bbox, bool patially)
+{
+    // index the new char is going to occupy
+    const int own_index = static_cast<int>(chars_covered.size());
+
+    char_bboxes.insert(char_bboxes.end(), bbox, bbox + 4);
+    chars_covered.push_back(true);
+
+    if (!patially)
+        return;
+    add_non_char_bbox(bbox, own_index);
+}
+
+// Mark every not yet covered char with index below `index` whose box
+// intersects `bbox` as covered. A negative index means "all chars so far".
+// A char that becomes covered is from then on handled like a non-char
+// graphics itself, so the check is repeated with that char's own box for
+// the chars recorded before it.
+void CoveredTextDetector::add_non_char_bbox(double * bbox, int index)
+{
+    const int limit = (index < 0) ? static_cast<int>(chars_covered.size()) : index;
+
+    for (int i = 0; i < limit; ++i)
+    {
+        double * char_box = &char_bboxes[i * 4];
+        if (!chars_covered[i] && bbox_intersect(char_box, bbox))
+        {
+            chars_covered[i] = true;
+            // propagate to chars recorded before char i
+            add_non_char_bbox(char_box, i);
+        }
+    }
+}
+
+}
diff --git a/src/CoveredTextDetector.h b/src/CoveredTextDetector.h
new file mode 100644
index 0000000..bee6c17
--- /dev/null
+++ b/src/CoveredTextDetector.h
@@ -0,0 +1,61 @@
+/*
+ * CoveredTextDetector.h
+ *
+ * Created on: 2014-6-14
+ * Author: duanyao
+ */
+
+#ifndef COVEREDTEXTDETECTOR_H__
+#define COVEREDTEXTDETECTOR_H__
+
+#include <vector>
+
+namespace pdf2htmlEX {
+
+/**
+ * Detect characters that are covered by non-char graphics on a page.
+ */
+class CoveredTextDetector
+{
+public:
+
+ /**
+ * Reset to initial state. Should be called when start drawing a page.
+ */
+ void reset();
+
+ /**
+ * Add a drawn character's bounding box.
+ * The char starts out as not covered.
+ * @param bbox (x0, y0, x1, y1)
+ */
+ void add_char_bbox(double * bbox);
+ /**
+ * Add the bounding box of a char that was clipped; it is recorded as
+ * covered right away.
+ * @param bbox (x0, y0, x1, y1)
+ * @param patially if true, bbox is additionally handled like a non-char
+ * graphics drawn at this char's position, so chars added earlier that
+ * intersect it are marked as covered too.
+ */
+ void add_char_bbox_clipped(double * bbox, bool patially);
+
+ /**
+ * Add a drawn non-char graphics' bounding box.
+ * If it intersects any previously drawn char's bbox, the char is marked as covered
+ * and treated as a non-char.
+ * @param bbox (x0, y0, x1, y1)
+ * @param index this graphics' drawing order: assume it is drawn after (index-1)th
+ * char. -1 means after the last char.
+ */
+ void add_non_char_bbox(double * bbox, int index = -1);
+
+ /**
+ * An array of flags indicating whether a char is covered by any non-char graphics.
+ * Index by the order that these chars are added.
+ * This vector grows as add_char_bbox() or add_char_bbox_clipped() is called,
+ * so its size is the count of currently drawn chars.
+ */
+ const std::vector<bool> & get_chars_covered() { return chars_covered; }
+
+private:
+ std::vector<bool> chars_covered;
+ // x00, y00, x01, y01; x10, y10, x11, y11;...
+ std::vector<double> char_bboxes; // four doubles per char, same order as chars_covered
+};
+
+}
+
+#endif /* COVEREDTEXTDETECTOR_H__ */
diff --git a/src/DrawingTracer.cc b/src/DrawingTracer.cc
new file mode 100644
index 0000000..ffabad0
--- /dev/null
+++ b/src/DrawingTracer.cc
@@ -0,0 +1,400 @@
+/*
+ * DrawingTracer.cc
+ *
+ * Created on: 2014-6-15
+ * Author: duanyao
+ */
+
+#include "GfxFont.h"
+
+#include "util/math.h"
+#include "DrawingTracer.h"
+
+#if !ENABLE_SVG
+#warning "Cairo is disabled because ENABLE_SVG is off, --correct-text-visibility has limited functionality."
+#endif
+
+static constexpr bool DT_DEBUG = false;
+
+namespace pdf2htmlEX
+{
+
+DrawingTracer::DrawingTracer(const Param & param): param(param) // keeps a reference to the options
+#if ENABLE_SVG
+, cairo(nullptr) // no tracing context until reset() creates one
+#endif
+{
+}
+
+DrawingTracer::~DrawingTracer()
+{
+ finish(); // destroys the cairo context if one exists
+}
+
+// Start tracing a new page: drop the previous tracing context and, when cairo
+// is available, create a fresh one on a recording surface that covers the page.
+// Does nothing unless param.correct_text_visibility is set.
+void DrawingTracer::reset(GfxState *state)
+{
+    if (!param.correct_text_visibility)
+        return;
+    finish();
+
+#if ENABLE_SVG
+    // pbox is defined in device space, which is affected by zooming;
+    // We want to trace in page space which is stable, so invert pbox by ctm.
+    double pbox[] { 0, 0, state->getPageWidth(), state->getPageHeight() };
+    Matrix ctm, ictm;
+    state->getCTM(&ctm);
+    ctm.invertTo(&ictm);
+    tm_transform_bbox(ictm.m, pbox);
+    cairo_rectangle_t page_box { pbox[0], pbox[1], pbox[2] - pbox[0], pbox[3] - pbox[1] };
+    cairo_surface_t * surface = cairo_recording_surface_create(CAIRO_CONTENT_COLOR_ALPHA, &page_box);
+    cairo = cairo_create(surface);
+    // cairo_create() holds its own reference to the target surface. Drop ours,
+    // otherwise one recording surface would be leaked for every page; the
+    // surface is now released together with the context in finish().
+    cairo_surface_destroy(surface);
+    if (DT_DEBUG)
+        printf("DrawingTracer::reset:page bbox:[%f,%f,%f,%f]\n",pbox[0], pbox[1], pbox[2], pbox[3]);
+#endif
+}
+
+// Destroy the tracing context, if there is one. Safe to call repeatedly.
+void DrawingTracer::finish()
+{
+#if ENABLE_SVG
+    if (cairo == nullptr)
+        return;
+
+    cairo_destroy(cairo);
+    cairo = nullptr;
+#endif
+}
+
+// The initial CTM is not reported by poppler and depends on the zoom level, and
+// OutputDev::clip() may run before OutputDev::updateCTM(). GfxState::getCTM() can
+// therefore not be trusted here; instead every CTM change is replayed on the
+// cairo context, which then always holds the matrix to use.
+void DrawingTracer::update_ctm(GfxState *state, double m11, double m12, double m21, double m22, double m31, double m32)
+{
+    if (!param.correct_text_visibility)
+        return;
+
+#if ENABLE_SVG
+    // same component mapping as before: xx=m11, yx=m12, xy=m21, yy=m22, x0=m31, y0=m32
+    cairo_matrix_t delta;
+    cairo_matrix_init(&delta, m11, m12, m21, m22, m31, m32);
+    cairo_transform(cairo, &delta);
+
+    if (DT_DEBUG)
+    {
+        cairo_matrix_t current;
+        cairo_get_matrix(cairo, &current);
+        printf("DrawingTracer::update_ctm:ctm:[%f,%f,%f,%f,%f,%f]\n", current.xx, current.yx, current.xy, current.yy, current.x0, current.y0);
+    }
+#endif
+}
+
+// Intersect the traced clip area with the current path of `state`,
+// using the even-odd rule if `even_odd` is set and the winding rule otherwise.
+void DrawingTracer::clip(GfxState * state, bool even_odd)
+{
+    if (!param.correct_text_visibility)
+        return;
+#if ENABLE_SVG
+    do_path(state, state->getPath());
+
+    const cairo_fill_rule_t rule = even_odd ? CAIRO_FILL_RULE_EVEN_ODD : CAIRO_FILL_RULE_WINDING;
+    cairo_set_fill_rule(cairo, rule);
+    cairo_clip(cairo);
+
+    if (DT_DEBUG)
+    {
+        double x1, y1, x2, y2;
+        cairo_clip_extents(cairo, &x1, &y1, &x2, &y2);
+        printf("DrawingTracer::clip:extents:[%f,%f,%f,%f]\n", x1, y1, x2, y2);
+    }
+#endif
+}
+
+void DrawingTracer::clip_to_stroke_path(GfxState * state) // currently has no effect: clipping to a stroked path is not traced yet
+{
+ if (!param.correct_text_visibility)
+ return;
+ // TODO cairo_stroke_to_path() ?
+}
+
+// Save the state of the tracing cairo context (cairo_save).
+void DrawingTracer::save()
+{
+#if ENABLE_SVG
+    if (param.correct_text_visibility)
+    {
+        cairo_save(cairo);
+        if (DT_DEBUG)
+            printf("DrawingTracer::save\n");
+    }
+#endif
+}
+// Restore the state of the tracing cairo context saved by the matching save().
+void DrawingTracer::restore()
+{
+#if ENABLE_SVG
+    if (param.correct_text_visibility)
+    {
+        cairo_restore(cairo);
+        if (DT_DEBUG)
+            printf("DrawingTracer::restore\n");
+    }
+#endif
+}
+
+// Replace the current cairo path with a copy of `path`: every subpath is
+// started with move_to, followed by line_to / curve_to for its points, and
+// closed if the subpath is closed. Does nothing without ENABLE_SVG.
+void DrawingTracer::do_path(GfxState * state, GfxPath * path)
+{
+#if ENABLE_SVG
+    //copy from CairoOutputDev::doPath
+    cairo_new_path(cairo);
+    if (DT_DEBUG)
+        printf("DrawingTracer::do_path:new_path\n");
+
+    for (int s = 0; s < path->getNumSubpaths(); ++s)
+    {
+        GfxSubpath * subpath = path->getSubpath(s);
+        const int point_count = subpath->getNumPoints();
+        if (point_count <= 0)
+            continue;
+
+        double x = subpath->getX(0);
+        double y = subpath->getY(0);
+        cairo_move_to(cairo, x, y);
+        if (DT_DEBUG)
+            printf("DrawingTracer::do_path:move_to[%f,%f]\n",x,y);
+
+        for (int k = 1; k < point_count; )
+        {
+            if (!subpath->getCurve(k))
+            {
+                // straight segment to point k
+                x = subpath->getX(k);
+                y = subpath->getY(k);
+                cairo_line_to(cairo, x, y);
+                if (DT_DEBUG)
+                    printf("DrawingTracer::do_path:line_to[%f,%f]\n",x,y);
+                ++k;
+            }
+            else
+            {
+                // points k and k+1 are the control points, k+2 is the end point
+                x = subpath->getX(k+2);
+                y = subpath->getY(k+2);
+                cairo_curve_to(cairo,
+                    subpath->getX(k), subpath->getY(k),
+                    subpath->getX(k+1), subpath->getY(k+1),
+                    x, y);
+                if (DT_DEBUG)
+                    printf("DrawingTracer::do_path:curve_to[%f,%f]\n",x,y);
+                k += 3;
+            }
+        }
+
+        if (subpath->isClosed())
+        {
+            cairo_close_path (cairo);
+            if (DT_DEBUG)
+                printf("DrawingTracer::do_path:close\n");
+        }
+    }
+#endif
+}
+
+// Report the areas touched by stroking the current path of `state`.
+// The path is cut into single segments (one line or one curve each). For every
+// segment the stroke extents are computed in the tracing cairo context and,
+// unless degenerate, handed to draw_non_char_bbox().
+// Does nothing without ENABLE_SVG or when param.correct_text_visibility is off.
+void DrawingTracer::stroke(GfxState * state)
+{
+#if ENABLE_SVG
+    if (!param.correct_text_visibility)
+        return;
+
+    if (DT_DEBUG)
+        printf("DrawingTracer::stroke\n");
+
+    cairo_set_line_width(cairo, state->getLineWidth());
+
+    // GfxPath is broken into steps, each step makes up a cairo path and its bbox is used for covering test.
+    // TODO
+    // 1. path steps that are not vertical or horizontal lines may still falsely "cover" many chars,
+    //    can we slice those steps further?
+    // 2. if the line width is small, can we just ignore the path?
+    // 3. line join feature can't be retained. We use line-cap-square to minimize the problem that
+    //    some chars actually covered by a line join are missed. However chars covered by a acute angle
+    //    with line-join-miter may be still recognized as not covered.
+    cairo_set_line_cap(cairo, CAIRO_LINE_CAP_SQUARE);
+    GfxPath * path = state->getPath();
+    for (int i = 0; i < path->getNumSubpaths(); ++i) {
+        GfxSubpath * subpath = path->getSubpath(i);
+        int n = subpath->getNumPoints();
+        // A subpath needs at least two points to contain a segment. With a single
+        // point the loop below would read point 1, which does not exist, and
+        // report a bbox built from garbage. Skipping loses nothing: with square
+        // caps cairo does not draw a degenerate subpath at all.
+        if (n < 2)
+            continue;
+        double x = subpath->getX(0);
+        double y = subpath->getY(0);
+        //p: loop cursor; j: next point index
+        int p =1, j = 1;
+        while (p <= n) {
+            cairo_new_path(cairo);
+            cairo_move_to(cairo, x, y);
+            if (subpath->getCurve(j)) {
+                x = subpath->getX(j+2);
+                y = subpath->getY(j+2);
+                cairo_curve_to(cairo,
+                    subpath->getX(j), subpath->getY(j),
+                    subpath->getX(j+1), subpath->getY(j+1),
+                    x, y);
+                p += 3;
+            } else {
+                x = subpath->getX(j);
+                y = subpath->getY(j);
+                cairo_line_to(cairo, x, y);
+                ++p;
+            }
+
+            if (DT_DEBUG)
+                printf("DrawingTracer::stroke:new box:\n");
+            double sbox[4];
+            cairo_stroke_extents(cairo, sbox, sbox + 1, sbox + 2, sbox + 3);
+            // an extents box with zero width or height is not reported
+            if (sbox[0] != sbox[2] && sbox[1] != sbox[3])
+                draw_non_char_bbox(state, sbox);
+            else if (DT_DEBUG)
+                printf("DrawingTracer::stroke:zero box!\n");
+
+            if (p == n)
+            {
+                // all points consumed: a closed subpath gets one more
+                // iteration for the segment back to point 0
+                if (subpath->isClosed())
+                    j = 0; // if sub path is closed, go back to starting point
+                else
+                    break;
+            }
+            else
+                j = p;
+        }
+    }
+#endif
+}
+
+// Report the area touched by filling the current path of `state`.
+// Only the fill extents are used, so `even_odd` makes no difference here.
+void DrawingTracer::fill(GfxState * state, bool even_odd)
+{
+    if (!param.correct_text_visibility)
+        return;
+
+#if ENABLE_SVG
+    do_path(state, state->getPath());
+
+    //cairo_fill_extents don't take fill rule into account.
+    //cairo_set_fill_rule (cairo, even_odd? CAIRO_FILL_RULE_EVEN_ODD : CAIRO_FILL_RULE_WINDING);
+    double extents[4];
+    cairo_fill_extents(cairo, &extents[0], &extents[1], &extents[2], &extents[3]);
+    draw_non_char_bbox(state, extents);
+#endif
+}
+
+void DrawingTracer::draw_non_char_bbox(GfxState * state, double * bbox) // reports a non-char graphics box; bbox is overwritten in place
+{
+#if ENABLE_SVG
+ double cbox[4];
+ cairo_clip_extents(cairo, cbox, cbox + 1, cbox + 2, cbox + 3);
+ if(bbox_intersect(cbox, bbox, bbox)) // presumably stores the intersection with the clip extents back into bbox -- confirm in util/math.h
+#endif
+ { // without ENABLE_SVG this block runs unconditionally
+ transform_bbox_by_ctm(bbox, state);
+ if (DT_DEBUG)
+ printf("DrawingTracer::draw_non_char_bbox:[%f,%f,%f,%f]\n", bbox[0],bbox[1],bbox[2],bbox[3]);
+ if (on_non_char_drawn) // the callback is optional
+ on_non_char_drawn(bbox);
+ }
+}
+
+// Classify a char box against the traced clip area and fire the matching callback.
+// With ENABLE_SVG:
+//   - no corner inside the clip    -> on_char_clipped(bbox, false)
+//   - some, but not all, inside    -> bbox is intersected with the clip extents, on_char_clipped(bbox, true)
+//   - all four corners inside      -> on_char_drawn(bbox)
+// Without ENABLE_SVG every char is reported through on_char_drawn.
+// bbox is transformed by the CTM in place before it is reported.
+void DrawingTracer::draw_char_bbox(GfxState * state, double * bbox)
+{
+#if ENABLE_SVG
+    // Note: even if 4 corners of the char are all in or all out of the clip area,
+    // it could still be partially clipped.
+    // TODO better solution?
+    const double corners[4][2] = {
+        { bbox[0], bbox[1] },
+        { bbox[2], bbox[3] },
+        { bbox[2], bbox[1] },
+        { bbox[0], bbox[3] },
+    };
+    int corners_inside = 0;
+    for (int k = 0; k < 4; ++k)
+    {
+        if (cairo_in_clip(cairo, corners[k][0], corners[k][1]))
+            ++corners_inside;
+    }
+
+    if (corners_inside == 4)
+    {
+        transform_bbox_by_ctm(bbox);
+        if (on_char_drawn)
+            on_char_drawn(bbox);
+    }
+    else
+    {
+        const bool partially = (corners_inside > 0);
+        if (partially)
+        {
+            // keep only the part of the box within the clip extents
+            double cbox[4];
+            cairo_clip_extents(cairo, cbox, cbox + 1, cbox + 2, cbox + 3);
+            bbox_intersect(cbox, bbox, bbox);
+        }
+        transform_bbox_by_ctm(bbox);
+        if (on_char_clipped)
+            on_char_clipped(bbox, partially);
+    }
+#else
+    transform_bbox_by_ctm(bbox, state);
+    if (on_char_drawn)
+        on_char_drawn(bbox);
+#endif
+    if (DT_DEBUG)
+        printf("DrawingTracer::draw_char_bbox:[%f,%f,%f,%f]\n",bbox[0],bbox[1],bbox[2],bbox[3]);
+}
+
+// Report an image as non-char graphics. The box passed on is the
+// unit square (0,0)-(1,1), which draw_non_char_bbox() maps through the CTM.
+void DrawingTracer::draw_image(GfxState *state)
+{
+    if (param.correct_text_visibility)
+    {
+        double unit_square[4] {0, 0, 1, 1};
+        draw_non_char_bbox(state, unit_square);
+    }
+}
+
+// Compute the bounding box of a char that is being drawn and pass it to draw_char_bbox().
+//   x, y   : glyph-drawing position, in PDF text object space
+//   ax, ay : glyph advance, in glyph space
+// For fonts with writing mode 0 the box spans from the font descent to the
+// font ascent vertically; other writing modes keep the raw (0,0)-(ax,ay) box.
+void DrawingTracer::draw_char(GfxState *state, double x, double y, double ax, double ay)
+{
+    if (!param.correct_text_visibility)
+        return;
+
+    Matrix text_mat, inv_text_mat;
+    memcpy(text_mat.m, state->getTextMat(), sizeof(text_mat.m));
+
+    const double cur_x = state->getCurX();
+    const double cur_y = state->getCurY();
+    const double font_size = state->getFontSize();
+    const double rise = state->getRise();
+    const double h_scaling = state->getHorizScaling();
+
+    //cx and cy has been transformed by text matrix, we need to reverse them.
+    text_mat.invertTo(&inv_text_mat);
+    double char_cx, char_cy;
+    inv_text_mat.transform(cur_x, cur_y, &char_cx, &char_cy);
+
+    //TODO Vertical? Currently vertical/type3 chars are treated as non-chars.
+    double char_m[6] {font_size * h_scaling, 0, 0, font_size, char_cx + x, char_cy + y + rise};
+
+    double final_m[6];
+    tm_multiply(final_m, text_mat.m, char_m);
+
+    auto font = state->getFont();
+    double bbox[4] {0, 0, ax, ay};
+    const double descent = font->getDescent();
+    const double ascent = font->getAscent();
+    if (font->getWMode() == 0)
+    {
+        bbox[1] += descent;
+        bbox[3] += ascent;
+    }
+    //TODO Vertical? (non-zero writing mode leaves the box unchanged)
+
+    tm_transform_bbox(final_m, bbox);
+    draw_char_bbox(state, bbox);
+}
+
+
+// Transform bbox (4 doubles, in place) by the current transformation matrix.
+// With ENABLE_SVG the matrix tracked in the cairo context is used and `state`
+// is ignored; otherwise the matrix comes from state->getCTM().
+void DrawingTracer::transform_bbox_by_ctm(double * bbox, GfxState * state)
+{
+#if ENABLE_SVG
+    cairo_matrix_t traced;
+    cairo_get_matrix(cairo, &traced);
+    double ctm[6] {traced.xx, traced.yx, traced.xy, traced.yy, traced.x0, traced.y0};
+    tm_transform_bbox(ctm, bbox);
+#else
+    tm_transform_bbox(state->getCTM(), bbox);
+#endif
+}
+
+} /* namespace pdf2htmlEX */
diff --git a/src/DrawingTracer.h b/src/DrawingTracer.h
new file mode 100644
index 0000000..2e3159d
--- /dev/null
+++ b/src/DrawingTracer.h
@@ -0,0 +1,79 @@
+/*
+ * DrawingTracer.h
+ *
+ * Created on: 2014-6-15
+ * Author: duanyao
+ */
+
+#ifndef DRAWINGTRACER_H__
+#define DRAWINGTRACER_H__
+
+#include <functional>
+
+#include <GfxState.h>
+
+#include "pdf2htmlEX-config.h"
+
+#if ENABLE_SVG
+#include <cairo.h>
+#endif
+
+#include "Param.h"
+
+namespace pdf2htmlEX
+{
+
+class DrawingTracer // follows the drawing operations of a page and reports char / non-char boxes through the callbacks below
+{
+public:
+ /*
+ * The callback to receive drawn event.
+ * bbox in device space.
+ * Each callback is optional; an empty std::function is simply not called.
+ */
+ // a non-char graphics is drawn
+ std::function<void(double * bbox)> on_non_char_drawn;
+ // a char is drawn in the clip area
+ std::function<void(double * bbox)> on_char_drawn;
+ // a char is drawn out of/partially in the clip area
+ std::function<void(double * bbox, bool patially)> on_char_clipped;
+
+ DrawingTracer(const Param & param);
+ virtual ~DrawingTracer();
+ void reset(GfxState * state); // starts tracing a new page; no effect unless param.correct_text_visibility is set
+
+ /*
+ * A character is drawing
+ * x, y: glyph-drawing position, in PDF text object space.
+ * ax, ay: glyph advance, in glyph space.
+ */
+ void draw_char(GfxState * state, double x, double y, double ax, double ay);
+ /*
+ * An image is drawing
+ */
+ void draw_image(GfxState * state);
+ void update_ctm(GfxState * state, double m11, double m12, double m21, double m22, double m31, double m32); // applies the matrix change to the traced CTM
+ void clip(GfxState * state, bool even_odd = false);
+ void clip_to_stroke_path(GfxState * state); // not implemented yet (no effect)
+ void fill(GfxState * state, bool even_odd = false);
+ void stroke(GfxState * state);
+ void save();
+ void restore();
+
+private:
+ void finish(); // destroys the cairo context, if any
+ // Following methods operate in user space (just before CTM is applied)
+ void do_path(GfxState * state, GfxPath * path);
+ void draw_non_char_bbox(GfxState * state, double * bbox);
+ void draw_char_bbox(GfxState * state, double * bbox);
+ // If cairo is available, parameter state is ignored
+ void transform_bbox_by_ctm(double * bbox, GfxState * state = nullptr);
+
+ const Param & param; // held by reference, so the Param object must outlive the tracer
+
+#if ENABLE_SVG
+ cairo_t * cairo; // destroyed in finish(). NOTE(review): the class is still copyable, and a copy would destroy the same context twice -- consider deleting the copy operations
+#endif
+};
+
+} /* namespace pdf2htmlEX */
+#endif /* DRAWINGTRACER_H__ */
diff --git a/src/HTMLRenderer/HTMLRenderer.h b/src/HTMLRenderer/HTMLRenderer.h
new file mode 100644
index 0000000..18e395d
--- /dev/null
+++ b/src/HTMLRenderer/HTMLRenderer.h
@@ -0,0 +1,348 @@
+/*
+ * HTMLRenderer.h
+ *
+ * Copyright (C) 2012,2013 Lu Wang <coolwanglu@gmail.com>
+ */
+
+#ifndef HTMLRENDERER_H_
+#define HTMLRENDERER_H_
+
+#include <unordered_map>
+#include <cstdint>
+#include <fstream>
+#include <memory>
+
+#include <OutputDev.h>
+#include <GfxState.h>
+#include <Stream.h>
+#include <PDFDoc.h>
+#include <goo/gtypes.h>
+#include <Object.h>
+#include <GfxFont.h>
+#include <Annot.h>
+
+// for form.cc
+#include <Page.h>
+#include <Form.h>
+
+#include "pdf2htmlEX-config.h"
+
+#include "Param.h"
+#include "Preprocessor.h"
+#include "StringFormatter.h"
+#include "TmpFiles.h"
+#include "Color.h"
+#include "StateManager.h"
+#include "HTMLTextPage.h"
+
+#include "BackgroundRenderer/BackgroundRenderer.h"
+#include "CoveredTextDetector.h"
+#include "DrawingTracer.h"
+
+#include "util/const.h"
+#include "util/misc.h"
+
+
+namespace pdf2htmlEX {
+
+/*
+ * HTMLRenderer
+ *
+ * An OutputDev subclass: it overrides the OutputDev callbacks listed below and keeps
+ * the PDF-side and HTML-side state that those callbacks update.
+ * process() is the public entry point taking the document.
+ */
+struct HTMLRenderer : OutputDev
+{
+ HTMLRenderer(const Param & param);
+ virtual ~HTMLRenderer();
+
+ void process(PDFDoc * doc);
+
+ ////////////////////////////////////////////////////
+ // OutputDev interface
+ ////////////////////////////////////////////////////
+
+ // Does this device use upside-down coordinates?
+ // (Upside-down means (0,0) is the top left corner of the page.)
+ virtual GBool upsideDown() { return gFalse; }
+
+ // Does this device use drawChar() or drawString()?
+ virtual GBool useDrawChar() { return gFalse; }
+
+ // Does this device use functionShadedFill(), axialShadedFill(), and
+ // radialShadedFill()? If this returns false, these shaded fills
+ // will be reduced to a series of other drawing operations.
+ // Only type 2 is accepted here.
+ virtual GBool useShadedFills(int type) { return (type == 2) ? gTrue: gFalse; }
+
+ // Does this device use beginType3Char/endType3Char? Otherwise,
+ // text in Type 3 fonts will be drawn with drawChar/drawString.
+ virtual GBool interpretType3Chars() { return gFalse; }
+
+ // Does this device need non-text content?
+ // Controlled by param.process_nontext.
+ virtual GBool needNonText() { return (param.process_nontext) ? gTrue: gFalse; }
+
+ // Does this device need to clip pages to the crop box even when the
+ // box is the crop box?
+ virtual GBool needClipToCropBox() { return gTrue; }
+
+ virtual void setDefaultCTM(double *ctm);
+
+ // Start a page.
+ virtual void startPage(int pageNum, GfxState *state, XRef * xref);
+
+ // End a page.
+ virtual void endPage();
+
+ /*
+ * To reduce false alarms:
+ * we just mark states as changed, and recheck whether they have really changed when we are about to output a new string
+ */
+
+ virtual void restoreState(GfxState * state);
+
+ virtual void saveState(GfxState *state);
+
+ virtual void updateAll(GfxState * state);
+
+ virtual void updateRise(GfxState * state);
+ virtual void updateTextPos(GfxState * state);
+ virtual void updateTextShift(GfxState * state, double shift);
+
+ virtual void updateFont(GfxState * state);
+ virtual void updateCTM(GfxState * state, double m11, double m12, double m21, double m22, double m31, double m32);
+ virtual void updateTextMat(GfxState * state);
+ virtual void updateHorizScaling(GfxState * state);
+
+ virtual void updateCharSpace(GfxState * state);
+ virtual void updateWordSpace(GfxState * state);
+
+ virtual void updateRender(GfxState * state);
+
+ virtual void updateFillColorSpace(GfxState * state);
+ virtual void updateStrokeColorSpace(GfxState * state);
+ virtual void updateFillColor(GfxState * state);
+ virtual void updateStrokeColor(GfxState * state);
+
+
+ /*
+ * Rendering
+ */
+
+ virtual void clip(GfxState * state);
+ virtual void eoClip(GfxState * state);
+ virtual void clipToStrokePath(GfxState * state);
+
+ virtual void drawString(GfxState * state, GooString * s);
+
+ virtual void drawImage(GfxState * state, Object * ref, Stream * str, int width, int height, GfxImageColorMap * colorMap, GBool interpolate, int *maskColors, GBool inlineImg);
+
+ virtual void drawSoftMaskedImage(GfxState *state, Object *ref, Stream *str,
+ int width, int height,
+ GfxImageColorMap *colorMap,
+ GBool interpolate,
+ Stream *maskStr,
+ int maskWidth, int maskHeight,
+ GfxImageColorMap *maskColorMap,
+ GBool maskInterpolate);
+
+ virtual void stroke(GfxState *state);
+ virtual void fill(GfxState *state);
+ virtual void eoFill(GfxState *state);
+ virtual GBool axialShadedFill(GfxState *state, GfxAxialShading *shading, double tMin, double tMax);
+
+ virtual void processLink(AnnotLink * al);
+
+ /*
+ * Covered text handling.
+ */
+ // Is a char (actually a glyph) covered by non-chars? Index is in drawing order in the current page.
+ // Does not fail on out-of-bound conditions, but returns false.
+ bool is_char_covered(int index);
+ // Count of chars (glyphs) drawn so far in the current page.
+ int get_char_count() { return (int)covered_text_detector.get_chars_covered().size(); }
+
+protected:
+ ////////////////////////////////////////////////////
+ // misc
+ ////////////////////////////////////////////////////
+ void pre_process(PDFDoc * doc);
+ void post_process(void);
+
+ void process_outline(void);
+ void process_outline_items(GooList * items);
+
+ void process_form(std::ofstream & out);
+
+ void set_stream_flags (std::ostream & out);
+
+ void dump_css(void);
+
+ // convert a LinkAction to a string that our Javascript code can understand
+ std::string get_linkaction_str(LinkAction *, std::string & detail);
+
+ ////////////////////////////////////////////////////
+ /*
+ * manage fonts
+ *
+ * In PDF: (install_*)
+ * embedded font: fonts embedded in PDF
+ * external font: fonts that have only names provided in PDF, the viewer should find a local font to match with
+ *
+ * In HTML: (export_*)
+ * remote font: to be retrieved from the web server
+ * remote default font: fallback styles for invalid fonts
+ * local font: to be substituted with a local (client side) font
+ */
+ ////////////////////////////////////////////////////
+ // both dump_* functions return the path of the dumped font file
+ std::string dump_embedded_font(GfxFont * font, FontInfo & info);
+ std::string dump_type3_font(GfxFont * font, FontInfo & info);
+ void embed_font(const std::string & filepath, GfxFont * font, FontInfo & info, bool get_metric_only = false);
+ const FontInfo * install_font(GfxFont * font);
+ void install_embedded_font(GfxFont * font, FontInfo & info);
+ void install_external_font (GfxFont * font, FontInfo & info);
+ void export_remote_font(const FontInfo & info, const std::string & suffix, GfxFont * font);
+ void export_remote_default_font(long long fn_id);
+ void export_local_font(const FontInfo & info, GfxFont * font, const std::string & original_font_name, const std::string & cssfont);
+
+ // depending on --embed***, either embed the content or add a link to it
+ // "type": specifies the file type; usually it is the suffix, in which case this parameter could be ""
+ // "copy": indicates whether to copy the file into dest_dir, if not embedded
+ void embed_file(std::ostream & out, const std::string & path, const std::string & type, bool copy);
+
+ ////////////////////////////////////////////////////
+ // state tracking
+ ////////////////////////////////////////////////////
+ // reset all states
+ void reset_state();
+ // reset all ***_changed flags
+ void reset_state_change();
+ // check updated states, and determine new_line_status
+ // make sure this function can be called several times consecutively without problem
+ void check_state_change(GfxState * state);
+ // prepare the line context (close old tags, open new tags)
+ // make sure the current HTML style is consistent with PDF
+ void prepare_text_line(GfxState * state);
+
+ ////////////////////////////////////////////////////
+ // PDF stuff
+ ////////////////////////////////////////////////////
+
+ XRef * xref;
+ PDFDoc * cur_doc;
+ Catalog * cur_catalog;
+ int pageNum;
+
+ double default_ctm[6];
+
+ /*
+ * The content of each page is first scaled with factor1 (>=1), then scaled back with factor2 (<=1)
+ *
+ * factor1 is multiplied with all metrics (height/width/font-size...), in order to improve accuracy
+ * factor2 is applied with css transform, and is exposed to Javascript
+ *
+ * factor1 & factor2 are determined according to zoom and font-size-multiplier
+ *
+ */
+ double text_zoom_factor (void) const { return text_scale_factor1 * text_scale_factor2; }
+ double text_scale_factor1;
+ double text_scale_factor2;
+
+ // 1px on screen should be printed as print_scale()pt
+ double print_scale (void) const { return 96.0 / DEFAULT_DPI / text_zoom_factor(); }
+
+
+ const Param & param;
+
+ ////////////////////////////////////////////////////
+ // PDF states
+ ////////////////////////////////////////////////////
+ // track the original (unscaled) values to determine scaling and merge lines
+ // current position
+ double cur_tx, cur_ty; // real text position, in text coords
+ double cur_font_size;
+ // this is CTM * TextMAT in PDF
+ // as we'll calculate the position of the origin separately
+ double cur_text_tm[6]; // unscaled
+
+ // change flags, cleared by reset_state_change()
+ bool all_changed;
+ bool ctm_changed;
+ bool rise_changed;
+ bool font_changed;
+ bool text_pos_changed;
+ bool text_mat_changed;
+ bool fill_color_changed;
+ bool hori_scale_changed;
+ bool word_space_changed;
+ bool letter_space_changed;
+ bool stroke_color_changed;
+ bool clip_changed;
+
+ ////////////////////////////////////////////////////
+ // HTML states
+ ////////////////////////////////////////////////////
+
+ // optimize for web
+ // we try to render the final font size directly
+ // to reduce the effect of ctm as much as possible
+
+ // the actual tm used is `real tm in PDF` scaled by 1/draw_text_scale,
+ // so everything rendered should be multiplied by draw_text_scale
+ double draw_text_scale;
+
+ // the position of the next char, in text coords
+ // this is the actual position (in HTML), which might be different from cur_tx/ty (in PDF)
+ // also keep in mind that they are not the final position, as they will be transformed by CTM (also true for cur_tx/ty)
+ double draw_tx, draw_ty;
+
+
+ ////////////////////////////////////////////////////
+ // styles & resources
+ ////////////////////////////////////////////////////
+ // managers store values actually used in HTML (i.e. scaled)
+ // font_info_map is keyed by the hashed font reference (0 is used for a null font, see install_font)
+ std::unordered_map<long long, FontInfo> font_info_map;
+ AllStateManager all_manager;
+ HTMLTextState cur_text_state;
+ HTMLLineState cur_line_state;
+ HTMLClipState cur_clip_state;
+
+ HTMLTextPage html_text_page;
+
+ enum NewLineState
+ {
+ NLS_NONE,
+ NLS_NEWSTATE,
+ NLS_NEWLINE,
+ NLS_NEWCLIP
+ } new_line_state;
+
+ // for font reencoding
+ std::vector<int32_t> cur_mapping; // slot -> Unicode value, -1 when unmapped
+ std::vector<char*> cur_mapping2; // slot -> glyph name, nullptr when unmapped
+ std::vector<int> width_list; // width of each char
+
+ Preprocessor preprocessor;
+
+ // manage temporary files
+ TmpFiles tmp_files;
+
+ // for string formatting
+ StringFormatter str_fmt;
+
+ // render background image
+ friend class SplashBackgroundRenderer; // ugly!
+#if ENABLE_SVG
+ friend class CairoBackgroundRenderer; // ugly!
+#endif
+
+ std::unique_ptr<BackgroundRenderer> bg_renderer, fallback_bg_renderer;
+
+ // output streams together with the paths they were opened on
+ struct {
+ std::ofstream fs;
+ std::string path;
+ } f_outline, f_pages, f_css;
+ std::ofstream * f_curpage;
+ std::string cur_page_filename;
+
+ static const std::string MANIFEST_FILENAME;
+
+ CoveredTextDetector covered_text_detector;
+ DrawingTracer tracer;
+};
+
+} //namespace pdf2htmlEX
+
+#endif /* HTMLRENDERER_H_ */
diff --git a/src/HTMLRenderer/draw.cc b/src/HTMLRenderer/draw.cc
new file mode 100644
index 0000000..6529418
--- /dev/null
+++ b/src/HTMLRenderer/draw.cc
@@ -0,0 +1,65 @@
+/*
+ * Draw.cc
+ *
+ * Handling path drawing
+ *
+ * by WangLu
+ * 2012.10.01
+ */
+
+#include <algorithm>
+#include <cmath>
+#include <sstream>
+#include <vector>
+#include <iostream>
+
+#include "HTMLRenderer.h"
+#include "util/misc.h"
+#include "util/math.h"
+#include "util/namespace.h"
+
+namespace pdf2htmlEX {
+
+using std::swap;
+using std::min;
+using std::max;
+using std::acos;
+using std::asin;
+using std::ostringstream;
+using std::sqrt;
+using std::vector;
+using std::ostream;
+
+// Graphics state restored: calls updateAll(state) first, then forwards to tracer.restore().
+// NOTE(review): presumably updateAll() flags every tracked state as changed so it is re-checked later — confirm in state.cc
+void HTMLRenderer::restoreState(GfxState * state)
+{
+ updateAll(state);
+ tracer.restore();
+}
+
+// Graphics state saved: forwards to tracer.save(); `state` itself is not used here.
+void HTMLRenderer::saveState(GfxState *state)
+{
+ tracer.save();
+}
+
+// Path stroked: the renderer only forwards the event to the drawing tracer.
+void HTMLRenderer::stroke(GfxState * state)
+{
+ tracer.stroke(state);
+}
+
+// Path filled: forwarded to the drawing tracer with its default even_odd argument.
+void HTMLRenderer::fill(GfxState * state)
+{
+ tracer.fill(state);
+}
+
+// Path filled with the even-odd rule: forwarded to the drawing tracer with even_odd = true.
+void HTMLRenderer::eoFill(GfxState * state)
+{
+ tracer.fill(state, true);
+}
+
+/*
+ * Axial shaded fill: reported to the drawing tracer as a plain fill.
+ * shading, tMin and tMax are not used.
+ *
+ * Always returns gTrue. HTMLRenderer::useShadedFills() accepts only type 2,
+ * so this is the only shaded-fill callback the renderer implements.
+ */
+GBool HTMLRenderer::axialShadedFill(GfxState *state, GfxAxialShading *shading, double tMin, double tMax)
+{
+    tracer.fill(state); //TODO correct?
+    return gTrue;
+}
+
+} // namespace pdf2htmlEX
diff --git a/src/HTMLRenderer/font.cc b/src/HTMLRenderer/font.cc
new file mode 100644
index 0000000..10ff215
--- /dev/null
+++ b/src/HTMLRenderer/font.cc
@@ -0,0 +1,1089 @@
+/*
+ * font.cc
+ *
+ * Font processing
+ *
+ * Copyright (C) 2012,2013 Lu Wang <coolwanglu@gmail.com>
+ */
+
+#include <iostream>
+#include <cmath>
+#include <algorithm>
+#include <sstream>
+#include <cctype>
+#include <unordered_set>
+
+#include <GlobalParams.h>
+#include <fofi/FoFiTrueType.h>
+#include <CharCodeToUnicode.h>
+
+#include "Param.h"
+#include "HTMLRenderer.h"
+#include "Base64Stream.h"
+
+#include "pdf2htmlEX-config.h"
+
+#include "util/namespace.h"
+#include "util/math.h"
+#include "util/misc.h"
+#include "util/ffw.h"
+#include "util/path.h"
+#include "util/unicode.h"
+#include "util/css_const.h"
+
+#if ENABLE_SVG
+#include <cairo.h>
+#include <cairo-ft.h>
+#include <cairo-svg.h>
+#include "CairoFontEngine.h"
+#include "CairoOutputDev.h"
+#include <Gfx.h>
+#endif
+
+namespace pdf2htmlEX {
+
+using std::min;
+using std::unordered_set;
+using std::cerr;
+using std::endl;
+
+/*
+ * Dump the font program embedded in the PDF for `font` into a file under param.tmp_dir.
+ *
+ * font: the font to dump; its object is fetched through xref
+ * info: only info.is_type3 and info.id are read; Type 3 fonts are delegated to dump_type3_font
+ *
+ * Returns the path of the written file, or an empty string when the font program
+ * could not be located (a message is printed to cerr in that case).
+ * Throws std::string if the output file cannot be opened.
+ *
+ * All poppler Objects used here are released on every exit path, including the
+ * one where an exception propagates to the caller.
+ */
+string HTMLRenderer::dump_embedded_font (GfxFont * font, FontInfo & info)
+{
+    if(info.is_type3)
+        return dump_type3_font(font, info);
+
+    Object obj, obj1, obj2;
+    Object font_obj, font_obj2, fontdesc_obj;
+    string suffix;
+    string filepath;
+
+    long long fn_id = info.id;
+
+    // single place that releases everything acquired below
+    auto free_objects = [&]()
+    {
+        obj2.free();
+        obj1.free();
+        obj.free();
+
+        fontdesc_obj.free();
+        font_obj2.free();
+        font_obj.free();
+    };
+
+    try
+    {
+        // inspired by mupdf
+        string subtype;
+
+        auto * id = font->getID();
+
+        Object ref_obj;
+        ref_obj.initRef(id->num, id->gen);
+        ref_obj.fetch(xref, &font_obj);
+        ref_obj.free();
+
+        if(!font_obj.isDict())
+        {
+            cerr << "Font object is not a dictionary" << endl;
+            throw 0;
+        }
+
+        // for composite fonts the descriptor lives in the first descendant font
+        Dict * dict = font_obj.getDict();
+        if(dict->lookup("DescendantFonts", &font_obj2)->isArray())
+        {
+            if(font_obj2.arrayGetLength() == 0)
+            {
+                cerr << "Warning: empty DescendantFonts array" << endl;
+            }
+            else
+            {
+                if(font_obj2.arrayGetLength() > 1)
+                    cerr << "TODO: multiple entries in DescendantFonts array" << endl;
+
+                if(font_obj2.arrayGet(0, &obj2)->isDict())
+                {
+                    dict = obj2.getDict();
+                }
+            }
+        }
+
+        if(!dict->lookup("FontDescriptor", &fontdesc_obj)->isDict())
+        {
+            cerr << "Cannot find FontDescriptor " << endl;
+            throw 0;
+        }
+
+        dict = fontdesc_obj.getDict();
+
+        // pick the font stream and derive the file suffix from its kind
+        if(dict->lookup("FontFile3", &obj)->isStream())
+        {
+            if(obj.streamGetDict()->lookup("Subtype", &obj1)->isName())
+            {
+                subtype = obj1.getName();
+                if(subtype == "Type1C")
+                {
+                    suffix = ".cff";
+                }
+                else if (subtype == "CIDFontType0C")
+                {
+                    suffix = ".cid";
+                }
+                else if (subtype == "OpenType")
+                {
+                    suffix = ".otf";
+                }
+                else
+                {
+                    cerr << "Unknown subtype: " << subtype << endl;
+                    throw 0;
+                }
+            }
+            else
+            {
+                cerr << "Invalid subtype in font descriptor" << endl;
+                throw 0;
+            }
+        }
+        else if (dict->lookup("FontFile2", &obj)->isStream())
+        {
+            suffix = ".ttf";
+        }
+        else if (dict->lookup("FontFile", &obj)->isStream())
+        {
+            suffix = ".pfa";
+        }
+        else
+        {
+            cerr << "Cannot find FontFile for dump" << endl;
+            throw 0;
+        }
+
+        if(suffix == "")
+        {
+            cerr << "Font type unrecognized" << endl;
+            throw 0;
+        }
+
+        obj.streamReset();
+
+        filepath = (char*)str_fmt("%s/f%llx%s", param.tmp_dir.c_str(), fn_id, suffix.c_str());
+        tmp_files.add(filepath);
+
+        ofstream outf(filepath, ofstream::binary);
+        if(!outf)
+            throw string("Cannot open file ") + filepath + " for writing";
+
+        // copy the stream to the file in 1 KiB chunks
+        char buf[1024];
+        int len;
+        while((len = obj.streamGetChars(1024, (Guchar*)buf)) > 0)
+        {
+            outf.write(buf, len);
+        }
+        obj.streamClose();
+    }
+    catch(int)
+    {
+        // recoverable: every `throw 0` above happens before filepath is assigned,
+        // so the caller receives an empty path
+        cerr << "Something wrong when trying to dump font " << hex << fn_id << dec << endl;
+    }
+    catch(...)
+    {
+        // anything else is passed on unchanged, but not before releasing the Objects
+        free_objects();
+        throw;
+    }
+
+    free_objects();
+
+    return filepath;
+}
+
+/*
+ * Dump a Type 3 font: every used glyph is rendered into its own SVG file through cairo,
+ * the SVG glyphs are imported into a new font through the ffw_* wrappers,
+ * and the result is saved as a .ttf file under param.tmp_dir.
+ *
+ * font: the Type 3 font (info.is_type3 is asserted)
+ * info: info.id is read; info.font_size_scale is set to the longer edge of the transformed font bbox
+ *
+ * Returns the path of the generated font file, or "" when built without ENABLE_SVG.
+ * Throws std::string when cairo reports an error.
+ */
+string HTMLRenderer::dump_type3_font (GfxFont * font, FontInfo & info)
+{
+ assert(info.is_type3);
+
+#if ENABLE_SVG
+ long long fn_id = info.id;
+
+ // NOTE(review): no matching FT_Done_FreeType() is visible in this function — confirm the library handle is released elsewhere
+ FT_Library ft_lib;
+ FT_Init_FreeType(&ft_lib);
+ CairoFontEngine font_engine(ft_lib);
+ auto * cur_font = font_engine.getFont(font, cur_doc, true, xref);
+ auto used_map = preprocessor.get_code_map(hash_ref(font->getID()));
+
+ //calculate transformed metrics
+ double * font_bbox = font->getFontBBox();
+ double * font_matrix = font->getFontMatrix();
+ double transformed_bbox[4];
+ memcpy(transformed_bbox, font_bbox, 4 * sizeof(double));
+ /*
+ // add the origin to the bbox
+ if(transformed_bbox[0] > 0) transformed_bbox[0] = 0;
+ if(transformed_bbox[1] > 0) transformed_bbox[1] = 0;
+ if(transformed_bbox[2] < 0) transformed_bbox[2] = 0;
+ if(transformed_bbox[3] < 0) transformed_bbox[3] = 0;
+ */
+ tm_transform_bbox(font_matrix, transformed_bbox);
+ double transformed_bbox_width = transformed_bbox[2] - transformed_bbox[0];
+ double transformed_bbox_height = transformed_bbox[3] - transformed_bbox[1];
+ info.font_size_scale = std::max(transformed_bbox_width, transformed_bbox_height);
+
+ // we want the glyphs to be rendered in a box of size around GLYPH_DUMP_EM_SIZE x GLYPH_DUMP_EM_SIZE
+ // for rectangles, the longer edge should be GLYPH_DUMP_EM_SIZE
+ // NOTE(review): if the transformed bbox is empty, font_size_scale is 0 and the division below is a division by zero — confirm such fonts never reach here
+ const double GLYPH_DUMP_EM_SIZE = 100.0;
+ double scale = GLYPH_DUMP_EM_SIZE / info.font_size_scale;
+
+ // we choose ttf as it does not use char names
+ // or actually we don't use char names for ttf (see embed_font)
+ ffw_new_font();
+ // dump each glyph into svg and combine them
+ for(int code = 0; code < 256; ++code)
+ {
+ if(!used_map[code]) continue;
+
+ cairo_surface_t * surface = nullptr;
+
+ string glyph_filename = (char*)str_fmt("%s/f%llx-%x.svg", param.tmp_dir.c_str(), fn_id, code);
+ tmp_files.add(glyph_filename);
+
+ surface = cairo_svg_surface_create(glyph_filename.c_str(), transformed_bbox_width * scale, transformed_bbox_height * scale);
+
+ cairo_svg_surface_restrict_to_version(surface, CAIRO_SVG_VERSION_1_2);
+ cairo_surface_set_fallback_resolution(surface, param.h_dpi, param.v_dpi);
+ cairo_t * cr = cairo_create(surface);
+
+ // track the position of the origin
+ double ox, oy;
+ ox = oy = 0.0;
+
+ auto glyph_width = ((Gfx8BitFont*)font)->getWidth(code);
+
+#if 1
+ {
+ // paint the glyph
+ cairo_set_font_face(cr, cur_font->getFontFace());
+
+ cairo_matrix_t m1, m2, m3;
+ // set up m1
+ // m1 shifts the bottom-left corner of the glyph bbox to the origin
+ // also set font size to scale
+ cairo_matrix_init_translate(&m1, -transformed_bbox[0], transformed_bbox[1]);
+ cairo_matrix_init_scale(&m2, scale, scale);
+ cairo_matrix_multiply(&m1, &m1, &m2);
+ cairo_set_font_matrix(cr, &m1);
+
+ cairo_glyph_t glyph;
+ glyph.index = cur_font->getGlyph(code, nullptr, 0);
+ glyph.x = 0;
+ glyph.y = GLYPH_DUMP_EM_SIZE;
+ cairo_show_glyphs(cr, &glyph, 1);
+
+
+ // apply the type 3 font's font matrix before m1
+ // such that we get the mapping from type 3 font space to user space, then we will be able to calculate the mapped position for ox,oy and glyph_width
+ cairo_matrix_init(&m2, font_matrix[0], font_matrix[1], font_matrix[2], font_matrix[3], font_matrix[4], font_matrix[5]);
+ cairo_matrix_init_scale(&m3, 1, -1);
+ cairo_matrix_multiply(&m2, &m2, &m3);
+ cairo_matrix_multiply(&m2, &m2, &m1);
+
+ cairo_matrix_transform_point(&m2, &ox, &oy);
+ double dummy = 0;
+ cairo_matrix_transform_distance(&m2, &glyph_width, &dummy);
+ }
+#else
+ // alternative implementation, currently compiled out by the `#if 1` above
+ {
+ // manually draw the char to get the metrics
+ // adapted from _render_type3_glyph of poppler
+ cairo_matrix_t ctm, m, m1;
+ cairo_matrix_init_identity(&ctm);
+
+ // apply font-matrix
+ cairo_matrix_init(&m, font_matrix[0], font_matrix[1], font_matrix[2], font_matrix[3], font_matrix[4], font_matrix[5]);
+ cairo_matrix_multiply(&ctm, &ctm, &m);
+
+ // shift origin
+ cairo_matrix_init_translate(&m1, -transformed_bbox[0], -transformed_bbox[1]);
+ cairo_matrix_multiply(&ctm, &ctm, &m1);
+
+ // flip vertically because glyph coordinates and cairo coordinates differ in orientation
+ cairo_matrix_init_scale(&m1, 1, -1);
+ cairo_matrix_multiply(&ctm, &ctm, &m1);
+ // save m*m1 to m1 for later use
+ cairo_matrix_multiply(&m1, &m, &m1);
+
+ // shift up to the bounding box
+ cairo_matrix_init_translate(&m, 0.0, transformed_bbox_height);
+ cairo_matrix_multiply(&ctm, &ctm, &m);
+
+ // scale up
+ cairo_matrix_init_scale(&m, scale, scale);
+ cairo_matrix_multiply(&ctm, &ctm, &m);
+
+ // set ctm
+ cairo_set_matrix(cr, &ctm);
+
+ // calculate the position of origin
+ cairo_matrix_transform_point(&ctm, &ox, &oy);
+ oy -= transformed_bbox_height * scale;
+ // calculate glyph width
+ double dummy = 0;
+ cairo_matrix_transform_distance(&ctm, &glyph_width, &dummy);
+
+ // draw the glyph
+ auto output_dev = new CairoOutputDev();
+ output_dev->setCairo(cr);
+ output_dev->setPrinting(true);
+
+ PDFRectangle box;
+ box.x1 = font_bbox[0];
+ box.y1 = font_bbox[1];
+ box.x2 = font_bbox[2];
+ box.y2 = font_bbox[3];
+ auto gfx = new Gfx(cur_doc, output_dev,
+ ((Gfx8BitFont*)font)->getResources(),
+ &box, nullptr);
+ output_dev->startDoc(cur_doc, &font_engine);
+ output_dev->startPage(1, gfx->getState(), gfx->getXRef());
+ output_dev->setInType3Char(gTrue);
+ auto char_procs = ((Gfx8BitFont*)font)->getCharProcs();
+ Object char_proc_obj;
+ auto glyph_index = cur_font->getGlyph(code, nullptr, 0);
+ gfx->display(char_procs->getVal(glyph_index, &char_proc_obj));
+
+ char_proc_obj.free();
+ delete gfx;
+ delete output_dev;
+ }
+#endif
+
+ // the status is read before the object is destroyed, and the throw happens only afterwards
+ // NOTE(review): throwing here skips the ffw_close() that follows the loop — confirm that is acceptable
+ {
+ auto status = cairo_status(cr);
+ cairo_destroy(cr);
+ if(status)
+ throw string("Cairo error: ") + cairo_status_to_string(status);
+ }
+ cairo_surface_finish(surface);
+ {
+ auto status = cairo_surface_status(surface);
+ cairo_surface_destroy(surface);
+ surface = nullptr;
+ if(status)
+ throw string("Error in cairo: ") + cairo_status_to_string(status);
+ }
+
+ // origin and advance are passed relative to GLYPH_DUMP_EM_SIZE
+ ffw_import_svg_glyph(code, glyph_filename.c_str(), ox / GLYPH_DUMP_EM_SIZE, -oy / GLYPH_DUMP_EM_SIZE, glyph_width / GLYPH_DUMP_EM_SIZE);
+ }
+
+ string font_filename = (char*)str_fmt("%s/f%llx.ttf", param.tmp_dir.c_str(), fn_id);
+ tmp_files.add(font_filename);
+ ffw_save(font_filename.c_str());
+ ffw_close();
+
+ return font_filename;
+#else
+ return "";
+#endif
+}
+
+/*
+ * Load the font file at `filepath` through the ffw_* wrappers, re-encode it,
+ * fix its widths and metrics, and save it in param.font_format.
+ *
+ * filepath: the font file to load (e.g. the result of dump_embedded_font)
+ * font: the PDF font the file belongs to; used for encoding, ToUnicode and widths
+ * info: in/out. info.id, info.is_type3 and info.font_size_scale are read;
+ * use_tounicode, em_size, space_width, ascent and descent are written
+ * get_metric_only: when true, return right after ascent/descent have been retrieved,
+ * before any re-encoding or saving
+ *
+ * The output goes to param.tmp_dir when param.embed_font is set, otherwise to param.dest_dir.
+ */
+void HTMLRenderer::embed_font(const string & filepath, GfxFont * font, FontInfo & info, bool get_metric_only)
+{
+ if(param.debug)
+ {
+ cerr << "Embed font: " << filepath << " " << info.id << endl;
+ }
+
+ ffw_load_font(filepath.c_str());
+ ffw_prepare_font();
+
+ if(param.debug)
+ {
+ // keep a copy of the untouched font file for inspection
+ auto fn = str_fmt("%s/__raw_font_%llx%s", param.tmp_dir.c_str(), info.id, get_suffix(filepath).c_str());
+ tmp_files.add((char*)fn);
+ ofstream((char*)fn, ofstream::binary) << ifstream(filepath).rdbuf();
+ }
+
+ // NOTE(review): code2GID may point to a table returned by getCodeToGIDMap() or to one returned by getCIDToGID(); it is never released in this function — confirm ownership
+ int * code2GID = nullptr;
+ int code2GID_len = 0;
+ int maxcode = 0;
+
+ Gfx8BitFont * font_8bit = nullptr;
+ GfxCIDFont * font_cid = nullptr;
+
+ string suffix = get_suffix(filepath);
+ // tolower() requires a value representable as unsigned char, so convert explicitly
+ for(auto & c : suffix)
+ c = static_cast<char>(tolower(static_cast<unsigned char>(c)));
+
+ /*
+ * if param.tounicode is 0, try the provided tounicode map first
+ */
+ info.use_tounicode = (param.tounicode >= 0);
+ bool has_space = false;
+
+ const char * used_map = nullptr;
+
+ info.em_size = ffw_get_em_size();
+
+ if(param.debug)
+ {
+ cerr << "em size: " << info.em_size << endl;
+ }
+
+ info.space_width = 0;
+
+ if(!font->isCIDFont())
+ {
+ font_8bit = dynamic_cast<Gfx8BitFont*>(font);
+ }
+ else
+ {
+ font_cid = dynamic_cast<GfxCIDFont*>(font);
+ }
+
+ if(get_metric_only)
+ {
+ ffw_fix_metric();
+ ffw_get_metric(&info.ascent, &info.descent);
+ ffw_close();
+ return;
+ }
+
+ used_map = preprocessor.get_code_map(hash_ref(font->getID()));
+
+ /*
+ * Step 1
+ * dump the font file directly from the font descriptor and put the glyphs into the correct slots
+ *
+ * for 8bit + nonTrueType
+ * re-encoding the font by glyph names
+ *
+ * for 8bit + TrueType
+ * sort the glyphs as the original order, and load the code2GID table
+ * later we will map GID (instead of char code) to Unicode
+ *
+ * for CID + nonTrueType
+ * Flatten the font
+ *
+ * for CID Truetype
+ * same as 8bitTrueType, except for that we have to check 65536 charcodes
+ * use the embedded code2GID table if there is, otherwise use the one in the font
+ */
+ if(font_8bit)
+ {
+ maxcode = 0xff;
+ if(is_truetype_suffix(suffix))
+ {
+ if(info.is_type3)
+ {
+ /*
+ * Type 3 fonts are saved and converted into ttf fonts
+ * encoded based on code points instead of GID
+ *
+ * I thought code2GID would work but it never works, and I don't know why
+ * Anyway we can disable code2GID such that the following procedure will be working based on code points instead of GID
+ */
+ }
+ else
+ {
+ ffw_reencode_glyph_order();
+ if(FoFiTrueType * fftt = FoFiTrueType::load((char*)filepath.c_str()))
+ {
+ code2GID = font_8bit->getCodeToGIDMap(fftt);
+ code2GID_len = 256;
+ delete fftt;
+ }
+ }
+ }
+ else
+ {
+ // move the slot such that it's consistent with the encoding seen in PDF
+ unordered_set<string> nameset;
+ bool name_conflict_warned = false;
+
+ std::fill(cur_mapping2.begin(), cur_mapping2.end(), (char*)nullptr);
+
+ for(int i = 0; i < 256; ++i)
+ {
+ if(!used_map[i]) continue;
+
+ auto cn = font_8bit->getCharName(i);
+ if(cn == nullptr)
+ {
+ continue;
+ }
+ else
+ {
+ // only the first code using a glyph name keeps it
+ if(nameset.insert(string(cn)).second)
+ {
+ cur_mapping2[i] = cn;
+ }
+ else
+ {
+ if(!name_conflict_warned)
+ {
+ name_conflict_warned = true;
+ //TODO: may be resolved using advanced font properties?
+ cerr << "Warning: encoding conflict detected in font: " << hex << info.id << dec << endl;
+ }
+ }
+ }
+ }
+
+ ffw_reencode_raw2(cur_mapping2.data(), 256, 0);
+ }
+ }
+ else
+ {
+ maxcode = 0xffff;
+
+ if(is_truetype_suffix(suffix))
+ {
+ ffw_reencode_glyph_order();
+
+ GfxCIDFont * _font = dynamic_cast<GfxCIDFont*>(font);
+
+ // To locate CID2GID for the font
+ // as in CairoFontEngine.cc
+ if((code2GID = _font->getCIDToGID()))
+ {
+ // use the mapping stored in _font
+ code2GID_len = _font->getCIDToGIDLen();
+ }
+ else
+ {
+ // use the mapping stored in the file
+ if(FoFiTrueType * fftt = FoFiTrueType::load((char*)filepath.c_str()))
+ {
+ code2GID = _font->getCodeToGIDMap(fftt, &code2GID_len);
+ delete fftt;
+ }
+ }
+ }
+ else
+ {
+ // TODO: add an option to load the table?
+ ffw_cidflatten();
+ }
+ }
+
+ /*
+ * Step 2
+ * - map charcode (or GID for CID truetype)
+ *
+ * -> Always map to Unicode for 8bit TrueType fonts and CID fonts
+ *
+ * -> For 8bit nonTruetype fonts:
+ * Try to calculate the correct Unicode value from the glyph names, when collision is detected in ToUnicode Map
+ *
+ * - Fill in the width_list, and set widths accordingly
+ */
+
+
+ {
+ string map_filename;
+ ofstream map_outf;
+ if(param.debug)
+ {
+ map_filename = (char*)str_fmt("%s/f%llx.map", param.tmp_dir.c_str(), info.id);
+ tmp_files.add(map_filename);
+ map_outf.open(map_filename);
+ }
+
+ unordered_set<int> codeset;
+ bool name_conflict_warned = false;
+
+ auto ctu = font->getToUnicode();
+ std::fill(cur_mapping.begin(), cur_mapping.end(), -1);
+ std::fill(width_list.begin(), width_list.end(), -1);
+
+ if(code2GID)
+ maxcode = min<int>(maxcode, code2GID_len - 1);
+
+ bool is_truetype = is_truetype_suffix(suffix);
+ // the largest slot written into cur_mapping/width_list so far
+ int max_key = maxcode;
+ /*
+ * Traverse all possible codes
+ */
+ bool retried = false; // avoid infinite loop
+ for(int cur_code = 0; cur_code <= maxcode; ++cur_code)
+ {
+ if(!used_map[cur_code])
+ continue;
+
+ /*
+ * Skip glyphs without names (only for non-ttf fonts)
+ */
+ if(!is_truetype && (font_8bit != nullptr)
+ && (font_8bit->getCharName(cur_code) == nullptr))
+ {
+ continue;
+ }
+
+ int mapped_code = cur_code;
+ if(code2GID)
+ {
+ // for fonts with GID (e.g. TTF) we need to map GIDs instead of codes
+ if((mapped_code = code2GID[cur_code]) == 0) continue;
+ }
+
+ if(mapped_code > max_key)
+ max_key = mapped_code;
+
+ Unicode u, *pu=&u;
+ if(info.use_tounicode)
+ {
+ int n = ctu ? (ctu->mapToUnicode(cur_code, &pu)) : 0;
+ u = check_unicode(pu, n, cur_code, font);
+ }
+ else
+ {
+ u = unicode_from_font(cur_code, font);
+ }
+
+ if(codeset.insert(u).second)
+ {
+ cur_mapping[mapped_code] = u;
+ }
+ else
+ {
+ // collision detected
+ if(param.tounicode == 0)
+ {
+ // in auto mode, just drop the tounicode map
+ if(!retried)
+ {
+ cerr << "ToUnicode CMap is not valid and got dropped for font: " << hex << info.id << dec << endl;
+ retried = true;
+ codeset.clear();
+ info.use_tounicode = false;
+ std::fill(cur_mapping.begin(), cur_mapping.end(), -1);
+ std::fill(width_list.begin(), width_list.end(), -1);
+ // forget any space glyph seen in the aborted pass, since the mapping that produced it has just been discarded
+ has_space = false;
+ info.space_width = 0;
+ // restart the traversal: the loop increment brings cur_code back to 0
+ cur_code = -1;
+ if(param.debug)
+ {
+ map_outf.close();
+ map_outf.open(map_filename);
+ }
+ continue;
+ }
+ }
+ if(!name_conflict_warned)
+ {
+ name_conflict_warned = true;
+ //TODO: may be resolved using advanced font properties?
+ cerr << "Warning: encoding confliction detected in font: " << hex << info.id << dec << endl;
+ }
+ }
+
+ {
+ double cur_width = 0;
+ if(font_8bit)
+ {
+ cur_width = font_8bit->getWidth(cur_code);
+ }
+ else
+ {
+ // CID fonts take the code as two bytes, high byte first
+ char buf[2];
+ buf[0] = (cur_code >> 8) & 0xff;
+ buf[1] = (cur_code & 0xff);
+ cur_width = font_cid->getWidth(buf, 2) ;
+ }
+
+ cur_width /= info.font_size_scale;
+
+ if(u == ' ')
+ {
+ /*
+ * Internet Explorer will ignore `word-spacing` if
+ * the width of the 'space' glyph is 0
+ *
+ * space_width==0 often means no spaces are used in the PDF
+ * so setting it to be 0.001 should be safe
+ */
+ if(equal(cur_width, 0))
+ cur_width = 0.001;
+
+ info.space_width = cur_width;
+ has_space = true;
+ }
+
+ // widths are stored in em units, rounded to the nearest integer
+ width_list[mapped_code] = (int)floor(cur_width * info.em_size + 0.5);
+ }
+
+ if(param.debug)
+ {
+ map_outf << hex << cur_code << ' ' << mapped_code << ' ' << u << endl;
+ }
+ }
+
+ ffw_set_widths(width_list.data(), max_key + 1, param.stretch_narrow_glyph, param.squeeze_wide_glyph);
+
+ ffw_reencode_raw(cur_mapping.data(), max_key + 1, 1);
+
+ // In some space offsets in HTML, we insert a ' ' there in order to improve text copy&paste
+ // We need to make sure that ' ' is in the font, otherwise it would be very ugly if you select the text
+ // Might be a problem if ' ' is in the font, but not empty
+ if(!has_space)
+ {
+ if(font_8bit)
+ {
+ info.space_width = font_8bit->getWidth(' ');
+ }
+ else
+ {
+ char buf[2] = {0, ' '};
+ info.space_width = font_cid->getWidth(buf, 2);
+ }
+ info.space_width /= info.font_size_scale;
+
+ /* See comments above */
+ if(equal(info.space_width,0))
+ info.space_width = 0.001;
+
+ ffw_add_empty_char((int32_t)' ', (int)floor(info.space_width * info.em_size + 0.5));
+ if(param.debug)
+ {
+ cerr << "Missing space width in font " << hex << info.id << ": set to " << dec << info.space_width << endl;
+ }
+ }
+
+ if(param.debug)
+ {
+ cerr << "space width: " << info.space_width << endl;
+ }
+
+ if(ctu)
+ ctu->decRefCnt();
+ }
+
+ /*
+ * Step 3
+ * Generate the font as desired
+ */
+
+ // Reencode to Unicode Full such that FontForge won't ditch unicode values larger than 0xFFFF
+ ffw_reencode_unicode_full();
+
+ // Due to a bug of Fontforge about pfa -> woff conversion
+ // we always generate TTF first, instead of the format specified by user
+ string cur_tmp_fn = (char*)str_fmt("%s/__tmp_font1.%s", param.tmp_dir.c_str(), "ttf");
+ tmp_files.add(cur_tmp_fn);
+ string other_tmp_fn = (char*)str_fmt("%s/__tmp_font2.%s", param.tmp_dir.c_str(), "ttf");
+ tmp_files.add(other_tmp_fn);
+
+ ffw_save(cur_tmp_fn.c_str());
+
+ ffw_close();
+
+ /*
+ * Step 4
+ * Font Hinting
+ */
+ bool hinted = false;
+
+ // Call external hinting program if specified
+ // it is invoked as: <tool> "<input font>" "<output font>", and exit status 0 counts as success
+ if(param.external_hint_tool != "")
+ {
+ hinted = (system((char*)str_fmt("%s \"%s\" \"%s\"", param.external_hint_tool.c_str(), cur_tmp_fn.c_str(), other_tmp_fn.c_str())) == 0);
+ }
+
+ // Call internal hinting procedure if specified
+ if((!hinted) && (param.auto_hint))
+ {
+ ffw_load_font(cur_tmp_fn.c_str());
+ ffw_auto_hint();
+ ffw_save(other_tmp_fn.c_str());
+ ffw_close();
+ hinted = true;
+ }
+
+ // the hinted result was written to other_tmp_fn, so make it the current file
+ if(hinted)
+ {
+ swap(cur_tmp_fn, other_tmp_fn);
+ }
+
+ /*
+ * Step 5
+ * Generate the font, load the metrics and set the embedding bits (fstype)
+ *
+ * Ascent/Descent are not used in PDF, and the values in PDF may be wrong or inconsistent (there are 3 sets of them)
+ * We need to reload in order to retrieve/fix accurate ascent/descent, some info won't be written to the font by fontforge until saved.
+ */
+ string fn = (char*)str_fmt("%s/f%llx.%s",
+ (param.embed_font ? param.tmp_dir : param.dest_dir).c_str(),
+ info.id, param.font_format.c_str());
+
+ if(param.embed_font)
+ tmp_files.add(fn);
+
+ ffw_load_font(cur_tmp_fn.c_str());
+ ffw_fix_metric();
+ ffw_get_metric(&info.ascent, &info.descent);
+ if(param.override_fstype)
+ ffw_override_fstype();
+ ffw_save(fn.c_str());
+
+ ffw_close();
+}
+
+
+/*
+ * Return the FontInfo for `font`, installing the font on first use.
+ *
+ * Installed fonts are remembered in font_info_map, keyed by hash_ref(font->getID())
+ * (key 0 is used for a null font). The id stored in the FontInfo is the number of
+ * entries the map held before the insertion.
+ * Depending on the kind of font and where it is located, the work is delegated to
+ * install_embedded_font / install_external_font, or a fallback rule is written
+ * with export_remote_default_font.
+ */
+const FontInfo * HTMLRenderer::install_font(GfxFont * font)
+{
+ assert(sizeof(long long) == 2*sizeof(int));
+
+ long long fn_id = (font == nullptr) ? 0 : hash_ref(font->getID());
+
+ // already installed: reuse the existing record
+ auto iter = font_info_map.find(fn_id);
+ if(iter != font_info_map.end())
+ return &(iter->second);
+
+ // must be read before the insertion below changes the size
+ long long new_fn_id = font_info_map.size();
+
+ auto cur_info_iter = font_info_map.insert(make_pair(fn_id, FontInfo())).first;
+
+ FontInfo & new_font_info = cur_info_iter->second;
+ new_font_info.id = new_fn_id;
+ new_font_info.use_tounicode = true;
+ new_font_info.font_size_scale = 1.0;
+
+ // no font at all: record zero metrics and emit the fallback rule
+ if(font == nullptr)
+ {
+ new_font_info.em_size = 0;
+ new_font_info.space_width = 0;
+ new_font_info.ascent = 0;
+ new_font_info.descent = 0;
+ new_font_info.is_type3 = false;
+
+ export_remote_default_font(new_fn_id);
+
+ return &(new_font_info);
+ }
+
+ new_font_info.ascent = font->getAscent();
+ new_font_info.descent = font->getDescent();
+ new_font_info.is_type3 = (font->getType() == fontType3);
+
+ if(param.debug)
+ {
+ // getName() may be null, hence the guard
+ cerr << "Install font " << hex << new_fn_id << dec
+ << ": (" << (font->getID()->num) << ' ' << (font->getID()->gen) << ") "
+ << (font->getName() ? font->getName()->getCString() : "")
+ << endl;
+ }
+
+ if(new_font_info.is_type3)
+ {
+#if ENABLE_SVG
+ if(param.process_type3)
+ {
+ install_embedded_font(font, new_font_info);
+ }
+ else
+ {
+ export_remote_default_font(new_fn_id);
+ }
+#else
+ cerr << "Type 3 fonts are unsupported and will be rendered as Image" << endl;
+ export_remote_default_font(new_fn_id);
+#endif
+ return &new_font_info;
+ }
+ // a non-zero writing mode is not handled: only the fallback rule is emitted
+ if(font->getWMode()) {
+ cerr << "Writing mode is unsupported and will be rendered as Image" << endl;
+ export_remote_default_font(new_fn_id);
+ return &new_font_info;
+ }
+
+ /*
+ * The 2nd parameter of locateFont should be true only for PS
+ * which does not make much sense in our case
+ * If we specify gFalse here, font_loc->locaType cannot be gfxFontLocResident
+ */
+ if(auto * font_loc = font->locateFont(xref, nullptr))
+ {
+ switch(font_loc -> locType)
+ {
+ case gfxFontLocEmbedded:
+ install_embedded_font(font, new_font_info);
+ break;
+ case gfxFontLocResident:
+ std::cerr << "Warning: Base 14 fonts should not be specially handled now. Please report a bug!" << std::endl;
+ /* fall through */
+ case gfxFontLocExternal:
+ install_external_font(font, new_font_info);
+ break;
+ default:
+ cerr << "TODO: other font loc" << endl;
+ export_remote_default_font(new_fn_id);
+ break;
+ }
+ // the location object returned by locateFont is released here
+ delete font_loc;
+ }
+ else
+ {
+ export_remote_default_font(new_fn_id);
+ }
+
+ return &new_font_info;
+}
+
+/*
+ * Install a font through dump_embedded_font.
+ * An empty path returned by dump_embedded_font means there is nothing to
+ * embed, in which case only the fallback CSS rule is written.
+ */
+void HTMLRenderer::install_embedded_font(GfxFont * font, FontInfo & info)
+{
+    auto dumped_path = dump_embedded_font(font, info);
+
+    if(dumped_path == "")
+    {
+        export_remote_default_font(info.id);
+        return;
+    }
+
+    embed_font(dumped_path, font, info);
+    export_remote_font(info, param.font_format, font);
+}
+
+/*
+ * Install a font that is located outside of the PDF.
+ *
+ * When param.embed_external_font is set and the font file can be located,
+ * the file is processed with embed_font and exported as a remote font.
+ * Otherwise the font is exported by name (export_local_font); if the file
+ * can be located it is still passed to embed_font to fill in ascent/descent.
+ */
+void HTMLRenderer::install_external_font(GfxFont * font, FontInfo & info)
+{
+    // getName() may be null (install_font guards against that as well);
+    // an unnamed font is handled with an empty name instead of crashing
+    string fontname;
+    if(auto * name = font->getName())
+        fontname = name->getCString();
+
+    // resolve bad encodings in GB
+    auto iter = GB_ENCODED_FONT_NAME_MAP.find(fontname);
+    if(iter != GB_ENCODED_FONT_NAME_MAP.end())
+    {
+        fontname = iter->second;
+        cerr << "Warning: workaround for font names in bad encodings." << endl;
+    }
+
+    GfxFontLoc * localfontloc = font->locateFont(xref, nullptr);
+
+    if(param.embed_external_font)
+    {
+        if(localfontloc != nullptr)
+        {
+            embed_font(string(localfontloc->path->getCString()), font, info);
+            export_remote_font(info, param.font_format, font);
+            delete localfontloc;
+            return;
+        }
+
+        cerr << "Cannot embed external font: f" << hex << info.id << dec << ' ' << fontname << endl;
+        // fall back to exporting by name
+    }
+
+    // still try to get an idea of the real ascent/descent
+    if(localfontloc != nullptr)
+    {
+        // fill in ascent/descent only, do not embed
+        embed_font(string(localfontloc->path->getCString()), font, info, true);
+        delete localfontloc;
+    }
+    else
+    {
+        info.ascent = font->getAscent();
+        info.descent = font->getDescent();
+    }
+
+    export_local_font(info, font, fontname, "");
+}
+
+/*
+ * Write the @font-face rule and the font class rule for a generated font file.
+ *
+ * `format` is the suffix of the font file ("ttf", "otf", "woff", "eot" or "svg");
+ * any other value makes the function throw a std::string.
+ * With param.embed_font the file is read from param.tmp_dir and inlined as a
+ * base64 data URI, otherwise only the file name is referenced.
+ */
+void HTMLRenderer::export_remote_font(const FontInfo & info, const string & format, GfxFont * font)
+{
+    // font file suffix -> name used inside the CSS format() hint
+    static const struct
+    {
+        const char * suffix;
+        const char * css_name;
+    } KNOWN_FORMATS[] = {
+        { "ttf",  "truetype" },
+        { "otf",  "opentype" },
+        { "woff", "woff" },
+        { "eot",  "embedded-opentype" },
+        { "svg",  "svg" },
+    };
+
+    const char * css_name = nullptr;
+    for(const auto & candidate : KNOWN_FORMATS)
+    {
+        if(format == candidate.suffix)
+        {
+            css_name = candidate.css_name;
+            break;
+        }
+    }
+    if(css_name == nullptr)
+        throw string("Warning: unknown font format: ") + format;
+    string css_font_format = css_name;
+
+    auto mime_iter = FORMAT_MIME_TYPE_MAP.find(format);
+    if(mime_iter == FORMAT_MIME_TYPE_MAP.end())
+        throw string("Warning: unknown font format: ") + format;
+    string mime_type = mime_iter->second;
+
+    auto & css = f_css.fs;
+
+    css << "@font-face{"
+        << "font-family:" << CSS::FONT_FAMILY_CN << info.id << ";"
+        << "src:url(";
+
+    {
+        auto fn = str_fmt("f%llx.%s", info.id, format.c_str());
+        if(!param.embed_font)
+        {
+            css << (char*)fn;
+        }
+        else
+        {
+            auto path = param.tmp_dir + "/" + (char*)fn;
+            ifstream fin(path, ifstream::binary);
+            if(!fin)
+                throw "Cannot locate font file: " + path;
+            css << "'data:" + mime_type + ";base64," << Base64Stream(fin) << "'";
+        }
+    }
+
+    css << ")"
+        << "format(\"" << css_font_format << "\");"
+        << "}"; // end of @font-face
+
+    // the class that selects this font
+    css << "." << CSS::FONT_FAMILY_CN << info.id << "{"
+        << "font-family:" << CSS::FONT_FAMILY_CN << info.id << ";"
+        << "line-height:" << round(info.ascent - info.descent) << ";"
+        << "font-style:normal;"
+        << "font-weight:normal;"
+        << "visibility:visible;"
+        << "}"
+        << endl;
+}
+
+/*
+ * Pick the generic CSS font family for a font:
+ * "monospace" for fixed width fonts, "serif" for serif fonts,
+ * "sans-serif" for everything else.
+ */
+static string general_font_family(GfxFont * font)
+{
+    const char * family = "sans-serif";
+
+    if(font->isFixedWidth())
+        family = "monospace";
+    else if(font->isSerif())
+        family = "serif";
+
+    return family;
+}
+
+// Write the fallback rule for a font that could not be processed:
+// a sans-serif family with hidden text.
+// TODO: this function is called when some font is unable to process, may use the name there as a hint
+void HTMLRenderer::export_remote_default_font(long long fn_id)
+{
+    auto & css = f_css.fs;
+
+    css << "." << CSS::FONT_FAMILY_CN << fn_id;
+    css << "{font-family:sans-serif;visibility:hidden;}" << endl;
+}
+
+/*
+ * Write the CSS class of a font that is referenced by name instead of being shipped.
+ *
+ * info               - supplies the font id and the ascent/descent used for line-height
+ * font               - queried for bold/italic/serif/fixed-width flags
+ * original_font_name - name of the font; also searched for "bold", "oblique" and "italic"
+ * cssfont            - when not empty, used verbatim as the font-family value
+ */
+void HTMLRenderer::export_local_font(const FontInfo & info, GfxFont * font, const string & original_font_name, const string & cssfont)
+{
+    f_css.fs << "." << CSS::FONT_FAMILY_CN << info.id << "{";
+
+    f_css.fs << "font-family:";
+    if(cssfont != "")
+        f_css.fs << cssfont;
+    else if(original_font_name.empty())
+        // no name available: a leading comma would make the declaration invalid
+        f_css.fs << general_font_family(font);
+    else
+        f_css.fs << original_font_name << "," << general_font_family(font);
+    f_css.fs << ";";
+
+    // lower-cased copy for the keyword search below;
+    // tolower must not be fed a negative char, so go through unsigned char
+    string fn = original_font_name;
+    for(auto & c : fn)
+        c = static_cast<char>(tolower(static_cast<unsigned char>(c)));
+
+    if(font->isBold() || (fn.find("bold") != string::npos))
+        f_css.fs << "font-weight:bold;";
+    else
+        f_css.fs << "font-weight:normal;";
+
+    // "oblique" in the name wins over the italic flag
+    if(fn.find("oblique") != string::npos)
+        f_css.fs << "font-style:oblique;";
+    else if(font->isItalic() || (fn.find("italic") != string::npos))
+        f_css.fs << "font-style:italic;";
+    else
+        f_css.fs << "font-style:normal;";
+
+    f_css.fs << "line-height:" << round(info.ascent - info.descent) << ";";
+
+    f_css.fs << "visibility:visible;";
+
+    f_css.fs << "}" << endl;
+}
+
+} //namespace pdf2htmlEX
diff --git a/src/HTMLRenderer/form.cc b/src/HTMLRenderer/form.cc
new file mode 100644
index 0000000..6b51622
--- /dev/null
+++ b/src/HTMLRenderer/form.cc
@@ -0,0 +1,76 @@
+/*
+ * form.cc
+ *
+ * Handling Forms
+ *
+ * by Simon Chenard
+ * 2014.07.25
+ */
+
+#include <iostream>
+#include <sstream>
+#include <string>
+
+#include "HTMLRenderer.h"
+#include "util/namespace.h"
+#include "util/misc.h"
+
+namespace pdf2htmlEX {
+
+using std::ofstream;
+using std::cerr;
+
+/*
+ * Write HTML elements for the form widgets of the current page to `out`.
+ *
+ * Text fields become <input type="text"> elements, buttons become <div>
+ * elements; other widget types are reported on stderr and skipped.
+ * Widget rectangles are scaled by param.zoom.
+ */
+void HTMLRenderer::process_form(ofstream & out)
+{
+    FormPageWidgets * widgets = cur_catalog->getPage(pageNum)->getFormWidgets();
+    if(widgets == nullptr)
+        return;
+
+    const int num = widgets->getNumWidgets();
+
+    for(int i = 0; i < num; i++)
+    {
+        FormWidget * w = widgets->getWidget(i);
+        double x1, y1, x2, y2;
+
+        w->getRect(&x1, &y1, &x2, &y2);
+        x1 = x1 * param.zoom;
+        x2 = x2 * param.zoom;
+        y1 = y1 * param.zoom;
+        y2 = y2 * param.zoom;
+
+        double width = x2 - x1;
+        double height = y2 - y1;
+
+        if(w->getType() == formText)
+        {
+            double font_size = height / 2;
+
+            out << "<input id=\"text-" << pageNum << "-" << i
+                << "\" class=\"" << CSS::INPUT_TEXT_CN
+                << "\" type=\"text\" value=\"\""
+                << " style=\"position: absolute; left: " << x1
+                << "px; bottom: " << y1 << "px;"
+                << " width: " << width << "px; height: " << std::to_string(height)
+                << "px; line-height: " << std::to_string(height) << "px; font-size: "
+                << font_size << "px;\" />" << endl;
+        }
+        else if(w->getType() == formButton)
+        {
+            //Ideally would check w->getButtonType()
+            //for more specific rendering
+            width += 3;
+            height += 3;
+
+            out << "<div id=\"cb-" << pageNum << "-" << i
+                << "\" class=\"" << CSS::INPUT_RADIO_CN
+                << "\" style=\"position: absolute; left: " << x1
+                << "px; bottom: " << y1 << "px;"
+                << " width: " << width << "px; height: "
+                << std::to_string(height) << "px; background-size: cover;\" ></div>" << endl;
+        }
+        else
+        {
+            cerr << "Unsupported form field detected" << endl;
+        }
+    }
+
+    // Page::getFormWidgets() hands a freshly allocated container to the caller
+    // NOTE(review): ownership taken from poppler's Page API - confirm for the poppler version in use
+    delete widgets;
+}
+
+}
diff --git a/src/HTMLRenderer/general.cc b/src/HTMLRenderer/general.cc
new file mode 100644
index 0000000..6a54194
--- /dev/null
+++ b/src/HTMLRenderer/general.cc
@@ -0,0 +1,592 @@
+/*
+ * general.cc
+ *
+ * Handling general stuff
+ *
+ * Copyright (C) 2012,2013,2014 Lu Wang <coolwanglu@gmail.com>
+ */
+
+#include <cstdio>
+#include <ostream>
+#include <cmath>
+#include <algorithm>
+#include <vector>
+#include <functional>
+
+#include <GlobalParams.h>
+
+#include "pdf2htmlEX-config.h"
+#include "HTMLRenderer.h"
+#include "HTMLTextLine.h"
+#include "Base64Stream.h"
+
+#include "BackgroundRenderer/BackgroundRenderer.h"
+
+#include "util/namespace.h"
+#include "util/ffw.h"
+#include "util/math.h"
+#include "util/path.h"
+#include "util/css_const.h"
+#include "util/encoding.h"
+
+namespace pdf2htmlEX {
+
+using std::fixed;
+using std::flush;
+using std::ostream;
+using std::max;
+using std::min_element;
+using std::vector;
+using std::abs;
+using std::cerr;
+using std::endl;
+
+/*
+ * Set up a renderer for the given parameters: silence poppler unless
+ * debugging, initialize the font wrapper (ffw), size the mapping buffers,
+ * configure the tolerances of the state managers and connect the drawing
+ * tracer to the covered text detector.
+ */
+HTMLRenderer::HTMLRenderer(const Param & param)
+    :OutputDev()
+    ,param(param)
+    ,html_text_page(param, all_manager)
+    ,preprocessor(param)
+    ,tmp_files(param)
+    ,tracer(param)
+{
+    //disable error messages of poppler
+    if(!param.debug)
+        globalParams->setErrQuiet(gTrue);
+
+    ffw_init(param.debug);
+
+    cur_mapping.resize(0x10000);
+    cur_mapping2.resize(0x100);
+    width_list.resize(0x10000);
+
+    /*
+     * Errors of these states are usually not accumulated,
+     * or are handled well (whitespace_manager),
+     * so the user supplied (large) eps can be used
+     */
+    all_manager.vertical_align.set_eps(param.v_eps);
+    all_manager.whitespace.set_eps(param.h_eps);
+    all_manager.left.set_eps(param.h_eps);
+
+    /*
+     * All other states need accurate values,
+     * they are optimized separately
+     */
+    all_manager.font_size.set_eps(EPS);
+    all_manager.letter_space.set_eps(EPS);
+    all_manager.word_space.set_eps(EPS);
+    all_manager.height.set_eps(EPS);
+    all_manager.width.set_eps(EPS);
+    all_manager.bottom.set_eps(EPS);
+
+    // forward every traced bounding box to the covered text detector
+    tracer.on_char_drawn = [this](double * bbox)
+    {
+        covered_text_detector.add_char_bbox(bbox);
+    };
+    tracer.on_char_clipped = [this](double * bbox, bool is_partial)
+    {
+        covered_text_detector.add_char_bbox_clipped(bbox, is_partial);
+    };
+    tracer.on_non_char_drawn = [this](double * bbox)
+    {
+        covered_text_detector.add_non_char_bbox(bbox);
+    };
+}
+
+// Counterpart of the ffw_init() call made in the constructor.
+HTMLRenderer::~HTMLRenderer()
+{
+ ffw_finalize();
+}
+
+/*
+ * Convert a document.
+ *
+ * Runs pre_process(), renders pages param.first_page..param.last_page through
+ * doc->displayPage() (this object is the output device), optionally processes
+ * the outline, and finally runs post_process().
+ * Throws when the background renderer cannot be created or a page file
+ * cannot be opened.
+ */
+void HTMLRenderer::process(PDFDoc *doc)
+{
+ cur_doc = doc;
+ cur_catalog = doc->getCatalog();
+ xref = doc->getXRef();
+
+ pre_process(doc);
+
+ ///////////////////
+ // Process pages
+
+ if(param.process_nontext)
+ {
+ bg_renderer = BackgroundRenderer::getBackgroundRenderer(param.bg_format, this, param);
+ if(!bg_renderer)
+ throw "Cannot initialize background renderer, unsupported format";
+ bg_renderer->init(doc);
+
+ // the fallback renderer is optional
+ fallback_bg_renderer = BackgroundRenderer::getFallbackBackgroundRenderer(this, param);
+ if (fallback_bg_renderer)
+ fallback_bg_renderer->init(doc);
+ }
+
+ int page_count = (param.last_page - param.first_page + 1);
+ for(int i = param.first_page; i <= param.last_page ; ++i)
+ {
+ // a limit of -1 disables the check; the limit is multiplied by 1024 before the comparison
+ if (param.tmp_file_size_limit != -1 && tmp_files.get_total_size() > param.tmp_file_size_limit * 1024) {
+ cerr << "Stop processing, reach max size\n";
+ break;
+ }
+
+ // progress indicator, rewritten in place thanks to '\r'
+ cerr << "Working: " << (i-param.first_page) << "/" << page_count << '\r' << flush;
+
+ if(param.split_pages)
+ {
+ // copy the string out, since we will reuse the buffer soon
+ string filled_template_filename = (char*)str_fmt(param.page_filename.c_str(), i);
+ auto page_fn = str_fmt("%s/%s", param.dest_dir.c_str(), filled_template_filename.c_str());
+ // one output file per page, released again after displayPage below
+ f_curpage = new ofstream((char*)page_fn, ofstream::binary);
+ if(!(*f_curpage))
+ throw string("Cannot open ") + (char*)page_fn + " for writing";
+ set_stream_flags((*f_curpage));
+
+ cur_page_filename = filled_template_filename;
+ }
+
+ doc->displayPage(this, i,
+ text_zoom_factor() * DEFAULT_DPI, text_zoom_factor() * DEFAULT_DPI,
+ 0,
+ (!(param.use_cropbox)),
+ true, // crop
+ false, // printing
+ nullptr, nullptr, nullptr, nullptr);
+
+ if(param.split_pages)
+ {
+ delete f_curpage;
+ f_curpage = nullptr;
+ }
+ }
+ if(page_count >= 0)
+ cerr << "Working: " << page_count << "/" << page_count;
+ cerr << endl;
+
+ ////////////////////////
+ // Process Outline
+ if(param.process_outline)
+ process_outline();
+
+ post_process();
+
+ bg_renderer = nullptr;
+ fallback_bg_renderer = nullptr;
+
+ cerr << endl;
+}
+
+// Remember the default CTM: copies as many elements as default_ctm holds.
+void HTMLRenderer::setDefaultCTM(double *ctm)
+{
+    const size_t element_count = sizeof(default_ctm) / sizeof(default_ctm[0]);
+
+    for(size_t i = 0; i < element_count; ++i)
+        default_ctm[i] = ctm[i];
+}
+
+/*
+ * Called at the beginning of a page: resets the per-page helpers,
+ * records the page number and page size, and resets the tracked state.
+ */
+void HTMLRenderer::startPage(int pageNum, GfxState *state, XRef * xref)
+{
+    covered_text_detector.reset();
+    tracer.reset(state);
+
+    // the parameter hides the member of the same name
+    this->pageNum = pageNum;
+
+    const auto page_width = state->getPageWidth();
+    const auto page_height = state->getPageHeight();
+    html_text_page.set_page_size(page_width, page_height);
+
+    reset_state();
+}
+
+/*
+ * Called at the end of a page: writes the complete HTML of the page to
+ * *f_curpage (frame, content box, optional background image, text, forms,
+ * links and the data element holding the default CTM) and the CSS of the
+ * page text to f_css.fs.
+ * With param.split_pages an empty frame referring to the page file is
+ * additionally written to f_pages.fs.
+ */
+void HTMLRenderer::endPage() {
+ // ids of the width/height values, used in the class names below
+ long long wid = all_manager.width.install(html_text_page.get_width());
+ long long hid = all_manager.height.install(html_text_page.get_height());
+
+ // open the page frame and the content box
+ (*f_curpage)
+ << "<div id=\"" << CSS::PAGE_FRAME_CN << pageNum
+ << "\" class=\"" << CSS::PAGE_FRAME_CN
+ << " " << CSS::WIDTH_CN << wid
+ << " " << CSS::HEIGHT_CN << hid
+ << "\" data-page-no=\"" << pageNum << "\">"
+ << "<div class=\"" << CSS::PAGE_CONTENT_BOX_CN
+ << " " << CSS::PAGE_CONTENT_BOX_CN << pageNum
+ << " " << CSS::WIDTH_CN << wid
+ << " " << CSS::HEIGHT_CN << hid
+ << "\">";
+
+ /*
+ * When split_pages is on, f_curpage points to the current page file
+ * and we want to output empty frames in f_pages.fs
+ */
+ if(param.split_pages)
+ {
+ f_pages.fs
+ << "<div id=\"" << CSS::PAGE_FRAME_CN << pageNum
+ << "\" class=\"" << CSS::PAGE_FRAME_CN
+ << " " << CSS::WIDTH_CN << wid
+ << " " << CSS::HEIGHT_CN << hid
+ << "\" data-page-no=\"" << pageNum
+ << "\" data-page-url=\"";
+
+ writeAttribute(f_pages.fs, cur_page_filename);
+ f_pages.fs << "\">";
+ }
+
+ if(param.process_nontext)
+ {
+ // the fallback renderer is only tried when the primary one fails
+ if (bg_renderer->render_page(cur_doc, pageNum))
+ {
+ bg_renderer->embed_image(pageNum);
+ }
+ else if (fallback_bg_renderer)
+ {
+ if (fallback_bg_renderer->render_page(cur_doc, pageNum))
+ fallback_bg_renderer->embed_image(pageNum);
+ }
+ }
+
+ // dump all text
+ html_text_page.dump_text(*f_curpage);
+ html_text_page.dump_css(f_css.fs);
+ html_text_page.clear();
+
+ // process form
+ if(param.process_form)
+ process_form(*f_curpage);
+
+ // process links before the page is closed
+ cur_doc->processLinks(this, pageNum);
+
+ // close box
+ (*f_curpage) << "</div>";
+
+ // dump info for js
+ // TODO: create a function for this
+ // BE CAREFUL WITH ESCAPES
+ {
+ // the JSON object is placed in a single-quoted attribute
+ (*f_curpage) << "<div class=\"" << CSS::PAGE_DATA_CN << "\" data-data='{";
+
+ //default CTM
+ (*f_curpage) << "\"ctm\":[";
+ for(int i = 0; i < 6; ++i)
+ {
+ if(i > 0) (*f_curpage) << ",";
+ (*f_curpage) << round(default_ctm[i]);
+ }
+ (*f_curpage) << "]";
+
+ (*f_curpage) << "}'></div>";
+ }
+
+ // close page
+ (*f_curpage) << "</div>" << endl;
+
+ if(param.split_pages)
+ {
+ // close the empty frame opened above
+ f_pages.fs << "</div>" << endl;
+ }
+}
+
+/*
+ * Prepare a conversion: run the preprocessor, determine the text scale
+ * factors and open the css, outline (optional) and pages output streams.
+ * Throws a std::string when one of the files cannot be opened.
+ */
+void HTMLRenderer::pre_process(PDFDoc * doc)
+{
+    preprocessor.process(doc);
+
+    /*
+     * determine scale factors
+     */
+    {
+        // every positive zoom/fit option contributes a candidate, the smallest one wins
+        vector<double> zoom_factors;
+
+        if(is_positive(param.zoom))
+        {
+            zoom_factors.push_back(param.zoom);
+        }
+
+        if(is_positive(param.fit_width))
+        {
+            zoom_factors.push_back((param.fit_width) / preprocessor.get_max_width());
+        }
+
+        if(is_positive(param.fit_height))
+        {
+            zoom_factors.push_back((param.fit_height) / preprocessor.get_max_height());
+        }
+
+        double zoom = (zoom_factors.empty() ? 1.0 : (*min_element(zoom_factors.begin(), zoom_factors.end())));
+
+        // factor1 * factor2 == zoom, with factor1 at least param.font_size_multiplier
+        text_scale_factor1 = max<double>(zoom, param.font_size_multiplier);
+        text_scale_factor2 = zoom / text_scale_factor1;
+    }
+
+    // we may output utf8 characters, so always use binary
+    {
+        /*
+         * If embed-css
+         * we have to keep the generated css file into a temporary place
+         * and embed it into the main html later
+         *
+         * otherwise
+         * leave it in param.dest_dir
+         */
+
+        auto fn = (param.embed_css)
+            ? str_fmt("%s/__css", param.tmp_dir.c_str())
+            : str_fmt("%s/%s", param.dest_dir.c_str(), param.css_filename.c_str());
+
+        if(param.embed_css)
+            tmp_files.add((char*)fn);
+
+        f_css.path = (char*)fn;
+        f_css.fs.open(f_css.path, ofstream::binary);
+        if(!f_css.fs)
+            throw string("Cannot open ") + (char*)fn + " for writing";
+        set_stream_flags(f_css.fs);
+    }
+
+    if (param.process_outline)
+    {
+        /*
+         * The logic for outline is similar to css
+         */
+
+        auto fn = (param.embed_outline)
+            ? str_fmt("%s/__outline", param.tmp_dir.c_str())
+            : str_fmt("%s/%s", param.dest_dir.c_str(), param.outline_filename.c_str());
+
+        if(param.embed_outline)
+            tmp_files.add((char*)fn);
+
+        f_outline.path = (char*)fn;
+        f_outline.fs.open(f_outline.path, ofstream::binary);
+        if(!f_outline.fs)
+            throw string("Cannot open ") + (char*)fn + " for writing";
+
+        // might not be necessary
+        set_stream_flags(f_outline.fs);
+    }
+
+    {
+        /*
+         * we have to keep the html file for pages into a temporary place
+         * because we'll have to embed css before it
+         *
+         * Otherwise just generate it
+         */
+        auto fn = str_fmt("%s/__pages", param.tmp_dir.c_str());
+        tmp_files.add((char*)fn);
+
+        f_pages.path = (char*)fn;
+        f_pages.fs.open(f_pages.path, ofstream::binary);
+        if(!f_pages.fs)
+            throw string("Cannot open ") + (char*)fn + " for writing";
+        set_stream_flags(f_pages.fs);
+    }
+
+    // with split pages a separate stream is created for each page in process()
+    if(param.split_pages)
+    {
+        f_curpage = nullptr;
+    }
+    else
+    {
+        f_curpage = &f_pages.fs;
+    }
+}
+
+/*
+ * Finish a conversion: dump the collected CSS, close the intermediate files
+ * and build the main HTML file by interpreting the manifest file found in
+ * param.data_dir.
+ *
+ * Manifest syntax as handled below (lines are trimmed first):
+ *   """      toggles verbatim mode; lines in verbatim mode are copied to the output
+ *   #...     comment, ignored (as are empty lines)
+ *   @file    embed or link the file from param.data_dir (see embed_file)
+ *   $css, $outline, $pages   insert the generated content
+ * Anything else produces a warning on stderr.
+ */
+void HTMLRenderer::post_process(void)
+{
+ dump_css();
+
+ // close files if they opened
+ if (param.process_outline)
+ {
+ f_outline.fs.close();
+ }
+ f_pages.fs.close();
+ f_css.fs.close();
+
+ // build the main HTML file
+ ofstream output;
+ {
+ auto fn = str_fmt("%s/%s", param.dest_dir.c_str(), param.output_filename.c_str());
+ output.open((char*)fn, ofstream::binary);
+ if(!output)
+ throw string("Cannot open ") + (char*)fn + " for writing";
+ set_stream_flags(output);
+ }
+
+ // apply manifest
+ ifstream manifest_fin((char*)str_fmt("%s/%s", param.data_dir.c_str(), MANIFEST_FILENAME.c_str()), ifstream::binary);
+ if(!manifest_fin)
+ throw "Cannot open the manifest file";
+
+ bool embed_string = false;
+ string line;
+ long line_no = 0;
+ while(getline(manifest_fin, line))
+ {
+ // trim space at both sides
+ {
+ static const char * whitespaces = " \t\n\v\f\r";
+ auto idx1 = line.find_first_not_of(whitespaces);
+ if(idx1 == string::npos)
+ {
+ // nothing but whitespace
+ line.clear();
+ }
+ else
+ {
+ auto idx2 = line.find_last_not_of(whitespaces);
+ assert(idx2 >= idx1);
+ line = line.substr(idx1, idx2 - idx1 + 1);
+ }
+ }
+
+ ++line_no;
+
+ // a line consisting of three double quotes switches verbatim mode on/off
+ if(line == "\"\"\"")
+ {
+ embed_string = !embed_string;
+ continue;
+ }
+
+ if(embed_string)
+ {
+ output << line << endl;
+ continue;
+ }
+
+ if(line.empty() || line[0] == '#')
+ continue;
+
+
+ if(line[0] == '@')
+ {
+ embed_file(output, param.data_dir + "/" + line.substr(1), "", true);
+ continue;
+ }
+
+ if(line[0] == '$')
+ {
+ if(line == "$css")
+ {
+ embed_file(output, f_css.path, ".css", false);
+ }
+ else if (line == "$outline")
+ {
+ if (param.process_outline && param.embed_outline)
+ {
+ ifstream fin(f_outline.path, ifstream::binary);
+ if(!fin)
+ throw "Cannot open outline for reading";
+ output << fin.rdbuf();
+ output.clear(); // output will set fail bit if fin is empty
+ }
+ }
+ else if (line == "$pages")
+ {
+ ifstream fin(f_pages.path, ifstream::binary);
+ if(!fin)
+ throw "Cannot open pages for reading";
+ output << fin.rdbuf();
+ output.clear(); // output will set fail bit if fin is empty
+ }
+ else
+ {
+ cerr << "Warning: manifest line " << line_no << ": Unknown content \"" << line << "\"" << endl;
+ }
+ continue;
+ }
+
+ cerr << "Warning: unknown line in manifest: " << line << endl;
+ }
+}
+
+// Apply the number formatting used for all generated files.
+void HTMLRenderer::set_stream_flags(std::ostream & out)
+{
+    // all ID's are written in hex
+    out << hex;
+    // fixed notation, since browsers are not happy with scientific notation
+    out << fixed;
+}
+
+/*
+ * Write the CSS of all state managers to the css file.
+ * With param.printing the print variants, scaled by print_scale(),
+ * are written as well, wrapped in the CSS::PRINT_ONLY block.
+ */
+void HTMLRenderer::dump_css (void)
+{
+    auto & css = f_css.fs;
+
+    all_manager.transform_matrix.dump_css(css);
+    all_manager.vertical_align.dump_css(css);
+    all_manager.letter_space.dump_css(css);
+    all_manager.stroke_color.dump_css(css);
+    all_manager.word_space.dump_css(css);
+    all_manager.whitespace.dump_css(css);
+    all_manager.fill_color.dump_css(css);
+    all_manager.font_size.dump_css(css);
+    all_manager.bottom.dump_css(css);
+    all_manager.height.dump_css(css);
+    all_manager.width.dump_css(css);
+    all_manager.left.dump_css(css);
+    all_manager.bgimage_size.dump_css(css);
+
+    // print css
+    if(!param.printing)
+        return;
+
+    double scale = print_scale();
+
+    css << CSS::PRINT_ONLY << "{" << endl;
+    all_manager.transform_matrix.dump_print_css(css, scale);
+    all_manager.vertical_align.dump_print_css(css, scale);
+    all_manager.letter_space.dump_print_css(css, scale);
+    all_manager.stroke_color.dump_print_css(css, scale);
+    all_manager.word_space.dump_print_css(css, scale);
+    all_manager.whitespace.dump_print_css(css, scale);
+    all_manager.fill_color.dump_print_css(css, scale);
+    all_manager.font_size.dump_print_css(css, scale);
+    all_manager.bottom.dump_print_css(css, scale);
+    all_manager.height.dump_print_css(css, scale);
+    all_manager.width.dump_print_css(css, scale);
+    all_manager.left.dump_print_css(css, scale);
+    all_manager.bgimage_size.dump_print_css(css, scale);
+    css << "}" << endl;
+}
+
+/*
+ * Insert a file into the HTML written to `out`, either inline or as a reference.
+ *
+ * out  - destination stream of the HTML
+ * path - path of the file
+ * type - suffix used to look up the embedding rules in EMBED_STRING_MAP;
+ *        when empty the suffix of the file name is used
+ * copy - when the file is only referenced, also copy it into param.dest_dir
+ *
+ * Unknown suffixes produce a warning and nothing is written.
+ * Throws a std::string when a file cannot be opened.
+ */
+void HTMLRenderer::embed_file(ostream & out, const string & path, const string & type, bool copy)
+{
+    string fn = get_filename(path);
+    string suffix = (type == "") ? get_suffix(fn) : type;
+
+    auto iter = EMBED_STRING_MAP.find(suffix);
+    if(iter == EMBED_STRING_MAP.end())
+    {
+        cerr << "Warning: unknown suffix: " << suffix << endl;
+        return;
+    }
+
+    const auto & entry = iter->second;
+
+    // entry.embed_flag selects the Param member that decides between embedding and linking
+    if(param.*(entry.embed_flag))
+    {
+        ifstream fin(path, ifstream::binary);
+        if(!fin)
+            throw string("Cannot open file ") + path + " for embedding";
+        out << entry.prefix_embed;
+
+        if(entry.base64_encode)
+        {
+            out << Base64Stream(fin);
+        }
+        else
+        {
+            out << endl << fin.rdbuf();
+        }
+        out.clear(); // out will set fail bit if fin is empty
+        out << entry.suffix_embed << endl;
+    }
+    else
+    {
+        out << entry.prefix_external;
+        writeAttribute(out, fn);
+        out << entry.suffix_external << endl;
+
+        if(copy)
+        {
+            ifstream fin(path, ifstream::binary);
+            if(!fin)
+                throw string("Cannot copy file: ") + path;
+
+            // distinct name, so the HTML stream `out` is not shadowed
+            auto out_path = param.dest_dir + "/" + fn;
+            ofstream copy_out(out_path, ofstream::binary);
+            if(!copy_out)
+                throw string("Cannot open file ") + out_path + " for writing";
+            copy_out << fin.rdbuf();
+            copy_out.clear(); // copy_out will set fail bit if fin is empty
+        }
+    }
+}
+
+// Name of the manifest file, which post_process() opens inside param.data_dir.
+const std::string HTMLRenderer::MANIFEST_FILENAME = "manifest";
+
+}// namespace pdf2htmlEX
diff --git a/src/HTMLRenderer/image.cc b/src/HTMLRenderer/image.cc
new file mode 100644
index 0000000..91ca767
--- /dev/null
+++ b/src/HTMLRenderer/image.cc
@@ -0,0 +1,83 @@
+/*
+ * image.cc
+ *
+ * Handling images
+ *
+ * by WangLu
+ * 2012.08.14
+ */
+
+#include "HTMLRenderer.h"
+#include "util/namespace.h"
+
+namespace pdf2htmlEX {
+
+/*
+ * Image callback of the output device: notifies the drawing tracer and
+ * forwards the call to OutputDev::drawImage.
+ * The code inside "#if 0" is disabled and never compiled.
+ */
+void HTMLRenderer::drawImage(GfxState * state, Object * ref, Stream * str, int width, int height, GfxImageColorMap * colorMap, GBool interpolate, int *maskColors, GBool inlineImg)
+{
+ tracer.draw_image(state);
+
+ return OutputDev::drawImage(state,ref,str,width,height,colorMap,interpolate,maskColors,inlineImg);
+
+#if 0
+ if(maskColors)
+ return;
+
+ rgb8_image_t img(width, height);
+ auto imgview = view(img);
+ auto loc = imgview.xy_at(0,0);
+
+ ImageStream * img_stream = new ImageStream(str, width, colorMap->getNumPixelComps(), colorMap->getBits());
+ img_stream->reset();
+
+ for(int i = 0; i < height; ++i)
+ {
+ auto p = img_stream->getLine();
+ for(int j = 0; j < width; ++j)
+ {
+ GfxRGB rgb;
+ colorMap->getRGB(p, &rgb);
+
+ *loc = rgb8_pixel_t(colToByte(rgb.r), colToByte(rgb.g), colToByte(rgb.b));
+
+ p += colorMap->getNumPixelComps();
+
+ ++ loc.x();
+ }
+
+ loc = imgview.xy_at(0, i+1);
+ }
+
+ png_write_view((format("i%|1$x|.png")%image_count).str(), imgview);
+
+ img_stream->close();
+ delete img_stream;
+
+ close_line();
+
+ double ctm[6];
+ memcpy(ctm, state->getCTM(), sizeof(ctm));
+ ctm[4] = ctm[5] = 0.0;
+ html_fout << format("<img class=\"i t%2%\" style=\"left:%3%px;bottom:%4%px;width:%5%px;height:%6%px;\" src=\"i%|1$x|.png\" />") % image_count % install_transform_matrix(ctm) % state->getCurX() % state->getCurY() % width % height << endl;
+
+
+ ++ image_count;
+#endif
+}
+
+/*
+ * Soft-masked image callback of the output device: notifies the drawing
+ * tracer and forwards the call to the base class.
+ */
+void HTMLRenderer::drawSoftMaskedImage(GfxState *state, Object *ref, Stream *str,
+        int width, int height,
+        GfxImageColorMap *colorMap,
+        GBool interpolate,
+        Stream *maskStr,
+        int maskWidth, int maskHeight,
+        GfxImageColorMap *maskColorMap,
+        GBool maskInterpolate)
+{
+    tracer.draw_image(state);
+
+    // TODO really required?
+    OutputDev::drawSoftMaskedImage(state, ref, str,
+            width, height, colorMap, interpolate,
+            maskStr, maskWidth, maskHeight, maskColorMap, maskInterpolate);
+}
+
+} // namespace pdf2htmlEX
diff --git a/src/HTMLRenderer/link.cc b/src/HTMLRenderer/link.cc
new file mode 100644
index 0000000..3c90ab5
--- /dev/null
+++ b/src/HTMLRenderer/link.cc
@@ -0,0 +1,309 @@
+/*
+ * link.cc
+ *
+ * Handling links
+ *
+ * by WangLu
+ * 2012.09.25
+ */
+
+#include <iostream>
+#include <sstream>
+#include <algorithm>
+
+#include <Link.h>
+
+#include "HTMLRenderer.h"
+#include "util/namespace.h"
+#include "util/math.h"
+#include "util/misc.h"
+#include "util/encoding.h"
+#include "util/css_const.h"
+
+namespace pdf2htmlEX {
+
+using std::ostringstream;
+using std::min;
+using std::max;
+using std::cerr;
+using std::endl;
+
+/*
+ * The detailed rectangle area of the link destination
+ * Will be parsed and performed by Javascript
+ * The string will be put into a HTML attribute, surrounded by single quotes
+ * So pay attention to the characters used here
+ */
+/*
+ * Build the JSON-like description of a link destination, e.g. [3,"XYZ",10,20,null].
+ *
+ * dest    - the destination; a null pointer yields an empty string
+ * catalog - used to resolve page references to page numbers
+ * pageno  - receives the page number of the destination (0 when dest is null)
+ *
+ * Returns an empty string when the page number is not positive.
+ */
+static string get_linkdest_detail_str(LinkDest * dest, Catalog * catalog, int & pageno)
+{
+    pageno = 0;
+
+    // checked before the first dereference
+    if(!dest)
+        return "";
+
+    if(dest->isPageRef())
+    {
+        auto pageref = dest->getPageRef();
+        pageno = catalog->findPage(pageref.num, pageref.gen);
+    }
+    else
+    {
+        pageno = dest->getPageNum();
+    }
+
+    if(pageno <= 0)
+    {
+        return "";
+    }
+
+    ostringstream sout;
+
+    // writes the coordinate when the destination changes it, null otherwise
+    auto put_optional = [&sout](bool changed, double value)
+    {
+        if(changed)
+            sout << value;
+        else
+            sout << "null";
+    };
+
+    // dec
+    sout << "[" << pageno;
+
+    switch(dest->getKind())
+    {
+        case destXYZ:
+            sout << ",\"XYZ\",";
+            put_optional(dest->getChangeLeft(), dest->getLeft());
+            sout << ",";
+            put_optional(dest->getChangeTop(), dest->getTop());
+            sout << ",";
+            put_optional(dest->getChangeZoom(), dest->getZoom());
+            break;
+        case destFit:
+            sout << ",\"Fit\"";
+            break;
+        case destFitH:
+            sout << ",\"FitH\",";
+            put_optional(dest->getChangeTop(), dest->getTop());
+            break;
+        case destFitV:
+            sout << ",\"FitV\",";
+            put_optional(dest->getChangeLeft(), dest->getLeft());
+            break;
+        case destFitR:
+            sout << ",\"FitR\","
+                 << (dest->getLeft()) << ","
+                 << (dest->getBottom()) << ","
+                 << (dest->getRight()) << ","
+                 << (dest->getTop());
+            break;
+        case destFitB:
+            sout << ",\"FitB\"";
+            break;
+        case destFitBH:
+            sout << ",\"FitBH\",";
+            put_optional(dest->getChangeTop(), dest->getTop());
+            break;
+        case destFitBV:
+            sout << ",\"FitBV\",";
+            put_optional(dest->getChangeLeft(), dest->getLeft());
+            break;
+        default:
+            break;
+    }
+
+    sout << "]";
+
+    return sout.str();
+}
+
+/*
+ * Translate a link action into the value of an href attribute.
+ *
+ * action - the action, may be null
+ * detail - receives the destination details for GoTo actions, empty otherwise
+ *
+ * Returns "#<page frame id>" for GoTo actions with a valid page, the URI for
+ * URI actions, and an empty string in every other case.
+ */
+string HTMLRenderer::get_linkaction_str(LinkAction * action, string & detail)
+{
+    string target;
+    detail = "";
+
+    if(!action)
+        return target;
+
+    auto kind = action->getKind();
+
+    if(kind == actionGoTo)
+    {
+        auto * goto_action = dynamic_cast<LinkGoTo*>(action);
+
+        // an explicit destination is preferred over a named one
+        LinkDest * dest = nullptr;
+        if(auto direct_dest = goto_action->getDest())
+            dest = direct_dest->copy();
+        else if(auto named_dest = goto_action->getNamedDest())
+            dest = cur_catalog->findDest(named_dest);
+
+        if(dest)
+        {
+            int pageno = 0;
+            detail = get_linkdest_detail_str(dest, cur_catalog, pageno);
+            if(pageno > 0)
+                target = (char*)str_fmt("#%s%x", CSS::PAGE_FRAME_CN, pageno);
+            delete dest;
+        }
+    }
+    else if(kind == actionGoToR)
+    {
+        cerr << "TODO: actionGoToR is not implemented." << endl;
+    }
+    else if(kind == actionURI)
+    {
+        auto * uri_action = dynamic_cast<LinkURI*>(action);
+        target = uri_action->getURI()->getCString();
+    }
+    else if(kind == actionLaunch)
+    {
+        cerr << "TODO: actionLaunch is not implemented." << endl;
+    }
+    else
+    {
+        cerr << "Warning: unknown annotation type: " << kind << endl;
+    }
+
+    return target;
+}
+
+/*
+ * Based on pdftohtml from poppler
+ * TODO: share rectangle draw with css-draw
+ */
+/*
+ * Write the HTML for a link annotation to the current page:
+ * a <div> positioned over the annotation rectangle, carrying the border of
+ * the annotation, wrapped in an <a> element when the action yields a target.
+ */
+void HTMLRenderer::processLink(AnnotLink * al)
+{
+ string dest_detail_str;
+ string dest_str = get_linkaction_str(al->getAction(), dest_detail_str);
+
+ // the anchor is only written when there is a target
+ if(!dest_str.empty())
+ {
+ (*f_curpage) << "<a class=\"" << CSS::LINK_CN << "\" href=\"";
+ writeAttribute((*f_curpage), dest_str);
+ (*f_curpage) << "\"";
+
+ // the detail string is placed in a single-quoted attribute
+ if(!dest_detail_str.empty())
+ (*f_curpage) << " data-dest-detail='" << dest_detail_str << "'";
+
+ (*f_curpage) << ">";
+ }
+
+ (*f_curpage) << "<div class=\"" << CSS::CSS_DRAW_CN << ' ' << CSS::TRANSFORM_MATRIX_CN
+ << all_manager.transform_matrix.install(default_ctm)
+ << "\" style=\"";
+
+ // normalize the rectangle: (x, y) is the corner with the smallest coordinates
+ double x,y,w,h;
+ double x1, y1, x2, y2;
+ al->getRect(&x1, &y1, &x2, &y2);
+ x = min<double>(x1, x2);
+ y = min<double>(y1, y2);
+ w = max<double>(x1, x2) - x;
+ h = max<double>(y1, y2) - y;
+
+ double border_width = 0;
+ double border_top_bottom_width = 0;
+ double border_left_right_width = 0;
+ auto * border = al->getBorder();
+ if(border)
+ {
+ border_width = border->getWidth();
+ if(border_width > 0)
+ {
+ {
+ // may adjust x, y, w, h and computes the two border widths
+ css_fix_rectangle_border_width(x1, y1, x2, y2, border_width,
+ x, y, w, h,
+ border_top_bottom_width, border_left_right_width);
+
+ // a single value is enough when both widths are (almost) equal
+ if(std::abs(border_top_bottom_width - border_left_right_width) < EPS)
+ (*f_curpage) << "border-width:" << round(border_top_bottom_width) << "px;";
+ else
+ (*f_curpage) << "border-width:" << round(border_top_bottom_width) << "px " << round(border_left_right_width) << "px;";
+ }
+ auto style = border->getStyle();
+ switch(style)
+ {
+ case AnnotBorder::borderSolid:
+ (*f_curpage) << "border-style:solid;";
+ break;
+ case AnnotBorder::borderDashed:
+ (*f_curpage) << "border-style:dashed;";
+ break;
+ case AnnotBorder::borderBeveled:
+ (*f_curpage) << "border-style:outset;";
+ break;
+ case AnnotBorder::borderInset:
+ (*f_curpage) << "border-style:inset;";
+ break;
+ case AnnotBorder::borderUnderlined:
+ (*f_curpage) << "border-style:none;border-bottom-style:solid;";
+ break;
+ default:
+ cerr << "Warning:Unknown annotation border style: " << style << endl;
+ (*f_curpage) << "border-style:solid;";
+ }
+
+
+ // only RGB colors are used, everything else becomes black
+ auto color = al->getColor();
+ double r,g,b;
+ if(color && (color->getSpace() == AnnotColor::colorRGB))
+ {
+ const double * v = color->getValues();
+ r = v[0];
+ g = v[1];
+ b = v[2];
+ }
+ else
+ {
+ r = g = b = 0;
+ }
+
+ // switch to decimal for the color components, then back to hex
+ (*f_curpage) << "border-color:rgb("
+ << dec << (int)dblToByte(r) << "," << (int)dblToByte(g) << "," << (int)dblToByte(b) << hex
+ << ");";
+ }
+ else
+ {
+ (*f_curpage) << "border-style:none;";
+ }
+ }
+ else
+ {
+ (*f_curpage) << "border-style:none;";
+ }
+
+ tm_transform(default_ctm, x, y);
+
+ (*f_curpage) << "position:absolute;"
+ << "left:" << round(x) << "px;"
+ << "bottom:" << round(y) << "px;"
+ << "width:" << round(w) << "px;"
+ << "height:" << round(h) << "px;";
+
+ // fix for IE
+ (*f_curpage) << "background-color:rgba(255,255,255,0.000001);";
+
+ (*f_curpage) << "\"></div>";
+
+ // close the anchor opened at the top
+ if(dest_str != "")
+ {
+ (*f_curpage) << "</a>";
+ }
+}
+
+}// namespace pdf2htmlEX
diff --git a/src/HTMLRenderer/outline.cc b/src/HTMLRenderer/outline.cc
new file mode 100644
index 0000000..12c3896
--- /dev/null
+++ b/src/HTMLRenderer/outline.cc
@@ -0,0 +1,74 @@
+/*
+ * outline.cc
+ *
+ * Handling Outline items
+ *
+ * by WangLu
+ * 2013.01.28
+ */
+
+#include <iostream>
+
+#include <Outline.h>
+#include <goo/GooList.h>
+
+#include "HTMLRenderer.h"
+#include "util/namespace.h"
+#include "util/encoding.h"
+#include "util/css_const.h"
+
+namespace pdf2htmlEX {
+
+using std::ostream;
+
+/*
+ * Write one level of the outline as a <ul> into f_outline.fs and recurse
+ * into the children of every item. A null or empty list produces no output.
+ */
+void HTMLRenderer::process_outline_items(GooList * items)
+{
+    if(!items)
+        return;
+    if(items->getLength() == 0)
+        return;
+
+    auto & out = f_outline.fs;
+    out << "<ul>";
+
+    for(int idx = 0; idx < items->getLength(); ++idx)
+    {
+        OutlineItem * entry = (OutlineItem*)(items->get(idx));
+
+        // `target` becomes the href, `extra` the optional data-dest-detail attribute
+        string extra;
+        string target = get_linkaction_str(entry->getAction(), extra);
+
+        // the anchor is emitted whether or not a destination was found
+        out << "<li>" << "<a class=\"" << CSS::LINK_CN << "\" href=\"";
+        writeAttribute(out, target);
+        out << "\"";
+
+        // NOTE(review): `extra` is written unescaped between single quotes;
+        // confirm get_linkaction_str never puts a quote into it.
+        if(!extra.empty())
+        {
+            out << " data-dest-detail='" << extra << "'";
+        }
+
+        out << ">";
+        writeUnicodes(out, entry->getTitle(), entry->getTitleLength());
+        out << "</a>";
+
+        // children are only looked at between open() and close()
+        entry->open();
+        if(entry->hasKids())
+            process_outline_items(entry->getKids());
+        entry->close();
+
+        out << "</li>";
+    }
+
+    out << "</ul>";
+}
+
+// Dump the outline of cur_doc, if the document has one.
+void HTMLRenderer::process_outline()
+{
+    if(Outline * doc_outline = cur_doc->getOutline())
+    {
+        process_outline_items(doc_outline->getItems());
+    }
+}
+
+}// namespace pdf2htmlEX
diff --git a/src/HTMLRenderer/state.cc b/src/HTMLRenderer/state.cc
new file mode 100644
index 0000000..f26b17f
--- /dev/null
+++ b/src/HTMLRenderer/state.cc
@@ -0,0 +1,541 @@
+/*
+ * state.cc
+ *
+ * track PDF states
+ *
+ * Copyright (C) 2012,2013 Lu Wang <coolwanglu@gmail.com>
+ */
+
+#include <cmath>
+#include <algorithm>
+
+#include "HTMLRenderer.h"
+
+#include "util/namespace.h"
+#include "util/math.h"
+
+namespace pdf2htmlEX {
+
+using std::max;
+using std::abs;
+
+// Invalidate every tracked attribute and re-read the text position from `state`.
+void HTMLRenderer::updateAll(GfxState * state)
+{
+    updateTextPos(state);
+    all_changed = true;
+}
+// Flag the text rise as changed; check_state_change() picks the flag up.
+void HTMLRenderer::updateRise(GfxState * state)
+{
+    rise_changed = true;
+}
+// Take over the line position of `state` as the current text position.
+void HTMLRenderer::updateTextPos(GfxState * state)
+{
+    cur_tx = state->getLineX();
+    cur_ty = state->getLineY();
+    text_pos_changed = true;
+}
+// Move the current x position back by `shift` thousandths of the font size,
+// scaled by the horizontal scaling of `state`.
+void HTMLRenderer::updateTextShift(GfxState * state, double shift)
+{
+    const double delta = shift * 0.001 * state->getFontSize() * state->getHorizScaling();
+    cur_tx -= delta;
+    text_pos_changed = true;
+}
+// Flag the font (name and size) as changed; check_state_change() picks the flag up.
+void HTMLRenderer::updateFont(GfxState * state)
+{
+    font_changed = true;
+}
+// Flag the CTM as changed and forward the six matrix entries to the tracer.
+void HTMLRenderer::updateCTM(GfxState * state,
+                             double m11, double m12,
+                             double m21, double m22,
+                             double m31, double m32)
+{
+    ctm_changed = true;
+    tracer.update_ctm(state, m11, m12, m21, m22, m31, m32);
+}
+// Flag the text matrix as changed; check_state_change() picks the flag up.
+void HTMLRenderer::updateTextMat(GfxState * state)
+{
+    text_mat_changed = true;
+}
+// Flag the horizontal scaling as changed; check_state_change() picks the flag up.
+void HTMLRenderer::updateHorizScaling(GfxState * state)
+{
+    hori_scale_changed = true;
+}
+// Flag the char (letter) spacing as changed; check_state_change() picks the flag up.
+void HTMLRenderer::updateCharSpace(GfxState * state)
+{
+    letter_space_changed = true;
+}
+// Flag the word spacing as changed; check_state_change() picks the flag up.
+void HTMLRenderer::updateWordSpace(GfxState * state)
+{
+    word_space_changed = true;
+}
+// The render mode is only tracked through its effect on the colors,
+// so both color flags are raised (a dedicated render_changed flag may be needed later).
+void HTMLRenderer::updateRender(GfxState * state)
+{
+    stroke_color_changed = true;
+    fill_color_changed = true;
+}
+// A new fill color space is handled like a new fill color.
+void HTMLRenderer::updateFillColorSpace(GfxState * state)
+{
+    fill_color_changed = true;
+}
+// A new stroke color space is handled like a new stroke color.
+void HTMLRenderer::updateStrokeColorSpace(GfxState * state)
+{
+    stroke_color_changed = true;
+}
+// Flag the fill color as changed; check_state_change() picks the flag up.
+void HTMLRenderer::updateFillColor(GfxState * state)
+{
+    fill_color_changed = true;
+}
+// Flag the stroke color as changed; check_state_change() picks the flag up.
+void HTMLRenderer::updateStrokeColor(GfxState * state)
+{
+    stroke_color_changed = true;
+}
+// Flag the clip region as changed and let the tracer record the clip.
+void HTMLRenderer::clip(GfxState * state)
+{
+    clip_changed = true;
+    tracer.clip(state);
+}
+// Same as clip(), but the tracer is called with its second argument set to true.
+void HTMLRenderer::eoClip(GfxState * state)
+{
+    clip_changed = true;
+    tracer.clip(state, true);
+}
+// Flag the clip region as changed and let the tracer clip to the stroke path.
+void HTMLRenderer::clipToStrokePath(GfxState * state)
+{
+    clip_changed = true;
+    tracer.clip_to_stroke_path(state);
+}
+/*
+ * Bring all cached text, line and clip state back to its initial values and
+ * mark everything as changed, so that the next check_state_change()
+ * re-reads the complete graphics state.
+ */
+void HTMLRenderer::reset_state()
+{
+    draw_text_scale = 1.0;
+    cur_font_size = 0.0;
+    memcpy(cur_text_tm, ID_MATRIX, sizeof(cur_text_tm));
+
+    // text state: the font installed for nullptr, no size, transparent colors, no spacing
+    cur_text_state.font_info = install_font(nullptr);
+    cur_text_state.font_size = 0;
+    cur_text_state.fill_color.transparent = true;
+    cur_text_state.stroke_color.transparent = true;
+    cur_text_state.letter_space = 0;
+    cur_text_state.word_space = 0;
+    cur_text_state.vertical_align = 0;
+
+    // line state: origin, ID_MATRIX as transform, coverage queries routed to this renderer
+    cur_line_state.x = 0;
+    cur_line_state.y = 0;
+    memcpy(cur_line_state.transform_matrix, ID_MATRIX, sizeof(cur_line_state.transform_matrix));
+    cur_line_state.is_char_covered = [this](int index) { return is_char_covered(index);};
+
+    // clip state: empty box
+    cur_clip_state.xmin = 0;
+    cur_clip_state.xmax = 0;
+    cur_clip_state.ymin = 0;
+    cur_clip_state.ymax = 0;
+
+    // tracked and drawn text positions
+    cur_tx = 0;
+    cur_ty = 0;
+    draw_tx = 0;
+    draw_ty = 0;
+
+    // clear the individual flags, then force a full re-check
+    reset_state_change();
+    all_changed = true;
+}
+// Clear every "changed" flag raised by the update*/clip* callbacks.
+void HTMLRenderer::reset_state_change()
+{
+    all_changed = false;
+
+    // position
+    text_pos_changed = false;
+    rise_changed = false;
+
+    // font and matrices
+    font_changed = false;
+    ctm_changed = false;
+    text_mat_changed = false;
+    hori_scale_changed = false;
+
+    // spacing
+    letter_space_changed = false;
+    word_space_changed = false;
+
+    // colors
+    fill_color_changed = false;
+    stroke_color_changed = false;
+
+    // clipping
+    clip_changed = false;
+}
+
+// Raise cur_ls to new_ls; a value that is not greater leaves cur_ls untouched.
+template<class NewLineState>
+void set_line_state(NewLineState & cur_ls, NewLineState new_ls)
+{
+    if(!(new_ls > cur_ls))
+        return;
+    cur_ls = new_ls;
+}
+/*
+ * Reconcile the cached HTML-side state with the current PDF graphics state.
+ *
+ * new_line_state starts at NLS_NONE and is only ever raised through set_line_state():
+ * to NLS_NEWSTATE when a text attribute (font, size, spacing, colors, vertical shift) differs,
+ * to NLS_NEWLINE when the text cannot continue on the current line,
+ * to NLS_NEWCLIP when the clip box differs.
+ * prepare_text_line() acts on the result. All *_changed flags are cleared on exit.
+ */
+void HTMLRenderer::check_state_change(GfxState * state)
+{
+ // DEPENDENCY WARNING
+ // don't adjust the order of state checking
+
+ new_line_state = NLS_NONE;
+
+ if(all_changed || clip_changed)
+ {
+ HTMLClipState new_clip_state;
+ state->getClipBBox(&new_clip_state.xmin, &new_clip_state.ymin, &new_clip_state.xmax, &new_clip_state.ymax);
+ if(!(equal(cur_clip_state.xmin, new_clip_state.xmin)
+ && equal(cur_clip_state.xmax, new_clip_state.xmax)
+ && equal(cur_clip_state.ymin, new_clip_state.ymin)
+ && equal(cur_clip_state.ymax, new_clip_state.ymax)))
+ {
+ cur_clip_state = new_clip_state;
+ set_line_state(new_line_state, NLS_NEWCLIP);
+ }
+ }
+
+ bool need_recheck_position = false;
+ bool need_rescale_font = false;
+ bool draw_text_scale_changed = false;
+
+ // save current info for later use
+ auto old_text_state = cur_text_state;
+ auto old_line_state = cur_line_state;
+ double old_tm[6];
+ memcpy(old_tm, cur_text_tm, sizeof(old_tm));
+ double old_draw_text_scale = draw_text_scale;
+
+ // text position
+ // we've been tracking the text position positively in the update*** functions
+ if(all_changed || text_pos_changed)
+ {
+ need_recheck_position = true;
+ }
+
+ // font name & size
+ if(all_changed || font_changed)
+ {
+ const FontInfo * new_font_info = install_font(state->getFont());
+
+ if(!(new_font_info->id == cur_text_state.font_info->id))
+ {
+ // The width of the type 3 font text, if shown, is likely to be wrong
+ // So we will create separate (absolute positioned) blocks for them, such that it won't affect other text
+ if((new_font_info->is_type3 || cur_text_state.font_info->is_type3) && (!param.process_type3))
+ {
+ set_line_state(new_line_state, NLS_NEWLINE);
+ }
+ else
+ {
+ set_line_state(new_line_state, NLS_NEWSTATE);
+ }
+ cur_text_state.font_info = new_font_info;
+ }
+
+ /*
+ * For Type 3 fonts, we need to take type3_font_size_scale into consideration
+ * NOTE(review): when the font id changed, cur_text_state.font_info was already
+ * replaced by new_font_info above, so both operands below refer to the new font;
+ * confirm whether the previous font's is_type3 was meant to be tested here.
+ */
+ if((new_font_info->is_type3 || cur_text_state.font_info->is_type3) && param.process_type3)
+ need_rescale_font = true;
+
+ double new_font_size = state->getFontSize();
+ if(!equal(cur_font_size, new_font_size))
+ {
+ need_rescale_font = true;
+ cur_font_size = new_font_size;
+ }
+ }
+
+ // ctm & text ctm & hori scale & rise
+ if(all_changed || ctm_changed || text_mat_changed || hori_scale_changed || rise_changed)
+ {
+ double new_text_tm[6];
+
+ double m1[6];
+ double m2[6];
+
+ //the matrix with horizontal_scale and rise
+ m1[0] = state->getHorizScaling();
+ m1[3] = 1;
+ m1[5] = state->getRise();
+ m1[1] = m1[2] = m1[4] = 0;
+
+ tm_multiply(m2, state->getCTM(), state->getTextMat());
+ tm_multiply(new_text_tm, m2, m1);
+
+ if(!tm_equal(new_text_tm, cur_text_tm))
+ {
+ need_recheck_position = true;
+ need_rescale_font = true;
+ memcpy(cur_text_tm, new_text_tm, sizeof(cur_text_tm));
+ }
+ }
+
+ // draw_text_tm, draw_text_scale
+ // depends: font size & ctm & text_ctm & hori scale & rise
+ if(need_rescale_font)
+ {
+ /*
+ * Rescale the font
+ * If the font-size is 1, and the matrix is [10,0,0,10,0,0], we would like to change it to
+ * font-size == 10 and matrix == [1,0,0,1,0,0],
+ * such that it will be easy and natural for web browsers
+ */
+ double new_draw_text_tm[6];
+ memcpy(new_draw_text_tm, cur_text_tm, sizeof(new_draw_text_tm));
+
+ // see how the tm (together with text_scale_factor2) would change the vector (0,1)
+ double new_draw_text_scale = 1.0/text_scale_factor2 * hypot(new_draw_text_tm[2], new_draw_text_tm[3]);
+
+ double new_draw_font_size = cur_font_size;
+
+ if(is_positive(new_draw_text_scale))
+ {
+ // scale both font size and matrix
+ new_draw_font_size *= new_draw_text_scale;
+ for(int i = 0; i < 4; ++i)
+ new_draw_text_tm[i] /= new_draw_text_scale;
+ }
+ else
+ {
+ new_draw_text_scale = 1.0;
+ }
+
+ if(is_positive(-new_draw_font_size))
+ {
+ // CSS cannot handle flipped pages
+ new_draw_font_size *= -1;
+
+ for(int i = 0; i < 4; ++i)
+ new_draw_text_tm[i] *= -1;
+ }
+
+ if(!(equal(new_draw_text_scale, draw_text_scale)))
+ {
+ draw_text_scale_changed = true;
+ draw_text_scale = new_draw_text_scale;
+ }
+
+ if(!equal(new_draw_font_size, cur_text_state.font_size))
+ {
+ set_line_state(new_line_state, NLS_NEWSTATE);
+ cur_text_state.font_size = new_draw_font_size;
+ }
+
+ if(!tm_equal(new_draw_text_tm, cur_line_state.transform_matrix, 4))
+ {
+ set_line_state(new_line_state, NLS_NEWLINE);
+ memcpy(cur_line_state.transform_matrix, new_draw_text_tm, sizeof(cur_line_state.transform_matrix));
+ }
+ }
+
+ // see if the new line is compatible with the current line with proper position shift
+ // don't bother doing the heavy job when (new_line_state == NLS_NEWLINE)
+ // depends: text position & transformation
+ if(need_recheck_position && (new_line_state < NLS_NEWLINE))
+ {
+ // TM[4] and/or TM[5] have been changed
+ // To find an offset (dx,dy), which would cancel the effect
+ /*
+ * CurTM * (cur_tx, cur_ty, 1)^T = OldTM * (draw_tx + dx, draw_ty + dy, 1)^T
+ *
+ * the first 4 elements of CurTM and OldTM should be proportional
+ * otherwise the following text cannot be parallel
+ *
+ * NOTE:
+ * dx,dy are handled by the old state. so they should be multiplied by old_draw_text_scale
+ */
+
+ bool merged = false;
+ double dx = 0;
+ double dy = 0;
+ if(tm_equal(old_line_state.transform_matrix, cur_line_state.transform_matrix, 4))
+ {
+ double det = old_tm[0] * old_tm[3] - old_tm[1] * old_tm[2];
+ if(!equal(det, 0))
+ {
+ double lhs1 = cur_text_tm[0] * cur_tx + cur_text_tm[2] * cur_ty + cur_text_tm[4] - old_tm[0] * draw_tx - old_tm[2] * draw_ty - old_tm[4];
+ double lhs2 = cur_text_tm[1] * cur_tx + cur_text_tm[3] * cur_ty + cur_text_tm[5] - old_tm[1] * draw_tx - old_tm[3] * draw_ty - old_tm[5];
+ /*
+ * Now the equation system becomes
+ *
+ * lhs1 = OldTM[0] * dx + OldTM[2] * dy
+ * lhs2 = OldTM[1] * dx + OldTM[3] * dy
+ */
+
+ double inverted[4];
+ inverted[0] = old_tm[3] / det;
+ inverted[1] = -old_tm[1] / det;
+ inverted[2] = -old_tm[2] / det;
+ inverted[3] = old_tm[0] / det;
+ dx = inverted[0] * lhs1 + inverted[2] * lhs2;
+ dy = inverted[1] * lhs1 + inverted[3] * lhs2;
+ if(equal(dy, 0))
+ {
+ // text on a same horizontal line, we can insert positive or negative x-offsets
+ merged = true;
+ }
+ else if(param.optimize_text)
+ {
+ // otherwise we merge the lines only when
+ // - text are not shifted to the left too much
+ // - text are not moved too high or too low
+ if((dx * old_draw_text_scale) >= -param.space_threshold * old_text_state.em_size() - EPS)
+ {
+ double oldymin = old_text_state.font_info->descent * old_text_state.font_size;
+ double oldymax = old_text_state.font_info->ascent * old_text_state.font_size;
+ double ymin = dy * old_draw_text_scale + cur_text_state.font_info->descent * cur_text_state.font_size;
+ double ymax = dy * old_draw_text_scale + cur_text_state.font_info->ascent * cur_text_state.font_size;
+ if((ymin <= oldymax + EPS) && (ymax >= oldymin - EPS))
+ {
+ merged = true;
+ }
+ }
+ }
+ }
+ //else no solution
+ }
+ // else: different rotation: force new line
+
+ if(merged && !equal(state->getHorizScaling(), 0))
+ {
+ html_text_page.get_cur_line()->append_offset(dx * old_draw_text_scale / state->getHorizScaling());
+ if(equal(dy, 0))
+ {
+ cur_text_state.vertical_align = 0;
+ }
+ else
+ {
+ cur_text_state.vertical_align = (dy * old_draw_text_scale);
+ set_line_state(new_line_state, NLS_NEWSTATE);
+ }
+ draw_tx = cur_tx;
+ draw_ty = cur_ty;
+ }
+ else
+ {
+ set_line_state(new_line_state, NLS_NEWLINE);
+ }
+ }
+ else
+ {
+ // no vertical shift if no need to check position
+ cur_text_state.vertical_align = 0;
+ }
+
+ // letter space
+ // depends: draw_text_scale
+ if(all_changed || letter_space_changed || draw_text_scale_changed)
+ {
+ double new_letter_space = state->getCharSpace() * draw_text_scale;
+ if(!equal(new_letter_space, cur_text_state.letter_space))
+ {
+ cur_text_state.letter_space = new_letter_space;
+ set_line_state(new_line_state, NLS_NEWSTATE);
+ }
+ }
+
+ // word space
+ // depends draw_text_scale
+ if(all_changed || word_space_changed || draw_text_scale_changed)
+ {
+ double new_word_space = state->getWordSpace() * draw_text_scale;
+ if(!equal(new_word_space, cur_text_state.word_space))
+ {
+ cur_text_state.word_space = new_word_space;
+ set_line_state(new_line_state, NLS_NEWSTATE);
+ }
+ }
+
+ // fill color
+ if((!(param.fallback)) && (all_changed || fill_color_changed))
+ {
+ // * PDF Spec. Table 106 –Text rendering modes
+ static const char FILL[8] = { true, false, true, false, true, false, true, false };
+
+ int idx = state->getRender();
+ assert((idx >= 0) && (idx < 8));
+ Color new_fill_color;
+ if(FILL[idx])
+ {
+ new_fill_color.transparent = false;
+ state->getFillRGB(&new_fill_color.rgb);
+ }
+ else
+ {
+ new_fill_color.transparent = true;
+ }
+ if(!(new_fill_color == cur_text_state.fill_color))
+ {
+ cur_text_state.fill_color = new_fill_color;
+ set_line_state(new_line_state, NLS_NEWSTATE);
+ }
+ }
+
+ // stroke color
+ if((!(param.fallback)) && (all_changed || stroke_color_changed))
+ {
+ // * PDF Spec. Table 106 – Text rendering modes
+ static const char STROKE[8] = { false, true, true, false, false, true, true, false };
+
+ int idx = state->getRender();
+ assert((idx >= 0) && (idx < 8));
+ Color new_stroke_color;
+ // stroke
+ if(STROKE[idx])
+ {
+ new_stroke_color.transparent = false;
+ state->getStrokeRGB(&new_stroke_color.rgb);
+ }
+ else
+ {
+ new_stroke_color.transparent = true;
+ }
+ if(!(new_stroke_color == cur_text_state.stroke_color))
+ {
+ cur_text_state.stroke_color = new_stroke_color;
+ set_line_state(new_line_state, NLS_NEWSTATE);
+ }
+ }
+
+ reset_state_change();
+}
+
+/*
+ * Act on new_line_state (as computed by check_state_change):
+ * register a new clip, open a new line or realign the x position on the
+ * current one, and finally record the text state if anything changed.
+ */
+void HTMLRenderer::prepare_text_line(GfxState * state)
+{
+    // without a current line everything has to be set up from scratch
+    if(!(html_text_page.get_cur_line()))
+    {
+        new_line_state = NLS_NEWCLIP;
+    }
+
+    if(new_line_state >= NLS_NEWCLIP)
+        html_text_page.clip(cur_clip_state);
+
+    if(new_line_state < NLS_NEWLINE)
+    {
+        // stay on the current line: compensate the drift between the tracked
+        // and the drawn x position with an offset
+        double shift = (cur_tx - draw_tx) * draw_text_scale;
+        if(!equal(shift, 0))
+        {
+            html_text_page.get_cur_line()->append_offset(shift);
+            draw_tx += shift / draw_text_scale;
+        }
+    }
+    else
+    {
+        // origin of the new line: the current position plus the rise, in device space
+        double rise_dx, rise_dy;
+        state->textTransformDelta(0, state->getRise(), &rise_dx, &rise_dy);
+        state->transform(state->getCurX() + rise_dx, state->getCurY() + rise_dy, &cur_line_state.x, &cur_line_state.y);
+
+        if (param.correct_text_visibility)
+        {
+            cur_line_state.first_char_index = get_char_count();
+        }
+
+        html_text_page.open_new_line(cur_line_state);
+
+        cur_text_state.vertical_align = 0;
+
+        // the drawn position is in sync again
+        draw_tx = cur_tx;
+        draw_ty = cur_ty;
+    }
+
+    if(new_line_state != NLS_NONE)
+        html_text_page.get_cur_line()->append_state(cur_text_state);
+}
+
+} //namespace pdf2htmlEX
diff --git a/src/HTMLRenderer/text.cc b/src/HTMLRenderer/text.cc
new file mode 100644
index 0000000..e58a17a
--- /dev/null
+++ b/src/HTMLRenderer/text.cc
@@ -0,0 +1,166 @@
+/*
+ * text.cc
+ *
+ * Handling text & font, and related stuff
+ *
+ * Copyright (C) 2012 Lu Wang <coolwanglu@gmail.com>
+ */
+
+
+#include <algorithm>
+
+#include "HTMLRenderer.h"
+
+#include "util/namespace.h"
+#include "util/unicode.h"
+
+//#define HR_DEBUG(x) (x)
+#define HR_DEBUG(x)
+
+namespace pdf2htmlEX {
+
+using std::none_of;
+using std::cerr;
+using std::endl;
+/*
+ * Append the characters of string s to the current HTML text line.
+ *
+ * Nothing is emitted when the string is empty, when the state has no font, when the font
+ * reports a writing mode (getWMode() is non-zero) or when it is a Type 3 font and
+ * param.process_type3 is off.
+ * Otherwise the state is synchronized (check_state_change + prepare_text_line), each char is
+ * reported to the tracer and appended to the line, and cur_tx/cur_ty as well as
+ * draw_tx/draw_ty are advanced by the accumulated displacement.
+ */
+void HTMLRenderer::drawString(GfxState * state, GooString * s)
+{
+ if(s->getLength() == 0)
+ return;
+
+ auto font = state->getFont();
+ double cur_letter_space = state->getCharSpace();
+ double cur_word_space = state->getWordSpace();
+ double cur_horiz_scaling = state->getHorizScaling();
+
+
+ // Writing mode fonts and Type 3 fonts are rendered as images
+ // I don't find a way to display writing mode fonts in HTML except for one div for each character, which is too costly
+ // For type 3 fonts, due to the font matrix, still it's hard to show it on HTML
+ if( (font == nullptr)
+ || (font->getWMode())
+ || ((font->getType() == fontType3) && (!param.process_type3))
+ )
+ {
+ return;
+ }
+
+ // see if the line has to be closed due to state change
+ check_state_change(state);
+ prepare_text_line(state);
+
+ // Now ready to output
+ // get the unicodes
+ char *p = s->getCString();
+ int len = s->getLength();
+
+ //accumulated displacement of chars in this string, in text object space
+ double dx = 0;
+ double dy = 0;
+ //displacement of current char, in text object space, including letter space but not word space.
+ double ddx, ddy;
+ //advance of current char, in glyph space
+ double ax, ay;
+ //origin of current char, in glyph space
+ double ox, oy;
+
+ int uLen;
+
+ CharCode code;
+ Unicode *u = nullptr;
+
+ HR_DEBUG(printf("HTMLRenderer::drawString:len=%d\n", len));
+
+ while (len > 0)
+ {
+ auto n = font->getNextChar(p, len, &code, &u, &uLen, &ax, &ay, &ox, &oy);
+ HR_DEBUG(printf("HTMLRenderer::drawString:unicode=%lc(%d)\n", (wchar_t)u[0], u[0]));
+
+ if(!(equal(ox, 0) && equal(oy, 0)))
+ {
+ cerr << "TODO: non-zero origins" << endl;
+ }
+ ddx = ax * cur_font_size + cur_letter_space;
+ ddy = ay * cur_font_size;
+ tracer.draw_char(state, dx, dy, ax, ay);
+
+ bool is_space = false;
+ if (n == 1 && *p == ' ')
+ {
+ /*
+ * This is by standard
+ * however some PDF will use ' ' as a normal encoding slot
+ * such that it will be mapped to other unicodes
+ * In that case, when space_as_offset is on, we will simply ignore that character...
+ *
+ * Checking mapped unicode may or may not work
+ * There are always ugly PDF files with no useful info at all.
+ */
+ is_space = true;
+ }
+
+ if(is_space && (param.space_as_offset))
+ {
+ html_text_page.get_cur_line()->append_padding_char();
+ // ignore horiz_scaling, as it has been merged into CTM
+ html_text_page.get_cur_line()->append_offset((ax * cur_font_size + cur_letter_space + cur_word_space) * draw_text_scale);
+ }
+ else
+ {
+ if((param.decompose_ligature) && (uLen > 1) && none_of(u, u+uLen, is_illegal_unicode))
+ {
+ html_text_page.get_cur_line()->append_unicodes(u, uLen, ddx);
+ }
+ else
+ {
+ Unicode uu;
+ if(cur_text_state.font_info->use_tounicode)
+ {
+ uu = check_unicode(u, uLen, code, font);
+ }
+ else
+ {
+ uu = unicode_from_font(code, font);
+ }
+ html_text_page.get_cur_line()->append_unicodes(&uu, 1, ddx);
+ /*
+ * In PDF, word_space is appended if (n == 1 and *p = ' ')
+ * but in HTML, word_space is appended if (uu == ' ')
+ */
+ int space_count = (is_space ? 1 : 0) - ((uu == ' ') ? 1 : 0);
+ if(space_count != 0)
+ {
+ html_text_page.get_cur_line()->append_offset(cur_word_space * draw_text_scale * space_count);
+ }
+ }
+ }
+
+ dx += ddx * cur_horiz_scaling;
+ dy += ddy;
+ if (is_space)
+ dx += cur_word_space * cur_horiz_scaling;
+
+ p += n;
+ len -= n; // NOTE(review): relies on getNextChar returning n > 0, otherwise this loop would not terminate
+ }
+
+ cur_tx += dx;
+ cur_ty += dy;
+
+ draw_tx += dx;
+ draw_ty += dy;
+}
+
+/*
+ * Tell whether the char with the given index is reported as covered by
+ * covered_text_detector.
+ * An index outside the detector's range prints a warning to stderr and is
+ * treated as "not covered".
+ */
+bool HTMLRenderer::is_char_covered(int index)
+{
+    // Bind by reference: a plain `auto` copies the whole container on every
+    // single lookup. If the getter returns a temporary, the const reference
+    // extends its lifetime, so this is safe for either return type.
+    const auto & covered = covered_text_detector.get_chars_covered();
+    if (index < 0 || index >= (int)covered.size())
+    {
+        std::cerr << "Warning: HTMLRenderer::is_char_covered: index out of bound: "
+                  << index << ", size: " << covered.size() <<endl;
+        return false;
+    }
+    return covered[index];
+}
+
+} // namespace pdf2htmlEX
diff --git a/src/HTMLState.h b/src/HTMLState.h
new file mode 100644
index 0000000..ef7e29f
--- /dev/null
+++ b/src/HTMLState.h
@@ -0,0 +1,82 @@
+/*
+ * Header file for HTMLState
+ * Copyright (C) 2013 Lu Wang <coolwanglu@gmail.com>
+ */
+#ifndef HTMLSTATE_H__
+#define HTMLSTATE_H__
+
+#include <functional>
+
+#include "Color.h"
+
+namespace pdf2htmlEX {
+
+// Per-font data shared by all text states that use the font.
+struct FontInfo
+{
+    // identifier of the font; also stored as the FONT_ID of a text state
+    long long id;
+    // selects how drawString maps a char code to a Unicode value
+    bool use_tounicode;
+    // a value of 0 makes single_space_offset() ignore space_width
+    int em_size;
+    // width of ' ', multiplied by the font size where it is used
+    double space_width;
+    // vertical extents, multiplied by the font size where they are used
+    double ascent;
+    double descent;
+    bool is_type3;
+    /*
+     * Type 3 fonts carry a font matrix, so a glyph of 1pt may be very
+     * large or very small, which might not hold for other font formats
+     * such as ttf.
+     *
+     * When a Type 3 font is saved as ttf it is therefore scaled to about 1,
+     * and the scaling is applied again when the font is used.
+     *
+     * That scaling factor is stored here; it is 1 for other fonts.
+     */
+    double font_size_scale;
+};
+
+// Text attributes that may change within a line.
+struct HTMLTextState
+{
+    const FontInfo * font_info;
+    double font_size;
+    Color fill_color;
+    Color stroke_color;
+    double letter_space;
+    double word_space;
+
+    // vertical shift, relative to the previous state
+    double vertical_align;
+
+    // The offset caused by a single ' ' char: both spacings, plus the width
+    // of the space glyph unless the font's em_size is 0.
+    double single_space_offset() const
+    {
+        const double spacing = word_space + letter_space;
+        if(font_info->em_size == 0)
+            return spacing;
+        return spacing + font_info->space_width * font_size;
+    }
+
+    // The em size of this state: ascent minus descent, scaled by the font size.
+    double em_size() const
+    {
+        const double extent = font_info->ascent - font_info->descent;
+        return font_size * extent;
+    }
+};
+
+// Attributes that are fixed for a whole line.
+struct HTMLLineState
+{
+    // position of the line
+    double x,y;
+    // the first four entries of the line's transform matrix
+    double transform_matrix[4];
+    // The page-scope char index (in drawing order) of the first char in this line;
+    // a negative value makes dump_chars skip the per-char coverage checks.
+    int first_char_index;
+    // A function to determine whether a char is covered at a given index.
+    std::function<bool(int)> is_char_covered;
+
+    // Zero the numeric members as well, so a default-constructed state can be
+    // copied and compared without reading indeterminate values.
+    HTMLLineState(): x(0), y(0), transform_matrix(), first_char_index(-1) { }
+};
+
+// Clip box as delivered by GfxState::getClipBBox in check_state_change().
+struct HTMLClipState
+{
+    double xmin;
+    double xmax;
+    double ymin;
+    double ymax;
+};
+
+} // namespace pdf2htmlEX
+
+#endif //HTMLSTATE_H__
diff --git a/src/HTMLTextLine.cc b/src/HTMLTextLine.cc
new file mode 100644
index 0000000..a0be286
--- /dev/null
+++ b/src/HTMLTextLine.cc
@@ -0,0 +1,734 @@
+/*
+ * HTMLTextLine.cc
+ *
+ * Generate optimized HTML for one line
+ *
+ * Copyright (C) 2012,2013 Lu Wang <coolwanglu@gmail.com>
+ */
+
+#include <cmath>
+#include <algorithm>
+
+#include "HTMLTextLine.h"
+
+#include "util/encoding.h"
+#include "util/css_const.h"
+
+namespace pdf2htmlEX {
+
+using std::min;
+using std::max;
+using std::vector;
+using std::ostream;
+using std::cerr;
+using std::endl;
+using std::find;
+using std::abs;
+
+// Start an empty line: keeps the given parameters, managers and line state,
+// with the clip origin at (0,0) and zero accumulated width.
+HTMLTextLine::HTMLTextLine (const HTMLLineState & line_state, const Param & param, AllStateManager & all_manager)
+    : param(param),
+      all_manager(all_manager),
+      line_state(line_state),
+      clip_x1(0),
+      clip_y1(0),
+      width(0)
+{
+}
+
+/*
+ * Append one char made of the `l` Unicode values at `u`, and add `width`
+ * to the width of the line.
+ *
+ * A single value is stored directly in `text` (capped at INT_MAX).
+ * A longer sequence is stored in decomposed_text, and `text` receives
+ * -(slot)-1, where `slot` is the index of the sequence in decomposed_text.
+ * For l <= 0 only the width is accumulated.
+ */
+void HTMLTextLine::append_unicodes(const Unicode * u, int l, double width)
+{
+    if (l == 1)
+    {
+        text.push_back(min(u[0], (unsigned)INT_MAX));
+    }
+    else if (l > 1)
+    {
+        // Compute the marker in signed arithmetic; negating the unsigned size()
+        // would rely on wrap-around followed by an implicit narrowing.
+        const int slot = static_cast<int>(decomposed_text.size());
+        text.push_back(-slot - 1);
+        decomposed_text.emplace_back();
+        decomposed_text.back().assign(u, u + l);
+    }
+    this->width += width;
+}
+
+/*
+ * Record a horizontal offset of `width` and add it to the width of the line.
+ *
+ * The offset is anchored right after the last real (non-padding) char, i.e.
+ * trailing zero entries of `text` are skipped; otherwise the text optimizer
+ * could mistake an offset at the start of a line for word space.
+ * Very thin offsets are not filtered here, near-zero ones are dropped on output.
+ * An offset landing on the anchor of the previous one is merged into it.
+ */
+void HTMLTextLine::append_offset(double width)
+{
+    auto anchor = text.size();
+    for(; anchor > 0; --anchor)
+    {
+        if(text[anchor - 1] != 0)
+            break;
+    }
+
+    const bool extends_last = (!offsets.empty()) && (offsets.back().start_idx == anchor);
+    if(extends_last)
+    {
+        offsets.back().width += width;
+    }
+    else
+    {
+        offsets.emplace_back(anchor, width);
+    }
+
+    this->width += width;
+}
+
+/*
+ * Make `text_state` the state that applies from the current end of the text.
+ * A state that already starts at that index is overwritten, otherwise a new
+ * one is added. The stored font size includes the font's font_size_scale.
+ */
+void HTMLTextLine::append_state(const HTMLTextState & text_state)
+{
+    const bool reuse_last = (!states.empty()) && (states.back().start_idx == text.size());
+    if(!reuse_last)
+    {
+        states.emplace_back();
+        auto & fresh = states.back();
+        fresh.start_idx = text.size();
+        fresh.hash_umask = 0;
+    }
+
+    // copy only the HTMLTextState part of the entry
+    HTMLTextState & target = states.back();
+    target = text_state;
+    // apply font scale
+    target.font_size *= target.font_info->font_size_scale;
+}
+
+/*
+ * Write the char stored at text[pos] to `out`.
+ * A positive entry is a single Unicode value, a negative entry -k-1 refers to
+ * the sequence decomposed_text[k], and a zero entry (padding) writes nothing.
+ */
+void HTMLTextLine::dump_char(std::ostream & out, int pos)
+{
+    const int c = text[pos];
+    if (c > 0)
+    {
+        Unicode u = c;
+        writeUnicodes(out, &u, 1);
+    }
+    else if (c < 0)
+    {
+        // bind by reference: copying the whole sequence for every dumped char is wasted work
+        auto & dt = decomposed_text[- c - 1];
+        writeUnicodes(out, &dt.front(), dt.size());
+    }
+}
+
+/*
+ * Write the `len` chars starting at text index `begin`.
+ * When the line has a first_char_index (>= 0), every run of chars reported
+ * as covered by line_state.is_char_covered is wrapped in a <span> carrying
+ * the transparent fill and stroke color classes.
+ */
+void HTMLTextLine::dump_chars(ostream & out, int begin, int len)
+{
+    static const Color transparent(0, 0, 0, true);
+
+    const bool check_coverage = !(line_state.first_char_index < 0);
+    if (!check_coverage)
+    {
+        for (int k = begin; k < begin + len; k++)
+            dump_char(out, k);
+        return;
+    }
+
+    bool invisible_group_open = false;
+    for(int k = 0; k < len; k++)
+    {
+        const int pos = begin + k;
+        const bool hidden = line_state.is_char_covered(line_state.first_char_index + pos);
+
+        // open or close the wrapper only where the visibility flips
+        if (hidden != invisible_group_open)
+        {
+            if (hidden)
+            {
+                out << "<span class=\"" << all_manager.fill_color.get_css_class_name()
+                    << all_manager.fill_color.install(transparent) << " " << all_manager.stroke_color.get_css_class_name()
+                    << all_manager.stroke_color.install(transparent) << "\">";
+            }
+            else
+            {
+                out << "</span>";
+            }
+            invisible_group_open = hidden;
+        }
+
+        dump_char(out, pos);
+    }
+
+    if (invisible_group_open)
+    {
+        out << "</span>";
+    }
+}
+/*
+ * Write this line to `out` as one <div>.
+ * The text states are exported through State::begin/State::end (kept on a stack so that a
+ * new state can be expressed as a diff against the closest suitable parent), and the
+ * offsets are written either as a plain ' ' or as an empty/blank whitespace <span>.
+ * Does nothing when the line has no text; warns and returns when no state starts at index 0.
+ */
+void HTMLTextLine::dump_text(ostream & out)
+{
+ /*
+ * Each Line is an independent absolute positioned block
+ * so even we have a few states or offsets, we may omit them
+ */
+ if(text.empty())
+ return;
+
+ if(states.empty() || (states[0].start_idx != 0))
+ {
+ cerr << "Warning: text without a style! Must be a bug in pdf2htmlEX" << endl;
+ return;
+ }
+
+ // Start Output
+ {
+ // open <div> for the current text line
+ out << "<div class=\"" << CSS::LINE_CN
+ << " " << CSS::TRANSFORM_MATRIX_CN << all_manager.transform_matrix.install(line_state.transform_matrix)
+ << " " << CSS::LEFT_CN << all_manager.left.install(line_state.x - clip_x1)
+ << " " << CSS::HEIGHT_CN << all_manager.height.install(ascent)
+ << " " << CSS::BOTTOM_CN << all_manager.bottom.install(line_state.y - clip_y1)
+ ;
+ // it will be closed by the first state
+ }
+
+ std::vector<State*> stack;
+ // a special safeguard in the bottom
+ stack.push_back(nullptr);
+
+ //accumulated horizontal offset;
+ double dx = 0;
+
+ // whenever a negative offset appears, we should not pop out that <span>
+ // otherwise the effect of negative margin-left would disappear
+ size_t last_text_pos_with_negative_offset = 0;
+ size_t cur_text_idx = 0;
+ // state_iter1 is the state being exported, state_iter2 always points one element ahead of it
+ auto cur_offset_iter = offsets.begin();
+ for(auto state_iter2 = states.begin(), state_iter1 = state_iter2++;
+ state_iter1 != states.end();
+ ++state_iter1, ++state_iter2) // NOTE(review): after the last state this increments state_iter2 although it already equals end()
+ {
+ // export current state, find a closest parent
+ {
+ // greedy
+ double vertical_align = state_iter1->vertical_align;
+ int best_cost = State::HASH_ID_COUNT + 1;
+ // we have a nullptr at the beginning, so no need to check for rend
+ for(auto iter = stack.rbegin(); *iter; ++iter)
+ {
+ int cost = state_iter1->diff(**iter);
+ if(!equal(vertical_align,0))
+ ++cost;
+
+ if(cost < best_cost)
+ {
+ while(stack.back() != *iter)
+ {
+ stack.back()->end(out);
+ stack.pop_back();
+ }
+ best_cost = cost;
+ state_iter1->vertical_align = vertical_align;
+
+ if(best_cost == 0)
+ break;
+ }
+
+ // cannot go further
+ if((*iter)->start_idx <= last_text_pos_with_negative_offset)
+ break;
+
+ vertical_align += (*iter)->vertical_align;
+ }
+ //
+ state_iter1->ids[State::VERTICAL_ALIGN_ID] = all_manager.vertical_align.install(state_iter1->vertical_align);
+ // export the diff between *state_iter1 and stack.back()
+ state_iter1->begin(out, stack.back());
+ stack.push_back(&*state_iter1);
+ }
+
+ // [state_iter1->start_idx, text_idx2) are covered by the current state
+ size_t text_idx2 = (state_iter2 == states.end()) ? text.size() : state_iter2->start_idx;
+
+ // dump all text and offsets before next state
+ while(true)
+ {
+ if((cur_offset_iter != offsets.end())
+ && (cur_offset_iter->start_idx <= cur_text_idx))
+ {
+ if(cur_offset_iter->start_idx > text_idx2)
+ break;
+ // next is offset
+ double target = cur_offset_iter->width + dx;
+ double actual_offset = 0;
+
+ //ignore near-zero offsets
+ if(std::abs(target) <= param.h_eps)
+ {
+ actual_offset = 0;
+ }
+ else
+ {
+ bool done = false;
+ // check if the offset is equivalent to a single ' '
+ if(!(state_iter1->hash_umask & State::umask_by_id(State::WORD_SPACE_ID)))
+ {
+ double space_off = state_iter1->single_space_offset();
+ if(std::abs(target - space_off) <= param.h_eps)
+ {
+ Unicode u = ' ';
+ writeUnicodes(out, &u, 1);
+ actual_offset = space_off;
+ done = true;
+ }
+ }
+
+ // finally, just dump it
+ if(!done)
+ {
+ long long wid = all_manager.whitespace.install(target, &actual_offset);
+
+ if(!equal(actual_offset, 0))
+ {
+ if(is_positive(-actual_offset))
+ last_text_pos_with_negative_offset = cur_text_idx;
+
+ double threshold = state_iter1->em_size() * (param.space_threshold);
+
+ out << "<span class=\"" << CSS::WHITESPACE_CN
+ << ' ' << CSS::WHITESPACE_CN << wid << "\">" << (target > (threshold - EPS) ? " " : "") << "</span>";
+ }
+ }
+ }
+ dx = target - actual_offset; // carry the part of the offset that was not emitted over to the next one
+ ++ cur_offset_iter;
+ }
+ else
+ {
+ if(cur_text_idx >= text_idx2)
+ break;
+ // next is text
+ size_t next_text_idx = text_idx2;
+ if((cur_offset_iter != offsets.end()) && (cur_offset_iter->start_idx) < next_text_idx)
+ next_text_idx = cur_offset_iter->start_idx;
+ dump_chars(out, cur_text_idx, next_text_idx - cur_text_idx);
+ cur_text_idx = next_text_idx;
+ }
+ }
+ }
+
+ // we have a nullptr in the bottom
+ while(stack.back())
+ {
+ stack.back()->end(out);
+ stack.pop_back();
+ }
+
+ out << "</div>";
+}
+
+/*
+ * Drop the content collected for this line: states, offsets, text and the
+ * decomposed sequences that only entries of `text` can refer to.
+ * NOTE(review): `width` is left untouched, as before; confirm whether it
+ * should be reset as well.
+ */
+void HTMLTextLine::clear(void)
+{
+    states.clear();
+    offsets.clear();
+    text.clear();
+    // no entry of `text` is left that could index into it
+    decomposed_text.clear();
+}
+
+// Remember the lower-left corner (xmin, ymin) of the clip box;
+// dump_text() positions the line relative to it.
+void HTMLTextLine::clip(const HTMLClipState & clip_state)
+{
+    clip_y1 = clip_state.ymin;
+    clip_x1 = clip_state.xmin;
+}
+
+/*
+ * Install the attributes of every state in the managers, hash the states,
+ * and compute the vertical extents of the line: `ascent` (used as the height
+ * of the div) and `descent`, both starting at 0.
+ * The vertical_align id cannot be determined at this point.
+ */
+void HTMLTextLine::prepare(void)
+{
+    ascent = 0;
+    descent = 0;
+
+    // running sum of the relative vertical shifts of the states
+    double shift_sum = 0;
+
+    for(auto & st : states)
+    {
+        auto font_info = st.font_info;
+
+        st.ids[State::FONT_ID] = font_info->id;
+        st.ids[State::FONT_SIZE_ID] = all_manager.font_size.install(st.font_size);
+        st.ids[State::FILL_COLOR_ID] = all_manager.fill_color.install(st.fill_color);
+        st.ids[State::STROKE_COLOR_ID] = all_manager.stroke_color.install(st.stroke_color);
+        st.ids[State::LETTER_SPACE_ID] = all_manager.letter_space.install(st.letter_space);
+        st.ids[State::WORD_SPACE_ID] = all_manager.word_space.install(st.word_space);
+        st.hash();
+
+        shift_sum += st.vertical_align;
+
+        const double top = shift_sum + font_info->ascent * st.font_size;
+        if(top > ascent)
+        {
+            ascent = top;
+        }
+
+        const double bottom = shift_sum + font_info->descent * st.font_size;
+        if(bottom < descent)
+        {
+            descent = bottom;
+        }
+    }
+}
+
+
+// Run the aggressive optimizer when param.optimize_text is 3, the normal one otherwise.
+void HTMLTextLine::optimize(std::vector<HTMLTextLine*> & lines)
+{
+    if(param.optimize_text != 3)
+    {
+        optimize_normal(lines);
+        return;
+    }
+    optimize_aggressive(lines);
+}
+/*
+ * Adjust letter space and word space in order to reduce the number of HTML elements
+ * May also unmask word space
+ *
+ * Works state by state: for the text covered by each state, the offsets are
+ * re-expressed relative to a possibly new letter space, then word space is chosen
+ * when the covered text contains no ' '. The rewritten offsets replace `offsets`
+ * and this line is appended to `lines`.
+ */
+void HTMLTextLine::optimize_normal(std::vector<HTMLTextLine*> & lines)
+{
+ // remove useless states in the end
+ while((!states.empty()) && (states.back().start_idx >= text.size()))
+ states.pop_back();
+
+ assert(!states.empty());
+
+ const long long word_space_umask = State::umask_by_id(State::WORD_SPACE_ID);
+
+ // for optimization, we need accurate values
+ auto & ls_manager = all_manager.letter_space;
+ auto & ws_manager = all_manager.word_space;
+
+ // statistics of widths
+ std::map<double, size_t> width_map;
+ // store optimized offsets
+ std::vector<Offset> new_offsets;
+ new_offsets.reserve(offsets.size());
+
+ auto offset_iter1 = offsets.begin();
+ for(auto state_iter2 = states.begin(), state_iter1 = state_iter2++; // state_iter2 always runs one state ahead of state_iter1
+ state_iter1 != states.end();
+ ++state_iter1, ++state_iter2)
+ {
+ const size_t text_idx1 = state_iter1->start_idx;
+ const size_t text_idx2 = (state_iter2 == states.end()) ? text.size() : state_iter2->start_idx; // one past the last text index covered by this state
+ size_t text_count = text_idx2 - text_idx1;
+
+ // there might be some offsets before the first state
+ while((offset_iter1 != offsets.end())
+ && (offset_iter1->start_idx <= text_idx1))
+ {
+ new_offsets.push_back(*(offset_iter1++)); // copied unchanged
+ }
+
+ // find the last offset covered by the current state
+ auto offset_iter2 = offset_iter1;
+ for(; (offset_iter2 != offsets.end()) && (offset_iter2->start_idx <= text_idx2); ++offset_iter2) { }
+
+ // There are `offset_count` <span>'s, the target is to reduce this number
+ size_t offset_count = offset_iter2 - offset_iter1;
+ assert(text_count >= offset_count);
+
+ // Optimize letter space
+ // how much letter_space is changed
+ // will be later used for optimizing word space
+ double letter_space_diff = 0;
+ width_map.clear();
+
+ // In some PDF files all letter spaces are implemented as position shifts between each letter
+ // try to simplify it with a proper letter space
+ if(offset_count > 0)
+ {
+ // mark the current letter_space
+ if(text_count > offset_count)
+ width_map.insert(std::make_pair(0, text_count - offset_count)); // characters without an offset count as width 0
+
+ for(auto off_iter = offset_iter1; off_iter != offset_iter2; ++off_iter)
+ {
+ const double target = off_iter->width;
+ auto iter = width_map.lower_bound(target-EPS);
+ if((iter != width_map.end()) && (std::abs(iter->first - target) <= EPS)) // widths within EPS share one bucket
+ {
+ ++ iter->second;
+ }
+ else
+ {
+ width_map.insert(iter, std::make_pair(target, 1));
+ }
+ }
+
+ // TODO snapping the widths may give a better result
+ // e.g. for (-0.7 0.6 -0.2 0.3 10 10), 0 is better than 10
+ double most_used_width = 0;
+ size_t max_count = 0;
+ for(auto iter = width_map.begin(); iter != width_map.end(); ++iter)
+ {
+ if(iter->second > max_count)
+ {
+ most_used_width = iter->first;
+ max_count = iter->second;
+ }
+ }
+
+ // negative letter space may cause problems
+ if((max_count <= text_count / 2) || (!is_positive(state_iter1->letter_space + most_used_width))) // only switch when more than half of the characters benefit
+ {
+ // the old value is the best
+ // just copy old offsets
+ new_offsets.insert(new_offsets.end(), offset_iter1, offset_iter2);
+ }
+ else
+ {
+ // now we would like to adjust letter space to most_used width
+
+ // install new letter space
+ const double old_ls = state_iter1->letter_space;
+ state_iter1->ids[State::LETTER_SPACE_ID] = ls_manager.install(old_ls + most_used_width, &(state_iter1->letter_space)); // install() also receives a pointer to letter_space as its second argument
+ letter_space_diff = old_ls - state_iter1->letter_space; // computed from the letter_space value present after install()
+ // update offsets
+ auto off_iter = offset_iter1;
+ // re-count number of offsets
+ offset_count = 0;
+ for(size_t cur_text_idx = text_idx1; cur_text_idx < text_idx2; ++cur_text_idx)
+ {
+ double cur_width = 0;
+ if((off_iter != offset_iter2) && (off_iter->start_idx == cur_text_idx + 1))
+ {
+ cur_width = off_iter->width + letter_space_diff;
+ ++off_iter;
+ }
+ else
+ {
+ cur_width = letter_space_diff ;
+ }
+ if(!equal(cur_width, 0)) // offsets that became zero are dropped
+ {
+ new_offsets.emplace_back(cur_text_idx+1, cur_width);
+ ++ offset_count;
+ }
+ }
+ }
+ }
+
+ // Optimize word space
+
+ // In some PDF files all spaces are converted into positioning shift
+ // We may try to change (some of) them to ' ' by adjusting word_space
+ // for now, we consider only the no-space scenario
+ // which also includes the case when param.space_as_offset is set
+
+ // get the text segment covered by current state (*state_iter1)
+ const auto text_iter1 = text.begin() + text_idx1;
+ const auto text_iter2 = text.begin() + text_idx2;
+ if(find(text_iter1, text_iter2, ' ') == text_iter2)
+ {
+ // if there is not any space, we may change the value of word_space arbitrarily
+ // note that we may only change word space, no offset will be affected
+ // The actual effect will emerge during flushing, where it could be detected that an offset can be optimized as a single space character
+
+ if(offset_count > 0)
+ {
+ double threshold = (state_iter1->em_size()) * (param.space_threshold);
+ // set word_space for the most frequently used offset
+ double most_used_width = 0;
+ size_t max_count = 0;
+
+ // if offset_count > 0, we must have updated width_map in the previous step
+ // find the most frequent width, with new letter space applied
+ for(auto iter = width_map.begin(); iter != width_map.end(); ++iter)
+ {
+ double fixed_width = iter->first + letter_space_diff; // this is the actual offset in HTML
+ // we don't want to add spaces for tiny gaps, or even negative shifts
+ if((fixed_width >= threshold - EPS) && (iter->second > max_count))
+ {
+ max_count = iter->second;
+ most_used_width = fixed_width;
+ }
+ }
+
+ state_iter1->word_space = 0; // clear word_space for single_space_offset
+ double new_word_space = most_used_width - state_iter1->single_space_offset();
+ state_iter1->ids[State::WORD_SPACE_ID] = ws_manager.install(new_word_space, &(state_iter1->word_space)); // install new word_space
+ state_iter1->hash_umask &= (~word_space_umask); // mark that the word_space is not free
+ }
+ else // there is no offset at all
+ {
+ state_iter1->hash_umask |= word_space_umask; // we just free word_space
+ }
+ }
+ offset_iter1 = offset_iter2; // the next state continues after the offsets consumed here
+ }
+
+ // apply optimization
+ std::swap(offsets, new_offsets);
+
+ lines.push_back(this); // the normal strategy never splits the line
+}
+
+// for optimize-text == 3
+//
+// The aggressive strategy is not implemented yet (only the sketch below exists).
+// With an empty body nothing was ever appended to `lines`, so the caller, which
+// replaces its list of lines with `lines`, silently lost every text line of the page.
+// Until the real implementation is written, fall back to the normal optimization,
+// which always appends this line to `lines`.
+void HTMLTextLine::optimize_aggressive(std::vector<HTMLTextLine*> & lines)
+{
+    /*
+     * Sketch of the intended implementation, kept for reference:
+     *
+    HTMLLineState original_line_state = line_state;
+    // break the line if there are a large (positive or negative) shift
+    // letter space / word space are not taken into consideration (yet)
+    while(true)
+    {
+    }
+
+    // aggressive optimization
+    if(target > state_iter1->em_size() * (param.space_threshold) - EPS)
+        out << ' ';
+    dx = 0;
+    lines.push_back(this);
+    */
+
+    optimize_normal(lines);
+}
+
+// this state will be converted to a child node of the node of prev_state
+// dump the difference from the previous state: the opening tag (if any) is written to out
+// also clone corresponding states (ids this state does not care about are inherited from prev_state)
+void HTMLTextLine::State::begin (ostream & out, const State * prev_state)
+{
+ if(prev_state)
+ {
+ long long cur_mask = 0xff; // selects the byte of hash_umask that belongs to id i
+ bool first = true; // true until the first class name has been written
+ for(int i = 0; i < HASH_ID_COUNT; ++i, cur_mask<<=8)
+ {
+ if(hash_umask & cur_mask) // we don't care about this ID
+ {
+ if (prev_state->hash_umask & cur_mask) // if prev_state do not care about it either
+ continue;
+
+ // otherwise
+ // we have to inherit it
+ ids[i] = prev_state->ids[i];
+ hash_umask &= (~cur_mask);
+ //copy the corresponding value
+ //TODO: this is so ugly
+ switch(i)
+ {
+ case FONT_SIZE_ID:
+ font_size = prev_state->font_size;
+ break;
+ case LETTER_SPACE_ID:
+ letter_space = prev_state->letter_space;
+ break;
+ case WORD_SPACE_ID:
+ word_space = prev_state->word_space;
+ break;
+ default: // any other id being masked is reported on cerr
+ cerr << "unexpected state mask" << endl;
+ break;
+ }
+ }
+
+ // now we care about the ID
+
+ // if the value from prev_state is the same, we don't need to dump it
+ if((!(prev_state->hash_umask & cur_mask)) && (prev_state->ids[i] == ids[i]))
+ continue;
+
+ // so we have to dump it
+ if(first)
+ {
+ out << "<span class=\"";
+ first = false;
+ }
+ else
+ {
+ out << ' ';
+ }
+
+ // out should have hex set
+ out << css_class_names[i];
+ if (ids[i] == -1) // -1 is written as CSS::INVALID_ID instead of a number
+ out << CSS::INVALID_ID;
+ else
+ out << ids[i];
+ }
+ // vertical align
+ if(!equal(vertical_align, 0))
+ {
+ // so we have to dump it
+ if(first)
+ {
+ out << "<span class=\"";
+ first = false;
+ }
+ else
+ {
+ out << ' ';
+ }
+
+ // out should have hex set
+ out << CSS::VERTICAL_ALIGN_CN;
+ auto id = ids[VERTICAL_ALIGN_ID];
+ if (id == -1)
+ out << CSS::INVALID_ID;
+ else
+ out << id;
+ }
+
+ if(first) // we actually just inherit the whole prev_state
+ {
+ need_close = false; // no <span> was opened, so end() must not close one
+ }
+ else
+ {
+ out << "\">";
+ need_close = true;
+ }
+ }
+ else
+ {
+ // prev_state == nullptr
+ // which means this is the first state of the line
+ // there should be an open pending <div> left there
+ // it is not necessary to output vertical align
+ long long cur_mask = 0xff;
+ for(int i = 0; i < HASH_ID_COUNT; ++i, cur_mask<<=8)
+ {
+ if(hash_umask & cur_mask) // we don't care about this ID
+ continue;
+
+ // now we care about the ID
+ out << ' ';
+ // out should have hex set
+ out << css_class_names[i];
+ if (ids[i] == -1)
+ out << CSS::INVALID_ID;
+ else
+ out << ids[i];
+ }
+
+ out << "\">"; // finishes the pending opening tag
+ need_close = false;
+ }
+}
+
+// Write the closing </span> for this state, but only when need_close is set.
+void HTMLTextLine::State::end(ostream & out) const
+{
+    if(!need_close)
+        return;
+
+    out << "</span>";
+}
+
+// Pack the low byte of every id into hash_value: byte i of hash_value holds ids[i].
+//
+// This layout has to agree with hash_umask: umask_by_id(id) is 0xff << (8*id), and
+// begin() / diff() walk the masks starting at 0xff for id 0. Packing the ids by
+// shifting the accumulated hash left, as was done before, put ids[0] into the highest
+// used byte instead, so masking hash_value with a umask hid the wrong id.
+void HTMLTextLine::State::hash(void)
+{
+    hash_value = 0;
+    for(int i = 0; i < ID_COUNT; ++i)
+    {
+        // only the low 8 bits of an id take part in the hash
+        hash_value |= (ids[i] & 0xff) << (8 * i);
+    }
+}
+
+// Count the ids that differ between this state and s, looking only at ids that
+// neither state has masked out in hash_umask.
+int HTMLTextLine::State::diff(const State & s) const
+{
+    // ids that both states care about
+    const long long common_mask = ~(hash_umask | s.hash_umask);
+
+    /*
+     * Shortcut based on hash_value.
+     * It may report "no difference" wrongly when there are more than 256 classes,
+     * in which case the output may not be optimal, but is still 'correct' in terms of HTML
+     */
+    const long long mine = hash_value & common_mask;
+    const long long theirs = s.hash_value & common_mask;
+    if(mine == theirs)
+        return 0;
+
+    int count = 0;
+    for(int i = 0; i < ID_COUNT; ++i)
+    {
+        const long long id_mask = umask_by_id(i);
+        if(!(common_mask & id_mask))
+            continue;
+        if(ids[i] != s.ids[i])
+            ++count;
+    }
+    return count;
+}
+
+// Return the mask that covers the byte reserved for `id`: 0xff shifted left by 8*id bits.
+long long HTMLTextLine::State::umask_by_id(int id)
+{
+    const long long byte_mask = 0xff;
+    const int shift = 8 * id;
+    return byte_mask << shift;
+}
+
+// CSS class name prefix for each id; indexed by the id, so the order should be the same as in the enum
+const char * const HTMLTextLine::State::css_class_names [] = {
+ CSS::FONT_FAMILY_CN, // FONT_ID
+ CSS::FONT_SIZE_CN, // FONT_SIZE_ID
+ CSS::FILL_COLOR_CN, // FILL_COLOR_ID
+ CSS::STROKE_COLOR_CN, // STROKE_COLOR_ID
+ CSS::LETTER_SPACE_CN, // LETTER_SPACE_ID
+ CSS::WORD_SPACE_CN, // WORD_SPACE_ID
+ CSS::VERTICAL_ALIGN_CN, // VERTICAL_ALIGN_ID
+};
+
+} //namespace pdf2htmlEX
diff --git a/src/HTMLTextLine.h b/src/HTMLTextLine.h
new file mode 100644
index 0000000..fcce811
--- /dev/null
+++ b/src/HTMLTextLine.h
@@ -0,0 +1,134 @@
+/*
+ * Header file for HTMLTextLine
+ * Copyright (C) 2013 Lu Wang <coolwanglu@gmail.com>
+ */
+#ifndef HTMLTEXTLINE_H__
+#define HTMLTEXTLINE_H__
+
+#include <ostream>
+#include <vector>
+
+#include <CharTypes.h>
+
+#include "Param.h"
+#include "StateManager.h"
+#include "HTMLState.h"
+
+namespace pdf2htmlEX {
+
+/*
+ * Store and optimize a line of text in HTML
+ *
+ * contains a series of
+ * - Text
+ * - Shift
+ * - State change
+ */
+class HTMLTextLine
+{
+public:
+ HTMLTextLine (const HTMLLineState & line_state, const Param & param, AllStateManager & all_manager);
+
+ struct State : public HTMLTextState {
+ // before output: writes the opening tag; prev_state may be nullptr for the first state of a line
+ void begin(std::ostream & out, const State * prev_state);
+ // after output: writes the closing tag if begin() opened one
+ void end(std::ostream & out) const;
+ // calculate the hash code
+ void hash(void);
+ // calculate the difference between another State (number of differing ids)
+ int diff(const State & s) const;
+
+ enum {
+ FONT_ID,
+ FONT_SIZE_ID,
+ FILL_COLOR_ID,
+ STROKE_COLOR_ID,
+ LETTER_SPACE_ID,
+ WORD_SPACE_ID,
+ HASH_ID_COUNT, // number of ids handled by the mask loops in begin()
+
+ VERTICAL_ALIGN_ID = HASH_ID_COUNT,
+ ID_COUNT // total number of ids, including vertical align
+ };
+
+ static long long umask_by_id(int id); // mask of the byte that belongs to `id` in hash_umask
+
+ long long ids[ID_COUNT]; // CSS class id for each of the enum entries above
+
+ size_t start_idx; // index of the first Text using this state
+ // for optimization
+ long long hash_value;
+ long long hash_umask; // some states may not be actually used
+ bool need_close; // set by begin(): whether end() has to write a closing tag
+
+ static const char * const css_class_names []; // class names for each id
+ };
+
+ struct Offset {
+ Offset(size_t size_idx, double width)
+ :start_idx(size_idx),width(width)
+ { }
+ size_t start_idx; // should put this Offset right before text[start_idx];
+ double width;
+ };
+
+ /**
+ * Append a drawn char (glyph)'s unicode. l > 1 means this glyph corresponds to
+ * multiple code points.
+ */
+ void append_unicodes(const Unicode * u, int l, double width);
+ /**
+ * Append a special padding char with 0 width, in order to keep char index consistent.
+ * The padding char is ignored during output.
+ */
+ void append_padding_char() { text.push_back(0); }
+ void append_offset(double width);
+ void append_state(const HTMLTextState & text_state);
+ void dump_text(std::ostream & out);
+
+ bool text_empty(void) const { return text.empty(); }
+ void clear(void);
+
+ void clip(const HTMLClipState &);
+
+ /*
+ * Optimize and calculate necessary values
+ */
+ void prepare(void);
+ void optimize(std::vector<HTMLTextLine*> &); // appends the resulting line(s) to the vector
+private:
+ void optimize_normal(std::vector<HTMLTextLine*> &);
+ void optimize_aggressive(std::vector<HTMLTextLine*> &);
+
+ /**
+ * Dump chars' unicode to output stream.
+ * begin/pos is the index in 'text'.
+ */
+ void dump_chars(std::ostream & out, int begin, int len);
+ void dump_char(std::ostream & out, int pos);
+
+ const Param & param;
+ AllStateManager & all_manager;
+
+ HTMLLineState line_state;
+ double ascent, descent; // vertical extent of the line, computed by prepare()
+ double clip_x1, clip_y1; // minimum corner of the clip box, set by clip()
+ double width;
+
+ std::vector<State> states;
+ std::vector<Offset> offsets;
+
+ /**
+ * Drawn chars (glyph) in this line are stored in 'text'. For each element c in 'text':
+ * - If c > 0, it is the unicode code point corresponds to the glyph;
+ * - If c == 0, it is a padding char, and ignored during output (TODO some bad PDFs utilize 0?);
+ * - If c < -1, this glyph corresponds to more than one unicode code points,
+ * which are stored in 'decomposed_text', and (-c-1) is the index in 'decomposed_text'.
+ * NOTE(review): with the (-c-1) mapping, c == -1 would be index 0, so the condition
+ * is presumably c < 0 -- confirm against append_unicodes.
+ */
+ std::vector<int> text;
+ std::vector<std::vector<Unicode> > decomposed_text;
+};
+
+} // namespace pdf2htmlEX
+#endif //HTMLTEXTLINE_H__
diff --git a/src/HTMLTextPage.cc b/src/HTMLTextPage.cc
new file mode 100644
index 0000000..a8e2ab8
--- /dev/null
+++ b/src/HTMLTextPage.cc
@@ -0,0 +1,147 @@
+/*
+ * HTMLTextPage.cc
+ *
+ * Generate and optimize HTML for one page
+ *
+ * Copyright (C) 2013 Lu Wang <coolwanglu@gmail.com>
+ */
+
+#include "HTMLTextPage.h"
+#include "util/css_const.h"
+
+namespace pdf2htmlEX {
+
+using std::ostream;
+
+HTMLTextPage::HTMLTextPage(const Param & param, AllStateManager & all_manager) // both arguments are stored as references, not copied
+ : param(param)
+ , all_manager(all_manager)
+ , cur_line(nullptr) // no line is open until open_new_line() is called
+ , page_width(0)
+ , page_height(0) // the page size stays 0 x 0 until set_page_size() is called
+{ }
+
+// The page owns its lines: free every HTMLTextLine still referenced by text_lines.
+HTMLTextPage::~HTMLTextPage()
+{
+    for(auto iter = text_lines.begin(); iter != text_lines.end(); ++iter)
+    {
+        delete *iter;
+    }
+}
+
+void HTMLTextPage::dump_text(ostream & out) // optimize (if enabled) and prepare all lines, then write them to out grouped by clip box
+{
+ if(param.optimize_text)
+ {
+ // text lines may be split during optimization, collect them
+ std::vector<HTMLTextLine*> new_text_lines;
+ for(auto p : text_lines)
+ p->optimize(new_text_lines);
+ std::swap(text_lines, new_text_lines); // only lines that optimize() pushed are kept
+ }
+ for(auto p : text_lines)
+ p->prepare();
+ if(param.optimize_text)
+ optimize();
+
+ HTMLClipState page_box; // clip box that covers the whole page
+ page_box.xmin = page_box.ymin = 0;
+ page_box.xmax = page_width;
+ page_box.ymax = page_height;
+
+ //push a dummy entry for convenience
+ clips.emplace_back(page_box, text_lines.size()); // its start_idx is one past the last line, so the loop below flushes the remaining lines
+
+ Clip cur_clip(page_box, 0); // clip in effect for the lines dumped next
+ bool has_clip = false; // whether cur_clip differs from the whole page
+
+ auto text_line_iter = text_lines.begin();
+ for(auto clip_iter = clips.begin(); clip_iter != clips.end(); ++clip_iter)
+ {
+ auto next_text_line_iter = text_lines.begin() + clip_iter->start_idx; // first line that belongs to *clip_iter
+ if(text_line_iter != next_text_line_iter)
+ {
+ const auto & cs = cur_clip.clip_state;
+ if(has_clip)
+ {
+ out << "<div class=\"" << CSS::CLIP_CN
+ << " " << CSS::LEFT_CN << all_manager.left.install(cs.xmin)
+ << " " << CSS::BOTTOM_CN << all_manager.bottom.install(cs.ymin)
+ << " " << CSS::WIDTH_CN << all_manager.width.install(cs.xmax - cs.xmin)
+ << " " << CSS::HEIGHT_CN << all_manager.height.install(cs.ymax - cs.ymin)
+ << "\">";
+ }
+
+ while(text_line_iter != next_text_line_iter) // dump the lines governed by cur_clip
+ {
+ if(has_clip)
+ {
+ (*text_line_iter)->clip(cs);
+ }
+ (*text_line_iter)->dump_text(out);
+ ++text_line_iter;
+ }
+ if(has_clip)
+ {
+ out << "</div>";
+ }
+ }
+
+ {
+ cur_clip = *clip_iter; // this clip applies to the lines up to the next clip's start_idx
+ const auto & cs = cur_clip.clip_state;
+ has_clip = !(equal(0, cs.xmin) && equal(0, cs.ymin)
+ && equal(page_width, cs.xmax) && equal(page_height, cs.ymax)); // a clip equal to the page box needs no wrapper
+ }
+ }
+}
+
+void HTMLTextPage::dump_css(ostream & out) // not implemented yet: nothing is written to out
+{
+ //TODO
+}
+
+// Drop all lines and clips so that the object can be reused for the next page.
+// The page owns the HTMLTextLine objects created by open_new_line() (the destructor
+// deletes them as well), so they have to be freed here; only clearing the vector
+// of raw pointers leaked every line.
+void HTMLTextPage::clear(void)
+{
+    for(auto p : text_lines)
+        delete p;
+    text_lines.clear();
+    clips.clear();
+    // the line cur_line pointed to has just been freed
+    cur_line = nullptr;
+}
+
+// Start a new text line with the given line state and make it the current line.
+void HTMLTextPage::open_new_line(const HTMLLineState & line_state)
+{
+    // never reuse the last text line, even if it is empty:
+    // the clip states may already point to the next index
+    HTMLTextLine * fresh_line = new HTMLTextLine(line_state, param, all_manager);
+    text_lines.push_back(fresh_line);
+    cur_line = fresh_line;
+}
+
+// Store the page dimensions; dump_text() uses them to build the whole-page clip box.
+void HTMLTextPage::set_page_size(double width, double height)
+{
+    page_height = height;
+    page_width = width;
+}
+
+// Register a clip box that applies to all lines opened from now on.
+void HTMLTextPage::clip(const HTMLClipState & clip_state)
+{
+    const auto next_line_idx = text_lines.size();
+
+    // the previous clip box is unused when no line was opened since it was registered
+    const bool last_clip_unused = (!clips.empty()) && (clips.back().start_idx == next_line_idx);
+    if(last_clip_unused)
+    {
+        // overwrite it instead of stacking another entry
+        clips.back().clip_state = clip_state;
+        return;
+    }
+
+    clips.emplace_back(clip_state, next_line_idx);
+}
+
+void HTMLTextPage::optimize(void) // page-level optimization hook called by dump_text(); currently does nothing
+{
+ //TODO
+ //group lines with same x-axis
+ //collect common states
+}
+
+} // namespace pdf2htmlEX
diff --git a/src/HTMLTextPage.h b/src/HTMLTextPage.h
new file mode 100644
index 0000000..ccaa564
--- /dev/null
+++ b/src/HTMLTextPage.h
@@ -0,0 +1,66 @@
+/*
+ * Header file for HTMLTextPage
+ * Copyright (C) 2013 Lu Wang <coolwanglu@gmail.com>
+ */
+
+#ifndef HTMLTEXTPAGE_H__
+#define HTMLTEXTPAGE_H__
+
+#include <vector>
+#include <ostream>
+
+#include "Param.h"
+#include "StateManager.h"
+#include "HTMLTextLine.h"
+#include "HTMLState.h"
+
+namespace pdf2htmlEX {
+
+/*
+ * Store and optimize a page of text in HTML
+ *
+ * contains a series of HTMLTextLine
+ *
+ * The page owns the HTMLTextLine objects it creates through raw pointers (they are
+ * deleted in the destructor). Copying is therefore disabled: a copy would hold the
+ * same pointers and delete them a second time.
+ */
+class HTMLTextPage
+{
+public:
+    HTMLTextPage (const Param & param, AllStateManager & all_manager);
+    ~HTMLTextPage();
+
+    // non-copyable, see the class comment
+    HTMLTextPage (const HTMLTextPage &) = delete;
+    HTMLTextPage & operator = (const HTMLTextPage &) = delete;
+
+    // the line most recently opened by open_new_line(), or nullptr if there is none
+    HTMLTextLine * get_cur_line(void) const { return cur_line; }
+
+    void dump_text(std::ostream & out);
+    void dump_css(std::ostream & out);
+    void clear(void);
+
+    void open_new_line(const HTMLLineState & line_state);
+
+    /* for clipping */
+    void set_page_size(double width, double height);
+    void clip(const HTMLClipState & clip_state);
+
+    // plain getters; const so that they can be used on a const page as well
+    double get_width() const { return page_width; }
+    double get_height() const { return page_height; }
+
+private:
+    void optimize(void);
+
+    const Param & param;
+    AllStateManager & all_manager;
+    HTMLTextLine * cur_line;
+    double page_width, page_height;
+
+    // owned, see the class comment
+    std::vector<HTMLTextLine*> text_lines;
+
+    // a clip box together with the index of the first text line it applies to
+    struct Clip {
+        HTMLClipState clip_state;
+        size_t start_idx;
+        Clip(const HTMLClipState & clip_state, size_t start_idx)
+            :clip_state(clip_state),start_idx(start_idx)
+        { }
+    };
+    std::vector<Clip> clips;
+};
+
+} //namespace pdf2htmlEX
+#endif //HTMLTEXTPAGE_H__
diff --git a/src/Param.h b/src/Param.h
new file mode 100644
index 0000000..84fa426
--- /dev/null
+++ b/src/Param.h
@@ -0,0 +1,87 @@
+/*
+ * Parameters
+ *
+ * Wang Lu
+ * 2012.08.03
+ */
+
+
+#ifndef PARAM_H__
+#define PARAM_H__
+
+#include <string>
+
+namespace pdf2htmlEX {
+
+struct Param
+{
+ // pages
+ int first_page, last_page; // inclusive range of pages to process (see Preprocessor::process)
+
+ // dimensions
+ double zoom;
+ double fit_width, fit_height;
+ int use_cropbox; // Preprocessor::process passes the negation of this flag to PDFDoc::displayPage
+ double h_dpi, v_dpi;
+
+ // output
+ int embed_css;
+ int embed_font;
+ int embed_image;
+ int embed_javascript;
+ int embed_outline;
+ int split_pages;
+ std::string dest_dir;
+ std::string css_filename;
+ std::string page_filename;
+ std::string outline_filename;
+ int process_nontext;
+ int process_outline;
+ int process_annotation;
+ int process_form;
+ int correct_text_visibility;
+ int printing;
+ int fallback;
+ int tmp_file_size_limit;
+
+ // fonts
+ int embed_external_font;
+ std::string font_format;
+ int decompose_ligature;
+ int auto_hint;
+ std::string external_hint_tool;
+ int stretch_narrow_glyph;
+ int squeeze_wide_glyph;
+ int override_fstype;
+ int process_type3;
+
+ // text
+ double h_eps, v_eps;
+ double space_threshold; // multiplied by the em size in HTMLTextLine::optimize_normal to get the minimum width an offset needs to be turned into a space
+ double font_size_multiplier;
+ int space_as_offset;
+ int tounicode;
+ int optimize_text; // 0: no text optimization; 3: HTMLTextLine::optimize_aggressive; any other value: optimize_normal
+
+ // background image
+ std::string bg_format;
+ int svg_node_count_limit;
+ int svg_embed_bitmap;
+
+ // encryption
+ std::string owner_password, user_password;
+ int no_drm;
+
+ // misc.
+ int clean_tmp;
+ std::string data_dir;
+ std::string tmp_dir;
+ int debug;
+ int proof;
+
+ std::string input_filename, output_filename;
+};
+
+} // namespace pdf2htmlEX
+
+#endif //PARAM_H__
diff --git a/src/Preprocessor.cc b/src/Preprocessor.cc
new file mode 100644
index 0000000..a8859ad
--- /dev/null
+++ b/src/Preprocessor.cc
@@ -0,0 +1,107 @@
+/*
+ * Preprocessor.cc
+ *
+ * Check used codes for each font
+ *
+ * by WangLu
+ * 2012.09.07
+ */
+
+#include <cstring>
+#include <iostream>
+#include <algorithm>
+
+#include <GfxState.h>
+#include <GfxFont.h>
+
+#include "Preprocessor.h"
+#include "util/misc.h"
+#include "util/const.h"
+
+namespace pdf2htmlEX {
+
+using std::cerr;
+using std::endl;
+using std::flush;
+using std::max;
+
+Preprocessor::Preprocessor(const Param & param) // param is stored as a reference, not copied
+ : OutputDev()
+ , param(param)
+ , max_width(0)
+ , max_height(0) // grown by startPage() to the largest page size seen
+ , cur_font_id(0)
+ , cur_code_map(nullptr) // no code map is selected until drawChar() sees a font
+{ }
+
+// Free the per-font code maps allocated by drawChar().
+Preprocessor::~Preprocessor(void)
+{
+    for(auto iter = code_maps.begin(); iter != code_maps.end(); ++iter)
+    {
+        delete [] iter->second;
+    }
+}
+
+// Run every page from param.first_page to param.last_page (inclusive) through this
+// output device, printing the progress to cerr.
+void Preprocessor::process(PDFDoc * doc)
+{
+    const int page_count = (param.last_page - param.first_page + 1);
+
+    int page_no = param.first_page;
+    while(page_no <= param.last_page)
+    {
+        // '\r' keeps the progress on a single terminal line
+        cerr << "Preprocessing: " << (page_no-param.first_page) << "/" << page_count << '\r' << flush;
+
+        doc->displayPage(this, page_no, DEFAULT_DPI, DEFAULT_DPI,
+                0,
+                (!(param.use_cropbox)),
+                true,  // crop
+                false, // printing
+                nullptr, nullptr, nullptr, nullptr);
+
+        ++page_no;
+    }
+
+    // final progress line, skipped for a negative page count
+    if(page_count >= 0)
+    {
+        cerr << "Preprocessing: " << page_count << "/" << page_count;
+    }
+    cerr << endl;
+}
+
+// Called for every drawn character: mark `code` as used by the current font.
+// One table of flags is kept per font id; it has 0x10000 entries for CID fonts and
+// 0x100 entries for all other fonts.
+void Preprocessor::drawChar(GfxState *state, double x, double y,
+        double dx, double dy,
+        double originX, double originY,
+        CharCode code, int nBytes, Unicode *u, int uLen)
+{
+    GfxFont * font = state->getFont();
+    if(!font) return;
+
+    // number of entries in the table of this font
+    const CharCode map_len = font->isCIDFont() ? 0x10000 : 0x100;
+
+    long long fn_id = hash_ref(font->getID());
+
+    // cur_code_map is still null before the first lookup; test it explicitly because
+    // the initial cur_font_id (0) could coincide with the id of the first font
+    if((fn_id != cur_font_id) || (cur_code_map == nullptr))
+    {
+        cur_font_id = fn_id;
+        auto p = code_maps.insert(std::make_pair(cur_font_id, (char*)nullptr));
+        if(p.second)
+        {
+            // this is a new font
+            p.first->second = new char [map_len];
+            memset(p.first->second, 0, map_len * sizeof(char));
+        }
+
+        cur_code_map = p.first->second;
+    }
+
+    // the code comes from the (untrusted) PDF and may exceed the table,
+    // ignore such codes instead of writing out of bounds
+    if(code < map_len)
+        cur_code_map[code] = 1;
+}
+
+// Two-argument variant: forwards to the three-argument variant with a null XRef.
+void Preprocessor::startPage(int pageNum, GfxState *state)
+{
+    XRef * const no_xref = nullptr;
+    startPage(pageNum, state, no_xref);
+}
+
+// Track the largest page width and height seen so far; pageNum and xref are not used.
+void Preprocessor::startPage(int pageNum, GfxState *state, XRef * xref)
+{
+    const double cur_width = state->getPageWidth();
+    const double cur_height = state->getPageHeight();
+
+    if(max_width < cur_width)
+        max_width = cur_width;
+    if(max_height < cur_height)
+        max_height = cur_height;
+}
+
+// Look up the table of used codes recorded for font_id.
+// Returns nullptr when no character of that font has been seen.
+const char * Preprocessor::get_code_map (long long font_id) const
+{
+    const auto found = code_maps.find(font_id);
+    if(found != code_maps.end())
+        return found->second;
+    return nullptr;
+}
+
+} // namespace pdf2htmlEX
diff --git a/src/Preprocessor.h b/src/Preprocessor.h
new file mode 100644
index 0000000..5b48e4f
--- /dev/null
+++ b/src/Preprocessor.h
@@ -0,0 +1,66 @@
+/*
+ * Preprocessor.h
+ *
+ * PDF is so complicated that we have to scan twice
+ *
+ * Check used codes for each font
+ * Collect all used link destinations
+ *
+ * by WangLu
+ * 2012.09.07
+ */
+
+
+#ifndef PREPROCESSOR_H__
+#define PREPROCESSOR_H__
+
+#include <unordered_map>
+
+#include <OutputDev.h>
+#include <PDFDoc.h>
+#include <Annot.h>
+#include "Param.h"
+
+namespace pdf2htmlEX {
+
+class Preprocessor : public OutputDev {
+public:
+ Preprocessor(const Param & param);
+ virtual ~Preprocessor(void);
+
+ void process(PDFDoc * doc); // runs the pages param.first_page..param.last_page through this device
+
+ virtual GBool upsideDown() { return gFalse; }
+ virtual GBool useDrawChar() { return gTrue; }
+ virtual GBool interpretType3Chars() { return gFalse; }
+ virtual GBool needNonText() { return gFalse; }
+ virtual GBool needClipToCropBox() { return gTrue; }
+
+ virtual void drawChar(GfxState *state, double x, double y,
+ double dx, double dy,
+ double originX, double originY,
+ CharCode code, int nBytes, Unicode *u, int uLen); // marks `code` as used by the current font
+
+ // Start a page.
+ // UGLY: These 2 versions are for different versions of poppler
+ virtual void startPage(int pageNum, GfxState *state);
+ virtual void startPage(int pageNum, GfxState *state, XRef * xref);
+
+ const char * get_code_map (long long font_id) const; // nullptr if the font was never seen
+ double get_max_width (void) const { return max_width; }
+ double get_max_height (void) const { return max_height; }
+
+protected:
+ const Param & param;
+
+ double max_width, max_height; // largest page size seen by startPage()
+
+ long long cur_font_id; // id of the font used by the last drawChar() lookup
+ char * cur_code_map; // code map of cur_font_id; owned by code_maps
+
+ std::unordered_map<long long, char*> code_maps; // font id -> array of flags indexed by char code; the arrays are freed in the destructor
+};
+
+} // namespace pdf2htmlEX
+
+#endif //PREPROCESSOR_H__
diff --git a/src/StateManager.h b/src/StateManager.h
new file mode 100644
index 0000000..0a19df0
--- /dev/null
+++ b/src/StateManager.h
@@ -0,0 +1,430 @@
+/*
+ * StateManager.h
+ *
+ * manage reusable CSS classes
+ *
+ * Copyright (C) 2013 Lu Wang <coolwanglu@gmail.com>
+ */
+
+#ifndef STATEMANAGER_H__
+#define STATEMANAGER_H__
+
+#include <iostream>
+#include <map>
+#include <unordered_map>
+
+#include "Color.h"
+
+#include "util/math.h"
+#include "util/css_const.h"
+
+namespace pdf2htmlEX {
+
+template<class ValueType, class Imp> class StateManager {}; // primary template is intentionally empty; the functionality lives in the partial specializations for double, Matrix and Color
+
+// Manager for CSS classes whose value is a single double.
+// Imp is the concrete manager deriving from this class; it provides
+// get_css_class_name(), dump_value() and dump_print_value().
+template<class Imp>
+class StateManager<double, Imp>
+{
+public:
+    StateManager()
+        : eps(0)
+        , imp(static_cast<Imp*>(this))
+    { }
+
+    // values no farther than eps are treated as equal
+    void set_eps (double eps) { this->eps = eps; }
+
+    double get_eps (void) const { return eps; }
+
+    // Install new_value into the map and return the corresponding id.
+    // If actual_value_ptr is given, it receives the value that is really stored in
+    // the map, which may be an existing value within eps of new_value.
+    long long install(double new_value, double * actual_value_ptr = nullptr) {
+        auto pos = value_map.lower_bound(new_value - eps);
+        const bool hit = (pos != value_map.end()) && (std::abs(pos->first - new_value) <= eps);
+
+        long long id;
+        if(hit)
+        {
+            id = pos->second;
+        }
+        else
+        {
+            // ids are handed out in order of installation
+            id = value_map.size();
+            pos = value_map.insert(pos, std::make_pair(new_value, id));
+        }
+
+        if(actual_value_ptr != nullptr)
+            *actual_value_ptr = pos->first;
+        return id;
+    }
+
+    // write one CSS rule per installed value
+    void dump_css(std::ostream & out) {
+        for(auto iter = value_map.begin(); iter != value_map.end(); ++iter)
+        {
+            out << "." << imp->get_css_class_name() << iter->second << "{";
+            imp->dump_value(out, iter->first);
+            out << "}" << std::endl;
+        }
+    }
+
+    // same as dump_css, but the values are scaled for the print style sheet
+    void dump_print_css(std::ostream & out, double scale) {
+        for(auto iter = value_map.begin(); iter != value_map.end(); ++iter)
+        {
+            out << "." << imp->get_css_class_name() << iter->second << "{";
+            imp->dump_print_value(out, iter->first, scale);
+            out << "}" << std::endl;
+        }
+    }
+
+protected:
+    double eps;
+    Imp * imp;
+    // value -> id
+    std::map<double, long long> value_map;
+};
+
+// Manager for CSS classes whose value is a transform matrix.
+// Be careful about the mixed usage of Matrix and const double *:
+// the input is usually a double *, whose content might change later, so it is copied;
+// the map stores Matrix objects so that the storage is released automatically.
+template <class Imp>
+class StateManager<Matrix, Imp>
+{
+public:
+    StateManager()
+        : imp(static_cast<Imp*>(this))
+    { }
+
+    // install the matrix pointed to by new_value, return its id
+    long long install(const double * new_value) {
+        Matrix key;
+        memcpy(key.m, new_value, sizeof(key.m));
+
+        auto pos = value_map.lower_bound(key);
+        const bool known = (pos != value_map.end()) && (tm_equal(key.m, pos->first.m, 4));
+        if(known)
+            return pos->second;
+
+        const long long new_id = value_map.size();
+        value_map.insert(pos, std::make_pair(key, new_id));
+        return new_id;
+    }
+
+    // write one CSS rule per installed matrix
+    void dump_css(std::ostream & out) {
+        for(auto iter = value_map.begin(); iter != value_map.end(); ++iter)
+        {
+            out << "." << imp->get_css_class_name() << iter->second << "{";
+            imp->dump_value(out, iter->first);
+            out << "}" << std::endl;
+        }
+    }
+
+    // matrices have no print specific rules
+    void dump_print_css(std::ostream & out, double scale) {}
+
+protected:
+    Imp * imp;
+
+    // lexicographic order on the first 4 elements
+    struct Matrix_less
+    {
+        bool operator () (const Matrix & m1, const Matrix & m2) const
+        {
+            // Note that we only care about the first 4 elements
+            int i = 0;
+            while((i < 4) && !(m1.m[i] < m2.m[i]) && !(m1.m[i] > m2.m[i]))
+                ++i;
+            return (i < 4) && (m1.m[i] < m2.m[i]);
+        }
+    };
+
+    std::map<Matrix, long long, Matrix_less> value_map;
+};
+
+// Manager for CSS classes whose value is a Color.
+template <class Imp>
+class StateManager<Color, Imp>
+{
+public:
+    StateManager()
+        : imp(static_cast<Imp*>(this))
+    { }
+
+    // install new_value, return its id
+    long long install(const Color & new_value) {
+        const auto found = value_map.find(new_value);
+        if(found == value_map.end())
+        {
+            const long long new_id = value_map.size();
+            value_map.insert(std::make_pair(new_value, new_id));
+            return new_id;
+        }
+        return found->second;
+    }
+
+    // write the rule for the invalid id first, then one rule per installed color
+    void dump_css(std::ostream & out) {
+        out << "." << imp->get_css_class_name() << CSS::INVALID_ID << "{";
+        imp->dump_transparent(out);
+        out << "}" << std::endl;
+
+        for(auto iter = value_map.begin(); iter != value_map.end(); ++iter)
+        {
+            out << "." << imp->get_css_class_name() << iter->second << "{";
+            imp->dump_value(out, iter->first);
+            out << "}" << std::endl;
+        }
+    }
+
+    // colors have no print specific rules
+    void dump_print_css(std::ostream & out, double scale) {}
+
+protected:
+    Imp * imp;
+
+    struct Color_hash
+    {
+        size_t operator () (const Color & color) const
+        {
+            if(!color.transparent)
+            {
+                // pack the three 8-bit channels as 0xRRGGBB
+                const size_t r = ((size_t)colToByte(color.rgb.r)) & 0xff;
+                const size_t g = ((size_t)colToByte(color.rgb.g)) & 0xff;
+                const size_t b = ((size_t)colToByte(color.rgb.b)) & 0xff;
+                return ( (r << 16) | (g << 8) | b );
+            }
+            // transparent colors hash to all bits set
+            return (~((size_t)0));
+        }
+    };
+
+    std::unordered_map<Color, long long, Color_hash> value_map;
+};
+
+/////////////////////////////////////
+// Specific state managers
+
+// CSS classes for font-size: px on screen, pt (scaled) for print
+class FontSizeManager : public StateManager<double, FontSizeManager>
+{
+public:
+    static const char * get_css_class_name (void)
+    {
+        return CSS::FONT_SIZE_CN;
+    }
+    double default_value(void)
+    {
+        return 0;
+    }
+    void dump_value(std::ostream & out, double value)
+    {
+        out << "font-size:";
+        out << round(value) << "px;";
+    }
+    void dump_print_value(std::ostream & out, double value, double scale)
+    {
+        out << "font-size:";
+        out << round(value*scale) << "pt;";
+    }
+};
+
+// CSS classes for letter-spacing: px on screen, pt (scaled) for print
+class LetterSpaceManager : public StateManager<double, LetterSpaceManager>
+{
+public:
+    static const char * get_css_class_name (void)
+    {
+        return CSS::LETTER_SPACE_CN;
+    }
+    double default_value(void)
+    {
+        return 0;
+    }
+    void dump_value(std::ostream & out, double value)
+    {
+        out << "letter-spacing:";
+        out << round(value) << "px;";
+    }
+    void dump_print_value(std::ostream & out, double value, double scale)
+    {
+        out << "letter-spacing:";
+        out << round(value*scale) << "pt;";
+    }
+};
+
+// CSS classes for word-spacing: px on screen, pt (scaled) for print
+class WordSpaceManager : public StateManager<double, WordSpaceManager>
+{
+public:
+    static const char * get_css_class_name (void)
+    {
+        return CSS::WORD_SPACE_CN;
+    }
+    double default_value(void)
+    {
+        return 0;
+    }
+    void dump_value(std::ostream & out, double value)
+    {
+        out << "word-spacing:";
+        out << round(value) << "px;";
+    }
+    void dump_print_value(std::ostream & out, double value, double scale)
+    {
+        out << "word-spacing:";
+        out << round(value*scale) << "pt;";
+    }
+};
+
+// CSS classes for vertical-align: px on screen, pt (scaled) for print
+class VerticalAlignManager : public StateManager<double, VerticalAlignManager>
+{
+public:
+    static const char * get_css_class_name (void)
+    {
+        return CSS::VERTICAL_ALIGN_CN;
+    }
+    double default_value(void)
+    {
+        return 0;
+    }
+    void dump_value(std::ostream & out, double value)
+    {
+        out << "vertical-align:";
+        out << round(value) << "px;";
+    }
+    void dump_print_value(std::ostream & out, double value, double scale)
+    {
+        out << "vertical-align:";
+        out << round(value*scale) << "pt;";
+    }
+};
+
+// CSS classes for whitespace: a positive value is written as width,
+// any other value as margin-left
+class WhitespaceManager : public StateManager<double, WhitespaceManager>
+{
+public:
+    static const char * get_css_class_name (void)
+    {
+        return CSS::WHITESPACE_CN;
+    }
+    double default_value(void)
+    {
+        return 0;
+    }
+    void dump_value(std::ostream & out, double value)
+    {
+        const char * property = "margin-left:";
+        if(value > 0)
+            property = "width:";
+        out << property << round(value) << "px;";
+    }
+    void dump_print_value(std::ostream & out, double value, double scale)
+    {
+        // the property is chosen by the sign of the scaled value
+        const double scaled = value * scale;
+        const char * property = "margin-left:";
+        if(scaled > 0)
+            property = "width:";
+        out << property << round(scaled) << "pt;";
+    }
+};
+
+// CSS classes for width: px on screen, pt (scaled) for print
+class WidthManager : public StateManager<double, WidthManager>
+{
+public:
+    static const char * get_css_class_name (void)
+    {
+        return CSS::WIDTH_CN;
+    }
+    double default_value(void)
+    {
+        return 0;
+    }
+    void dump_value(std::ostream & out, double value)
+    {
+        out << "width:";
+        out << round(value) << "px;";
+    }
+    void dump_print_value(std::ostream & out, double value, double scale)
+    {
+        out << "width:";
+        out << round(value*scale) << "pt;";
+    }
+};
+
+/*
+ * CSS state manager for `bottom`.
+ * Supplies the class-name prefix, the default value and the CSS text
+ * for a value to the StateManager base it derives from.
+ */
+class BottomManager : public StateManager<double, BottomManager>
+{
+public:
+    // class-name prefix of the generated rules
+    static const char * get_css_class_name (void)
+    {
+        return CSS::BOTTOM_CN;
+    }
+
+    // default bottom offset: 0
+    double default_value(void)
+    {
+        return 0;
+    }
+
+    // screen declaration: value passed through round(), unit px
+    void dump_value(std::ostream & out, double value)
+    {
+        out << "bottom:";
+        out << round(value);
+        out << "px;";
+    }
+
+    // print declaration: value multiplied by scale, passed through round(), unit pt
+    void dump_print_value(std::ostream & out, double value, double scale)
+    {
+        const double scaled = value * scale;
+        out << "bottom:";
+        out << round(scaled);
+        out << "pt;";
+    }
+};
+
+/*
+ * CSS state manager for `height`.
+ * Supplies the class-name prefix, the default value and the CSS text
+ * for a value to the StateManager base it derives from.
+ */
+class HeightManager : public StateManager<double, HeightManager>
+{
+public:
+    // class-name prefix of the generated rules
+    static const char * get_css_class_name (void)
+    {
+        return CSS::HEIGHT_CN;
+    }
+
+    // default height: 0
+    double default_value(void)
+    {
+        return 0;
+    }
+
+    // screen declaration: value passed through round(), unit px
+    void dump_value(std::ostream & out, double value)
+    {
+        out << "height:";
+        out << round(value);
+        out << "px;";
+    }
+
+    // print declaration: value multiplied by scale, passed through round(), unit pt
+    void dump_print_value(std::ostream & out, double value, double scale)
+    {
+        const double scaled = value * scale;
+        out << "height:";
+        out << round(scaled);
+        out << "pt;";
+    }
+};
+
+/*
+ * CSS state manager for `left`.
+ * Supplies the class-name prefix, the default value and the CSS text
+ * for a value to the StateManager base it derives from.
+ */
+class LeftManager : public StateManager<double, LeftManager>
+{
+public:
+    // class-name prefix of the generated rules
+    static const char * get_css_class_name (void)
+    {
+        return CSS::LEFT_CN;
+    }
+
+    // default left offset: 0
+    double default_value(void)
+    {
+        return 0;
+    }
+
+    // screen declaration: value passed through round(), unit px
+    void dump_value(std::ostream & out, double value)
+    {
+        out << "left:";
+        out << round(value);
+        out << "px;";
+    }
+
+    // print declaration: value multiplied by scale, passed through round(), unit pt
+    void dump_print_value(std::ostream & out, double value, double scale)
+    {
+        const double scaled = value * scale;
+        out << "left:";
+        out << round(scaled);
+        out << "pt;";
+    }
+};
+
+/*
+ * CSS state manager for text transformation matrices.
+ * Every declaration is written three times: unprefixed, "-ms-" and "-webkit-".
+ */
+class TransformMatrixManager : public StateManager<Matrix, TransformMatrixManager>
+{
+public:
+    // class-name prefix of the generated rules
+    static const char * get_css_class_name (void)
+    {
+        return CSS::TRANSFORM_MATRIX_CN;
+    }
+
+    // default value: the identity matrix
+    const double * default_value(void)
+    {
+        return ID_MATRIX;
+    }
+
+    /*
+     * Write the transform declarations for one matrix.
+     *
+     * Only the first four entries are used; entries 4 and 5 are always
+     * written as 0 because the origin has already been shifted.
+     * TODO: recognize common matrices
+     */
+    void dump_value(std::ostream & out, const Matrix & matrix)
+    {
+        const auto & tm = matrix.m;
+        const bool is_identity = tm_equal(tm, ID_MATRIX, 4);
+
+        for(const char * prefix : {"", "-ms-", "-webkit-"})
+        {
+            out << prefix << "transform:";
+            if(is_identity)
+            {
+                out << "none;";
+                continue;
+            }
+
+            // PDF uses a different coordinate system from the Web,
+            // hence the negated off-diagonal entries
+            out << "matrix("
+                << round(tm[0]) << ','
+                << round(-tm[1]) << ','
+                << round(-tm[2]) << ','
+                << round(tm[3]) << ','
+                << "0,0);";
+        }
+    }
+};
+
+/*
+ * CSS state manager for fill colors; each color becomes a `color` rule.
+ */
+class FillColorManager : public StateManager<Color, FillColorManager>
+{
+public:
+    // class-name prefix of the generated rules
+    static const char * get_css_class_name (void)
+    {
+        return CSS::FILL_COLOR_CN;
+    }
+
+    /* override base's method, as we need some workaround in CSS */
+    void dump_css(std::ostream & out)
+    {
+        const char * const cn = get_css_class_name();
+        for(auto & entry : value_map)
+        {
+            // entry.first is the color, entry.second the suffix of the class name
+            out << "." << cn << entry.second;
+            out << "{color:" << entry.first << ";}" << std::endl;
+        }
+    }
+};
+
+/*
+ * CSS state manager for stroke colors.
+ * Strokes are emulated with four text-shadows; inside the WebKit-only
+ * media query `-webkit-text-stroke` is used instead.
+ */
+class StrokeColorManager : public StateManager<Color, StrokeColorManager>
+{
+public:
+    // class-name prefix of the generated rules
+    static const char * get_css_class_name (void)
+    {
+        return CSS::STROKE_COLOR_CN;
+    }
+
+    /* override base's method, as we need some workaround in CSS */
+    void dump_css(std::ostream & out)
+    {
+        // TODO: take the stroke width from the graphics state,
+        // currently using 0.015em as a good default
+        const char * const shadow_offsets[4] = {
+            "-0.015em 0 ", "0 0.015em ", "0.015em 0 ", "0 -0.015em "
+        };
+        const char * const cn = get_css_class_name();
+
+        // normal CSS
+        out << "." << cn << CSS::INVALID_ID << "{text-shadow:none;}" << std::endl;
+        for(auto & entry : value_map)
+        {
+            out << "." << cn << entry.second << "{text-shadow:";
+            for(int i = 0; i < 4; ++i)
+                out << shadow_offsets[i] << entry.first << ((i < 3) ? "," : ";");
+            out << "}" << std::endl;
+        }
+
+        // webkit
+        out << CSS::WEBKIT_ONLY << "{" << std::endl;
+        out << "." << cn << CSS::INVALID_ID << "{-webkit-text-stroke:0px transparent;}" << std::endl;
+        for(auto & entry : value_map)
+        {
+            out << "." << cn << entry.second;
+            out << "{-webkit-text-stroke:0.015em " << entry.first << ";text-shadow:none;}" << std::endl;
+        }
+        out << "}" << std::endl;
+    }
+};
+
+/////////////////////////////////////
+/*
+ * Manage the background image sizes
+ *
+ * We don't merge similar values, since they are bound with PAGE_CONTENT_BOX_number
+ */
+class BGImageSizeManager
+{
+public:
+    // Record the background size of a page; a page number that is
+    // already present keeps its first size (insert does not overwrite).
+    void install(int page_no, double width, double height)
+    {
+        const std::pair<double,double> size(width, height);
+        value_map.insert(std::make_pair(page_no, size));
+    }
+
+    // One `background-size` rule per recorded page, in px.
+    void dump_css(std::ostream & out)
+    {
+        for(auto & entry : value_map)
+        {
+            const double w = entry.second.first;
+            const double h = entry.second.second;
+            out << "." << CSS::PAGE_CONTENT_BOX_CN << entry.first << "{"
+                << "background-size:" << round(w) << "px " << round(h) << "px;"
+                << "}" << std::endl;
+        }
+    }
+
+    // Same as dump_css, with both dimensions multiplied by scale and written in pt.
+    void dump_print_css(std::ostream & out, double scale)
+    {
+        for(auto & entry : value_map)
+        {
+            const double w = entry.second.first * scale;
+            const double h = entry.second.second * scale;
+            out << "." << CSS::PAGE_CONTENT_BOX_CN << entry.first << "{"
+                << "background-size:" << round(w) << "pt " << round(h) << "pt;"
+                << "}" << std::endl;
+        }
+    }
+
+private:
+    // page number -> (width, height)
+    std::unordered_map<int, std::pair<double,double>> value_map;
+};
+
+struct AllStateManager // groups one instance of each CSS state manager so they can be passed around together
+{
+ TransformMatrixManager transform_matrix;
+ VerticalAlignManager vertical_align;
+ StrokeColorManager stroke_color;
+ LetterSpaceManager letter_space;
+ WhitespaceManager whitespace;
+ WordSpaceManager word_space;
+ FillColorManager fill_color;
+ FontSizeManager font_size;
+ BottomManager bottom;
+ HeightManager height;
+ WidthManager width;
+ LeftManager left;
+ BGImageSizeManager bgimage_size; // per-page background sizes; not a StateManager subclass
+};
+
+} // namespace pdf2htmlEX
+
+#endif //STATEMANAGER_H__
diff --git a/src/StringFormatter.cc b/src/StringFormatter.cc
new file mode 100644
index 0000000..b361c2d
--- /dev/null
+++ b/src/StringFormatter.cc
@@ -0,0 +1,30 @@
+#include <cstdarg>
+#include <algorithm>
+#include <cassert>
+
+#include "StringFormatter.h"
+
+namespace pdf2htmlEX {
+
+/*
+ * printf-style formatting into the formatter's single shared buffer.
+ *
+ * format, ... : as for printf
+ *
+ * Returns a GuardedPointer that converts to the formatted C string.
+ * The buffer must not be reused while a previous GuardedPointer is alive
+ * (checked with an assertion).
+ */
+StringFormatter::GuardedPointer StringFormatter::operator () (const char * format, ...)
+{
+    assert((buf_cnt == 0) && "StringFormatter: buffer is reused!");
+
+    // Storage that was only reserve()d is not part of the vector yet;
+    // writing to it (or calling front() on an empty vector) is undefined.
+    // Make all reserved storage real elements before handing it to vsnprintf.
+    if(buf.size() < buf.capacity())
+        buf.resize(buf.capacity());
+    if(buf.empty())
+        buf.resize(1);
+
+    va_list vlist;
+    va_start(vlist, format);
+    int l = vsnprintf(buf.data(), buf.size(), format, vlist);
+    va_end(vlist);
+    if(l >= (int)buf.size())
+    {
+        // truncated: grow to the required length (at least doubling) and retry
+        buf.resize(std::max<std::size_t>((std::size_t)l + 1, buf.size() * 2));
+        va_start(vlist, format);
+        l = vsnprintf(buf.data(), buf.size(), format, vlist);
+        va_end(vlist);
+    }
+    assert(l >= 0); // we should fail when vsnprintf fail
+    assert(l < (int)buf.size());
+    if(l < 0)
+    {
+        // with assertions compiled out, fall back to an empty string
+        buf[0] = '\0';
+    }
+    return GuardedPointer(this);
+}
+
+} //namespace pdf2htmlEX
+
diff --git a/src/StringFormatter.h b/src/StringFormatter.h
new file mode 100644
index 0000000..dd3f3c1
--- /dev/null
+++ b/src/StringFormatter.h
@@ -0,0 +1,43 @@
+/*
+ * Buffer reusing string formatter
+ *
+ * by WangLu
+ * 2012.11.29
+ */
+
+#ifndef STRINGFORMATTER_H__
+#define STRINGFORMATTER_H__
+
+#include <vector>
+#include <cstdio>
+
+namespace pdf2htmlEX {
+
+/*
+ * printf-style formatter that reuses one internal buffer.
+ *
+ * operator() formats into the buffer and returns a GuardedPointer;
+ * buf_cnt counts the live GuardedPointers so that reuse of the buffer
+ * while a previous result is still referenced can be detected.
+ */
+class StringFormatter
+{
+public:
+    /*
+     * Handle to the formatted string.
+     * Increments the owner's buf_cnt while alive and converts to char*.
+     */
+    struct GuardedPointer
+    {
+        GuardedPointer(StringFormatter * sf) : sf(sf) { ++(sf->buf_cnt); }
+        GuardedPointer(const GuardedPointer & gp) : sf(gp.sf) { ++(sf->buf_cnt); }
+        // keep the counters balanced on assignment as well
+        // (increment first, so self-assignment is harmless)
+        GuardedPointer & operator = (const GuardedPointer & gp)
+        {
+            ++(gp.sf->buf_cnt);
+            --(sf->buf_cnt);
+            sf = gp.sf;
+            return *this;
+        }
+        ~GuardedPointer(void) { --(sf->buf_cnt); }
+        operator char* () const { return &(sf->buf.front()); }
+    private:
+        StringFormatter * sf;
+    };
+
+    // Start with L_tmpnam zero-initialised bytes: the buffer is never empty,
+    // so front() is always valid and initially yields an empty string.
+    StringFormatter() : buf(L_tmpnam), buf_cnt(0) { }
+    /*
+     * Important:
+     * there is only one buffer, so new strings will replace old ones
+     */
+    GuardedPointer operator () (const char * format, ...);
+
+private:
+    friend struct GuardedPointer;
+    std::vector<char> buf;
+    int buf_cnt;
+};
+
+} //namespace pdf2htmlEX
+#endif //STRINGFORMATTER_H__
diff --git a/src/TmpFiles.cc b/src/TmpFiles.cc
new file mode 100644
index 0000000..1184548
--- /dev/null
+++ b/src/TmpFiles.cc
@@ -0,0 +1,77 @@
+/*
+ * TmpFiles.cc
+ *
+ * Collect and clean-up temporary files
+ *
+ * implemented by WangLu
+ * split off by Filodej <philodej@gmail.com>
+ */
+
+#include <iostream>
+#include <cstdio>
+#include <sys/stat.h>
+#include <unistd.h>
+
+#include "TmpFiles.h"
+#include "Param.h"
+
+#ifdef __MINGW32__
+#include "util/mingw.h"
+#endif
+
+using namespace std;
+
+namespace pdf2htmlEX {
+
+// Keep a reference to the configuration; no files are registered yet.
+TmpFiles::TmpFiles(const Param & param)
+    : param(param)
+{
+}
+
+// Remove the registered files on destruction (clean() checks param.clean_tmp itself).
+TmpFiles::~TmpFiles()
+{
+    this->clean();
+}
+
+// Register fn for removal in clean(). Does nothing when param.clean_tmp is off;
+// in debug mode the first registration of a name is logged to stderr.
+void TmpFiles::add( const string & fn)
+{
+    if(param.clean_tmp)
+    {
+        const bool newly_added = tmp_files.insert(fn).second;
+        if(newly_added && param.debug)
+        {
+            cerr << "Add new temporary file: " << fn << endl;
+        }
+    }
+}
+
+// Return the total size of the temporary files in bytes.
+// Files that cannot be stat()ed (e.g. registered but not created, or already
+// removed) are skipped instead of contributing an undefined size.
+double TmpFiles::get_total_size() const
+{
+    double total_size = 0;
+    for(auto & fn : tmp_files)
+    {
+        struct stat st;
+        // on failure the contents of st are not meaningful
+        if(stat(fn.c_str(), &st) != 0)
+            continue;
+        total_size += st.st_size;
+    }
+
+    return total_size;
+}
+
+
+// Remove every registered file and then the temporary directory itself.
+// Nothing is done when param.clean_tmp is off. The results of remove()/rmdir()
+// are not checked; the debug messages are printed regardless of the outcome.
+void TmpFiles::clean()
+{
+    if(param.clean_tmp)
+    {
+        for(const auto & path : tmp_files)
+        {
+            remove(path.c_str());
+            if(param.debug)
+            {
+                cerr << "Remove temporary file: " << path << endl;
+            }
+        }
+
+        rmdir(param.tmp_dir.c_str());
+        if(param.debug)
+        {
+            cerr << "Remove temporary directory: " << param.tmp_dir << endl;
+        }
+    }
+}
+
+} // namespace pdf2htmlEX
+
diff --git a/src/TmpFiles.h b/src/TmpFiles.h
new file mode 100644
index 0000000..277281d
--- /dev/null
+++ b/src/TmpFiles.h
@@ -0,0 +1,28 @@
+#ifndef TMPFILES_H__
+#define TMPFILES_H__
+
+#include <string>
+#include <set>
+#include "Param.h"
+
+namespace pdf2htmlEX {
+
+class TmpFiles // collects temporary file names and removes them (and the temp dir) on destruction
+{
+public:
+ explicit TmpFiles( const Param& param ); // stores a reference only: param must outlive this object
+ ~TmpFiles(); // calls clean()
+
+ void add( const std::string& fn); // register fn for removal; ignored unless param.clean_tmp is set
+ double get_total_size() const; // sum of st_size over the registered files, in bytes
+
+private:
+ void clean(); // removes registered files and param.tmp_dir when param.clean_tmp is set
+
+ const Param& param; // configuration (clean_tmp, debug, tmp_dir are read)
+ std::set<std::string> tmp_files; // registered names, duplicates collapse
+};
+
+} // namespace pdf2htmlEX
+
+#endif //TMPFILES_H__
diff --git a/src/css_class_names.cmakelists.txt b/src/css_class_names.cmakelists.txt
new file mode 100644
index 0000000..067d95a
--- /dev/null
+++ b/src/css_class_names.cmakelists.txt
@@ -0,0 +1,39 @@
+# vim: filetype=cmake :
+# CSS class names
+
+# Note:
+# do not use the single letters p, f or s as class names: once an ID suffix is
+# appended they would collide with the two-letter names below (e.g. "pf", "fc", "sc")
+
+set(CSS_INVALID_ID "_") # suffix used instead of a numeric ID for the special "invalid" value
+
+set(CSS_LINE_CN "t") # Text
+set(CSS_TRANSFORM_MATRIX_CN "m") # Matrix
+set(CSS_CLIP_CN "c") # Clip
+
+set(CSS_PAGE_FRAME_CN "pf") # Page Frame; NOTE(review): css_const.h.in also expands @CSS_PAGE_DECORATION_CN@, which is not set in this file - confirm where it is defined
+set(CSS_PAGE_CONTENT_BOX_CN "pc") # Page Content
+set(CSS_PAGE_DATA_CN "pi") # Page Info
+
+set(CSS_BACKGROUND_IMAGE_CN "bi") # Background Image
+set(CSS_FULL_BACKGROUND_IMAGE_CN "bf") # Background image (Full)
+
+set(CSS_FONT_FAMILY_CN "ff") # Font Family
+set(CSS_FONT_SIZE_CN "fs") # Font Size
+
+set(CSS_FILL_COLOR_CN "fc") # Fill Color
+set(CSS_STROKE_COLOR_CN "sc") # Stroke Color
+
+set(CSS_LETTER_SPACE_CN "ls") # Letter Space
+set(CSS_WORD_SPACE_CN "ws") # Word Space
+set(CSS_VERTICAL_ALIGN_CN "v") # Vertical align
+set(CSS_WHITESPACE_CN "_") # whitespace (same string as CSS_INVALID_ID)
+set(CSS_LEFT_CN "x") # X
+set(CSS_HEIGHT_CN "h") # Height
+set(CSS_WIDTH_CN "w") # Width
+set(CSS_BOTTTOM_CN "y") # Y (sic: "BOTTTOM" has three T's; css_const.h.in references the same spelling, so rename both together or neither)
+set(CSS_CSS_DRAW_CN "d") # Draw
+set(CSS_LINK_CN "l") # Link
+set(CSS_INPUT_TEXT_CN "it") # Text input
+set(CSS_INPUT_RADIO_CN "ir") # Radio button
+set(CSS_RADIO_CHECKED_CN "checked") # Show picture of checked out radio button
diff --git a/src/pdf2htmlEX-config.h.in b/src/pdf2htmlEX-config.h.in
new file mode 100644
index 0000000..7c9b510
--- /dev/null
+++ b/src/pdf2htmlEX-config.h.in
@@ -0,0 +1,24 @@
+/*
+ * config.h
+ * Compile time constants
+ *
+ * Copyright (C) 2012-2014 Lu Wang <coolwanglu@gmail.com>
+ */
+
+
+#ifndef PDF2HTMLEX_CONFIG_H__
+#define PDF2HTMLEX_CONFIG_H__
+
+#include <string>
+
+#define ENABLE_SVG @ENABLE_SVG@
+
+namespace pdf2htmlEX { // the @NAME@ placeholders are substituted when this template is configured (presumably by CMake configure_file - confirm)
+
+static const std::string PDF2HTMLEX_VERSION = "@PDF2HTMLEX_VERSION@"; // version string printed by --version
+static const std::string PDF2HTMLEX_PREFIX = "@CMAKE_INSTALL_PREFIX@"; // installation prefix
+static const std::string PDF2HTMLEX_DATA_PATH = "@CMAKE_INSTALL_PREFIX@""/share/pdf2htmlEX"; // adjacent literals concatenate: <prefix>/share/pdf2htmlEX
+
+} // namespace pdf2htmlEX
+
+#endif //PDF2HTMLEX_CONFIG_H__
diff --git a/src/pdf2htmlEX.cc b/src/pdf2htmlEX.cc
new file mode 100644
index 0000000..b56e8e9
--- /dev/null
+++ b/src/pdf2htmlEX.cc
@@ -0,0 +1,445 @@
+// pdf2htmlEX.cc
+//
+// Copyright (C) 2012-2015 Lu Wang <coolwanglu@gmail.com>
+
+#include <cstdio>
+#include <cstdlib>
+#include <cstddef>
+#include <cstring>
+#include <ctime>
+#include <string>
+#include <limits>
+#include <iostream>
+#include <memory>
+#include <errno.h>
+
+#include <getopt.h>
+
+#include <poppler-config.h>
+#include <goo/GooString.h>
+
+#include <Object.h>
+#include <PDFDoc.h>
+#include <PDFDocFactory.h>
+#include <GlobalParams.h>
+
+#include "pdf2htmlEX-config.h"
+
+#if ENABLE_SVG
+#include <cairo.h>
+#endif
+
+#include "ArgParser.h"
+#include "Param.h"
+#include "HTMLRenderer/HTMLRenderer.h"
+
+#include "util/path.h"
+#include "util/ffw.h"
+
+#ifdef __MINGW32__
+#include "util/mingw.h"
+#endif
+
+using namespace std;
+using namespace pdf2htmlEX;
+
+Param param; // global configuration, filled in by parse_options() and adjusted by check_param()
+ArgParser argparser; // global option parser, set up in parse_options() and used for the usage text
+
+// Print the synopsis and the option list to stderr, then terminate with
+// EXIT_FAILURE. The argument is unused; it only gives the function the
+// signature expected of an option callback.
+void show_usage_and_exit(const char * dummy = nullptr)
+{
+    (void)dummy;
+
+    cerr << "Usage: pdf2htmlEX [options] <input.pdf> [<output.html>]" << endl;
+    argparser.show_usage(cerr);
+
+    exit(EXIT_FAILURE);
+}
+
+// Print version, copyright, library versions, the default data directory and
+// the compiled-in image formats to stderr, then terminate with EXIT_SUCCESS.
+// The argument is unused.
+void show_version_and_exit(const char * dummy = nullptr)
+{
+    (void)dummy;
+
+    // program banner
+    cerr << "pdf2htmlEX version " << PDF2HTMLEX_VERSION << endl;
+    cerr << "Copyright 2012-2015 Lu Wang <coolwanglu@gmail.com> and other contributors" << endl;
+
+    // linked libraries
+    cerr << "Libraries: " << endl;
+    cerr << " poppler " << POPPLER_VERSION << endl;
+    cerr << " libfontforge " << ffw_get_version() << endl;
+#if ENABLE_SVG
+    cerr << " cairo " << cairo_version_string() << endl;
+#endif
+
+    cerr << "Default data-dir: " << param.data_dir << endl;
+
+    // background image formats available in this build
+    cerr << "Supported image format:";
+#ifdef ENABLE_LIBPNG
+    cerr << " png";
+#endif
+#ifdef ENABLE_LIBJPEG
+    cerr << " jpg";
+#endif
+#if ENABLE_SVG
+    cerr << " svg";
+#endif
+    cerr << endl;
+
+    cerr << endl;
+
+    exit(EXIT_SUCCESS);
+}
+
+// Parse the argument of --embed: one letter per element type
+// (c=css, f=font, i=image, j=javascript, o=outline); a lower-case letter
+// clears the corresponding embed flag, an upper-case letter sets it.
+// Unknown letters are reported on stderr and otherwise ignored.
+void embed_parser (const char * str)
+{
+    for(const char * cur = str; *cur != '\0'; ++cur)
+    {
+        switch(*cur)
+        {
+            case 'c': param.embed_css = 0; break;
+            case 'C': param.embed_css = 1; break;
+
+            case 'f': param.embed_font = 0; break;
+            case 'F': param.embed_font = 1; break;
+
+            case 'i': param.embed_image = 0; break;
+            case 'I': param.embed_image = 1; break;
+
+            case 'j': param.embed_javascript = 0; break;
+            case 'J': param.embed_javascript = 1; break;
+
+            case 'o': param.embed_outline = 0; break;
+            case 'O': param.embed_outline = 1; break;
+
+            default:
+                cerr << "Unknown character `" << (*cur) << "` for --embed" << endl;
+                break;
+        }
+    }
+}
+
+// Create a private temporary directory "<param.tmp_dir>/pdf2htmlEX-XXXXXX"
+// with mkdtemp() and store its actual name back into param.tmp_dir.
+// On failure the error is reported on stderr and the program exits.
+void prepare_directories()
+{
+    std::string tmp_dir = param.tmp_dir + "/pdf2htmlEX-XXXXXX";
+
+    errno = 0;
+
+    // mkdtemp() rewrites the template in place, so it needs a writable copy.
+    // The array form of unique_ptr is required: the buffer comes from new[]
+    // and must be released with delete[].
+    unique_ptr<char[]> pBuf(new char[tmp_dir.size() + 1]);
+    strcpy(pBuf.get(), tmp_dir.c_str());
+    auto p = mkdtemp(pBuf.get());
+    if(p == nullptr)
+    {
+        const char * errmsg = strerror(errno);
+        if(!errmsg)
+        {
+            errmsg = "unknown error";
+        }
+        cerr << "Cannot create temp directory: " << errmsg << endl;
+        exit(EXIT_FAILURE);
+    }
+    param.tmp_dir = pBuf.get();
+}
+
+void parse_options (int argc, char **argv) // registers every option with its target in `param` and its default, then parses argv; exits with failure on a parse error
+{
+ argparser
+ // pages
+ .add("first-page,f", &param.first_page, 1, "first page to convert")
+ .add("last-page,l", &param.last_page, numeric_limits<int>::max(), "last page to convert")
+
+ // dimensions
+ .add("zoom", &param.zoom, 0, "zoom ratio", true)
+ .add("fit-width", &param.fit_width, 0, "fit width to <fp> pixels", true)
+ .add("fit-height", &param.fit_height, 0, "fit height to <fp> pixels", true)
+ .add("use-cropbox", &param.use_cropbox, 1, "use CropBox instead of MediaBox")
+ .add("hdpi", &param.h_dpi, 144.0, "horizontal resolution for graphics in DPI")
+ .add("vdpi", &param.v_dpi, 144.0, "vertical resolution for graphics in DPI")
+
+ // output files
+ .add("embed", "specify which elements should be embedded into output", embed_parser, true)
+ .add("embed-css", &param.embed_css, 1, "embed CSS files into output")
+ .add("embed-font", &param.embed_font, 1, "embed font files into output")
+ .add("embed-image", &param.embed_image, 1, "embed image files into output")
+ .add("embed-javascript", &param.embed_javascript, 1, "embed JavaScript files into output")
+ .add("embed-outline", &param.embed_outline, 1, "embed outlines into output")
+ .add("split-pages", &param.split_pages, 0, "split pages into separate files")
+ .add("dest-dir", &param.dest_dir, ".", "specify destination directory")
+ .add("css-filename", &param.css_filename, "", "filename of the generated css file")
+ .add("page-filename", &param.page_filename, "", "filename template for split pages ")
+ .add("outline-filename", &param.outline_filename, "", "filename of the generated outline file")
+ .add("process-nontext", &param.process_nontext, 1, "render graphics in addition to text")
+ .add("process-outline", &param.process_outline, 1, "show outline in HTML")
+ .add("process-annotation", &param.process_annotation, 0, "show annotation in HTML")
+ .add("process-form", &param.process_form, 0, "include text fields and radio buttons")
+ .add("printing", &param.printing, 1, "enable printing support")
+ .add("fallback", &param.fallback, 0, "output in fallback mode")
+ .add("tmp-file-size-limit", &param.tmp_file_size_limit, -1, "Maximum size (in KB) used by temporary files, -1 for no limit.")
+
+ // fonts
+ .add("embed-external-font", &param.embed_external_font, 1, "embed local match for external fonts")
+ .add("font-format", &param.font_format, "woff", "suffix for embedded font files (ttf,otf,woff,svg)")
+ .add("decompose-ligature", &param.decompose_ligature, 0, "decompose ligatures, such as \uFB01 -> fi")
+ .add("auto-hint", &param.auto_hint, 0, "use fontforge autohint on fonts without hints")
+ .add("external-hint-tool", &param.external_hint_tool, "", "external tool for hinting fonts (overrides --auto-hint)")
+ .add("stretch-narrow-glyph", &param.stretch_narrow_glyph, 0, "stretch narrow glyphs instead of padding them")
+ .add("squeeze-wide-glyph", &param.squeeze_wide_glyph, 1, "shrink wide glyphs instead of truncating them")
+ .add("override-fstype", &param.override_fstype, 0, "clear the fstype bits in TTF/OTF fonts")
+ .add("process-type3", &param.process_type3, 0, "convert Type 3 fonts for web (experimental)")
+
+ // text
+ .add("heps", &param.h_eps, 1.0, "horizontal threshold for merging text, in pixels")
+ .add("veps", &param.v_eps, 1.0, "vertical threshold for merging text, in pixels")
+ .add("space-threshold", &param.space_threshold, (1.0/8), "word break threshold (threshold * em)")
+ .add("font-size-multiplier", &param.font_size_multiplier, 4.0, "a value greater than 1 increases the rendering accuracy")
+ .add("space-as-offset", &param.space_as_offset, 0, "treat space characters as offsets")
+ .add("tounicode", &param.tounicode, 0, "how to handle ToUnicode CMaps (0=auto, 1=force, -1=ignore)")
+ .add("optimize-text", &param.optimize_text, 0, "try to reduce the number of HTML elements used for text")
+ .add("correct-text-visibility", &param.correct_text_visibility, 0, "try to detect texts covered by other graphics and properly arrange them")
+
+ // background image
+ .add("bg-format", &param.bg_format, "png", "specify background image format")
+ .add("svg-node-count-limit", &param.svg_node_count_limit, -1, "if node count in a svg background image exceeds this limit,"
+ " fall back this page to bitmap background; negative value means no limit.")
+ .add("svg-embed-bitmap", &param.svg_embed_bitmap, 1, "1: embed bitmaps in svg background; 0: dump bitmaps to external files if possible.")
+
+ // encryption
+ .add("owner-password,o", &param.owner_password, "", "owner password (for encrypted files)", true)
+ .add("user-password,u", &param.user_password, "", "user password (for encrypted files)", true)
+ .add("no-drm", &param.no_drm, 0, "override document DRM settings")
+
+ // misc.
+ .add("clean-tmp", &param.clean_tmp, 1, "remove temporary files after conversion")
+ .add("tmp-dir", &param.tmp_dir, param.tmp_dir, "specify the location of temporary directory.") // default is the value main() stored before calling this function
+ .add("data-dir", &param.data_dir, param.data_dir, "specify data directory") // default is the value main() stored before calling this function
+ .add("debug", &param.debug, 0, "print debugging information")
+ .add("proof", &param.proof, 0, "texts are drawn on both text layer and background for proof.")
+
+ // meta
+ .add("version,v", "print copyright and version info", &show_version_and_exit)
+ .add("help,h", "print usage information", &show_usage_and_exit)
+
+ .add("", &param.input_filename, "", "") // unnamed entry: presumably the first positional argument - confirm against ArgParser
+ .add("", &param.output_filename, "", "") // unnamed entry: presumably the second positional argument - confirm against ArgParser
+ ;
+
+ try
+ {
+ argparser.parse(argc, argv);
+ }
+ catch(const char * s)
+ {
+ // if s == "", getopt_long would have printed the error message
+ if(s && s[0])
+ {
+ cerr << "Error when parsing the arguments:" << endl;
+ cerr << s << endl;
+ }
+ exit(EXIT_FAILURE);
+ }
+ catch(const std::string & s)
+ {
+ // if s == "", getopt_long would have printed the error message
+ if(s != "")
+ {
+ cerr << "Error when parsing the arguments:" << endl;
+ cerr << s << endl;
+ }
+ exit(EXIT_FAILURE);
+ }
+}
+
+void check_param() // validates the parsed options, derives missing output names from the input name, and exits on unsupported combinations
+{
+ if (param.input_filename == "") // an input file is mandatory
+ {
+ show_usage_and_exit();
+ }
+
+ if(param.output_filename.empty()) // default: input base name with ".pdf" replaced by (or otherwise followed by) ".html"
+ {
+ const string s = get_filename(param.input_filename);
+ if(get_suffix(param.input_filename) == ".pdf")
+ {
+ param.output_filename = s.substr(0, s.size() - 4) + ".html";
+ }
+ else
+ {
+ param.output_filename = s + ".html";
+ }
+ }
+
+ if(param.page_filename.empty()) // default page template: <base>%d.page
+ {
+ const string s = get_filename(param.input_filename);
+ if(get_suffix(param.input_filename) == ".pdf")
+ {
+ param.page_filename = s.substr(0, s.size() - 4) + "%d.page";
+ }
+ else
+ {
+ param.page_filename = s + "%d.page";
+ }
+ sanitize_filename(param.page_filename);
+ }
+
+ else
+ {
+ // Need to make sure we have a page number placeholder in the filename
+ if(!sanitize_filename(param.page_filename))
+ {
+ // Inject the placeholder just before the file extension
+ const string suffix = get_suffix(param.page_filename);
+ param.page_filename = param.page_filename.substr(0, param.page_filename.size() - suffix.size()) + "%d" + suffix;
+ sanitize_filename(param.page_filename);
+ }
+ }
+ if(param.css_filename.empty()) // default: <base>.css
+ {
+ const string s = get_filename(param.input_filename);
+
+ if(get_suffix(param.input_filename) == ".pdf")
+ {
+ param.css_filename = s.substr(0, s.size() - 4) + ".css";
+ }
+ else
+ {
+ param.css_filename = s + ".css";
+ }
+ }
+ if(param.outline_filename.empty()) // default: <base>.outline
+ {
+ const string s = get_filename(param.input_filename);
+
+ if(get_suffix(param.input_filename) == ".pdf")
+ {
+ param.outline_filename = s.substr(0, s.size() - 4) + ".outline";
+ }
+ else
+ {
+ if(!param.split_pages) // NOTE(review): with split_pages set and a non-".pdf" input the outline filename stays empty, unlike the other defaults - confirm this is intended
+ param.outline_filename = s + ".outline";
+ }
+ }
+
+ if(false) { } // anchor for the conditionally compiled else-if chain below
+#ifdef ENABLE_LIBPNG
+ else if (param.bg_format == "png") { }
+#endif
+#ifdef ENABLE_LIBJPEG
+ else if (param.bg_format == "jpg") { }
+#endif
+#if ENABLE_SVG
+ else if(param.bg_format == "svg") { }
+#endif
+ else
+ {
+ cerr << "Image format not supported: " << param.bg_format << endl;
+ exit(EXIT_FAILURE);
+ }
+
+#if not ENABLE_SVG
+ if(param.process_type3)
+ {
+ cerr << "process-type3 is enabled, however SVG support is not built in this version of pdf2htmlEX." << endl;
+ exit(EXIT_FAILURE);
+ }
+#endif
+
+ if((param.font_format == "ttf") && (param.external_hint_tool == ""))
+ {
+ cerr << "Warning: No hint tool is specified for truetype fonts, the result may be rendered poorly in some circumstances." << endl;
+ }
+
+ if (param.embed_image && (param.bg_format == "svg") && !param.svg_embed_bitmap)
+ {
+ cerr << "Warning: --svg-embed-bitmap is forced on because --embed-image is on, or the dumped bitmaps can't be loaded." << endl;
+ param.svg_embed_bitmap = 1;
+ }
+}
+
+int main(int argc, char **argv) // entry point: set default directories, parse and check options, open the PDF and run HTMLRenderer over it
+{
+ // We need to adjust these directories before parsing the options.
+#if defined(__MINGW32__)
+ param.data_dir = get_exec_dir(argv[0]);
+ param.tmp_dir = get_tmp_dir();
+#else
+ char const* tmp = getenv("TMPDIR"); // fallbacks in order: P_tmpdir, _PATH_TMP, "/tmp"
+#ifdef P_tmpdir
+ if (!tmp)
+ tmp = P_tmpdir;
+#endif
+#ifdef _PATH_TMP
+ if (!tmp)
+ tmp = _PATH_TMP;
+#endif
+ if (!tmp)
+ tmp = "/tmp";
+ param.tmp_dir = string(tmp);
+ param.data_dir = PDF2HTMLEX_DATA_PATH;
+#endif
+
+ parse_options(argc, argv);
+ check_param();
+
+ //prepare the directories
+ prepare_directories(); // replaces param.tmp_dir with a freshly created private sub-directory
+
+ if(param.debug)
+ cerr << "temporary dir: " << (param.tmp_dir) << endl;
+
+ try
+ {
+ create_directories(param.dest_dir);
+ }
+ catch (const string & s)
+ {
+ cerr << s << endl;
+ exit(EXIT_FAILURE);
+ }
+
+ bool finished = false; // set only after the renderer completed; decides the exit status
+ // read config file
+ globalParams = new GlobalParams();
+ // open PDF file
+ PDFDoc * doc = nullptr;
+ try
+ {
+ {
+ GooString * ownerPW = (param.owner_password == "") ? (nullptr) : (new GooString(param.owner_password.c_str()));
+ GooString * userPW = (param.user_password == "") ? (nullptr) : (new GooString(param.user_password.c_str()));
+ GooString fileName(param.input_filename.c_str());
+
+ doc = PDFDocFactory().createPDFDoc(fileName, ownerPW, userPW);
+
+ delete userPW;
+ delete ownerPW;
+ }
+
+ if (!doc->isOk())
+ throw "Cannot read the file";
+
+ // check for copy permission
+ if (!doc->okToCopy())
+ {
+ if (param.no_drm == 0)
+ throw "Copying of text from this document is not allowed.";
+ cerr << "Document has copy-protection bit set." << endl;
+ }
+
+ param.first_page = min<int>(max<int>(param.first_page, 1), doc->getNumPages()); // clamp to [1, number of pages]
+ param.last_page = min<int>(max<int>(param.last_page, param.first_page), doc->getNumPages()); // clamp to [first_page, number of pages]
+
+
+ unique_ptr<HTMLRenderer>(new HTMLRenderer(param))->process(doc); // temporary renderer, destroyed at the end of this statement
+
+ finished = true;
+ }
+ catch (const char * s)
+ {
+ cerr << "Error: " << s << endl;
+ }
+ catch (const string & s)
+ {
+ cerr << "Error: " << s << endl;
+ }
+
+ // clean up
+ delete doc;
+ delete globalParams;
+
+ // check for memory leaks
+ Object::memCheck(stderr);
+ gMemReport(stderr);
+
+ exit(finished ? (EXIT_SUCCESS) : (EXIT_FAILURE));
+
+ return 0; // not reached: exit() above does not return
+}
diff --git a/src/util/const.cc b/src/util/const.cc
new file mode 100644
index 0000000..c85e0d5
--- /dev/null
+++ b/src/util/const.cc
@@ -0,0 +1,53 @@
+/*
+ * Constants
+ *
+ * by WangLu
+ * 2012.11.29
+ */
+
+#include "const.h"
+
+namespace pdf2htmlEX {
+
+using std::map;
+using std::string;
+
+const double ID_MATRIX[6] = {1.0, 0.0, 0.0, 1.0, 0.0, 0.0}; // identity transform in the project's 6-element matrix layout
+
+const map<string, string> GB_ENCODED_FONT_NAME_MAP({ // keys are raw GB-encoded byte strings (written as hex escapes), values are the ASCII font names they map to
+ {"\xCB\xCE\xCC\xE5", "SimSun"},
+ {"\xBA\xDA\xCC\xE5", "SimHei"},
+ {"\xBF\xAC\xCC\xE5_GB2312", "SimKai"},
+ {"\xB7\xC2\xCB\xCE_GB2312", "SimFang"},
+ {"\xC1\xA5\xCA\xE9", "SimLi"},
+});
+
+const std::map<std::string, EmbedStringEntry> EMBED_STRING_MAP({ // file suffix -> how to embed or reference such a file in HTML; field order follows EmbedStringEntry in const.h
+ {".css", {&Param::embed_css,
+ "<style type=\"text/css\">",
+ "</style>", false,
+ "<link rel=\"stylesheet\" href=\"",
+ "\"/>" }},
+ {".js", {&Param::embed_javascript,
+ "<script>",
+ "</script>", false,
+ "<script src=\"",
+ "\"></script>" }},
+ {".png", {&Param::embed_image,
+ "<img alt=\"\" src=\"data:image/png;base64,",
+ "\"/>", true,
+ "<img alt=\"\" src=\"",
+ "\"/>" }} // the only entry with base64_encode set: the image is inlined as a data URI
+});
+
+const std::map<std::string, std::string> FORMAT_MIME_TYPE_MAP({ // format name (no leading dot) -> MIME type string
+ {"eot", "application/vnd.ms-fontobject"},
+ {"jpg", "image/jpeg"},
+ {"otf", "application/x-font-otf"},
+ {"png", "image/png"},
+ {"svg", "image/svg+xml"},
+ {"ttf", "application/x-font-ttf"},
+ {"woff", "application/font-woff"},
+});
+
+} //namespace pdf2htmlEX
diff --git a/src/util/const.h b/src/util/const.h
new file mode 100644
index 0000000..db29a5c
--- /dev/null
+++ b/src/util/const.h
@@ -0,0 +1,46 @@
+/*
+ * Constants
+ *
+ * by WangLu
+ * 2012.11.29
+ */
+
+#ifndef CONST_H__
+#define CONST_H__
+
+#include <map>
+#include <string>
+
+#include "Param.h"
+
+namespace pdf2htmlEX {
+
+// Fallback for pre-C++11 compilers only. Since C++11 `nullptr` is a keyword:
+// `#ifndef nullptr` cannot detect it (keywords are not macros), and defining a
+// macro with a keyword's name is not permitted, so test the language version.
+#if !defined(nullptr) && (!defined(__cplusplus) || (__cplusplus < 201103L))
+#define nullptr (NULL)
+#endif
+
+static const double EPS = 1e-6; // small tolerance constant; NOTE(review): presumably for floating-point comparisons - confirm at the use sites
+static const double DEFAULT_DPI = 72.0;
+extern const double ID_MATRIX[6]; // defined in const.cc as {1,0,0,1,0,0}
+
+// For GB encoded font names
+extern const std::map<std::string, std::string> GB_ENCODED_FONT_NAME_MAP;
+// One entry of EMBED_STRING_MAP: how a file of a given suffix is embedded into, or referenced from, the HTML
+struct EmbedStringEntry
+{
+ int Param::*embed_flag; // pointer to the Param member that selects embedding for this file type
+ // used when the flag selected by embed_flag is set: text before/after the embedded content
+ std::string prefix_embed;
+ std::string suffix_embed;
+ bool base64_encode; // whether the embedded content is base64 encoded (true only for ".png" in const.cc)
+ // used when the flag selected by embed_flag is not set: text before/after the external reference
+ std::string prefix_external;
+ std::string suffix_external;
+};
+extern const std::map<std::string, EmbedStringEntry> EMBED_STRING_MAP; // keyed by file suffix including the dot, e.g. ".css"
+
+extern const std::map<std::string, std::string> FORMAT_MIME_TYPE_MAP; // keyed by format name without dot, e.g. "png"
+
+} // namespace pdf2htmlEX
+
+#endif //CONST_H__
diff --git a/src/util/css_const.h.in b/src/util/css_const.h.in
new file mode 100644
index 0000000..08c23fc
--- /dev/null
+++ b/src/util/css_const.h.in
@@ -0,0 +1,67 @@
+/* vim: set filetype=cpp : */
+/*
+ * css_const.h
+ *
+ * Copyright (C) 2013 Lu Wang <coolwanglu@gmail.com>
+ */
+
+#ifndef CSS_CONST_H__
+#define CSS_CONST_H__
+
+
+/*
+ * should be consistent with base.css and pdf2htmlEX.js
+ */
+
+namespace pdf2htmlEX {
+namespace CSS {
+
+// strings used for CSS workarounds: media queries that wrap WebKit-only and print-only rules
+const char * const WEBKIT_ONLY = "@media screen and (-webkit-min-device-pixel-ratio:0)";
+const char * const PRINT_ONLY = "@media print";
+
+// usually the class name is XXX_CN or XXX_CN<hex id>
+// sometimes we need a special one, e.g. transparent color, where the id is -1
+const char * const INVALID_ID = "@CSS_INVALID_ID@";
+
+const char * const LINE_CN = "@CSS_LINE_CN@";
+const char * const TRANSFORM_MATRIX_CN = "@CSS_TRANSFORM_MATRIX_CN@";
+const char * const CLIP_CN = "@CSS_CLIP_CN@";
+
+// page_decoration is for shadow etc
+// page_frame cannot have margin or border-width, pdf2htmlEX.js will use it to determine the coordinates
+// page_content holds everything inside the page, could be hidden to speed up rendering
+// page_data holds data for pdf2htmlEX.js
+const char * const PAGE_DECORATION_CN = "@CSS_PAGE_DECORATION_CN@"; // NOTE(review): css_class_names.cmakelists.txt does not set CSS_PAGE_DECORATION_CN - confirm where it is defined
+const char * const PAGE_FRAME_CN = "@CSS_PAGE_FRAME_CN@";
+const char * const PAGE_CONTENT_BOX_CN = "@CSS_PAGE_CONTENT_BOX_CN@";
+const char * const PAGE_DATA_CN = "@CSS_PAGE_DATA_CN@";
+
+const char * const BACKGROUND_IMAGE_CN = "@CSS_BACKGROUND_IMAGE_CN@";
+const char * const FULL_BACKGROUND_IMAGE_CN = "@CSS_FULL_BACKGROUND_IMAGE_CN@";
+
+const char * const FONT_FAMILY_CN = "@CSS_FONT_FAMILY_CN@";
+const char * const FONT_SIZE_CN = "@CSS_FONT_SIZE_CN@";
+const char * const FILL_COLOR_CN = "@CSS_FILL_COLOR_CN@";
+const char * const STROKE_COLOR_CN = "@CSS_STROKE_COLOR_CN@";
+const char * const LETTER_SPACE_CN = "@CSS_LETTER_SPACE_CN@";
+const char * const WORD_SPACE_CN = "@CSS_WORD_SPACE_CN@";
+const char * const VERTICAL_ALIGN_CN = "@CSS_VERTICAL_ALIGN_CN@";
+const char * const WHITESPACE_CN = "@CSS_WHITESPACE_CN@";
+const char * const LEFT_CN = "@CSS_LEFT_CN@";
+const char * const HEIGHT_CN = "@CSS_HEIGHT_CN@";
+const char * const WIDTH_CN = "@CSS_WIDTH_CN@";
+const char * const BOTTOM_CN = "@CSS_BOTTTOM_CN@"; // the triple-T spelling matches the variable set in css_class_names.cmakelists.txt; keep the two in sync
+
+const char * const CSS_DRAW_CN = "@CSS_CSS_DRAW_CN@";
+const char * const LINK_CN = "@CSS_LINK_CN@";
+
+const char * const INPUT_TEXT_CN = "@CSS_INPUT_TEXT_CN@";
+const char * const INPUT_RADIO_CN = "@CSS_INPUT_RADIO_CN@";
+const char * const RADIO_CHECKED_CN = "@CSS_RADIO_CHECKED_CN@";
+
+}
+}
+
+
+#endif //CSS_CONST_H__
diff --git a/src/util/encoding.cc b/src/util/encoding.cc
new file mode 100644
index 0000000..6b600bc
--- /dev/null
+++ b/src/util/encoding.cc
@@ -0,0 +1,182 @@
+/*
+ * Encodings used in HTML
+ *
+ * by WangLu
+ * 2013.02.15
+ */
+
+#include <cstring>
+
+#include "encoding.h"
+#include "const.h" // for nullptr
+
+namespace pdf2htmlEX {
+
+using std::ostream;
+using std::string;
+
+/*
+ * Copied from UTF.h / UTF8.h in poppler
+ */
+static int mapUTF8(Unicode u, char *buf, int bufSize)
+{
+    // Encode one code point as UTF-8 into buf.
+    // Returns the number of bytes written, or 0 when u is above U+10FFFF
+    // or bufSize is too small for the sequence (nothing is written then).
+    int len;
+    if (u <= 0x0000007f)
+        len = 1;
+    else if (u <= 0x000007ff)
+        len = 2;
+    else if (u <= 0x0000ffff)
+        len = 3;
+    else if (u <= 0x0010ffff)
+        len = 4;
+    else
+        return 0;
+
+    if (bufSize < len)
+        return 0;
+
+    switch (len)
+    {
+    case 1:
+        buf[0] = (char)u;
+        break;
+    case 2:
+        buf[0] = (char)(0xc0 + (u >> 6));
+        buf[1] = (char)(0x80 + (u & 0x3f));
+        break;
+    case 3:
+        buf[0] = (char)(0xe0 + (u >> 12));
+        buf[1] = (char)(0x80 + ((u >> 6) & 0x3f));
+        buf[2] = (char)(0x80 + (u & 0x3f));
+        break;
+    default: // 4 bytes
+        buf[0] = (char)(0xf0 + (u >> 18));
+        buf[1] = (char)(0x80 + ((u >> 12) & 0x3f));
+        buf[2] = (char)(0x80 + ((u >> 6) & 0x3f));
+        buf[3] = (char)(0x80 + (u & 0x3f));
+        break;
+    }
+    return len;
+}
+
+void writeUnicodes(ostream & out, const Unicode * u, int uLen)
+{
+    // Write uLen code points to out: the five HTML-special characters
+    // become entities, everything else is emitted as UTF-8.
+    for(int idx = 0; idx < uLen; ++idx)
+    {
+        const Unicode cp = u[idx];
+        const char * entity = nullptr;
+
+        if(cp == '&')
+            entity = "&amp;";
+        else if(cp == '\"')
+            entity = "&quot;";
+        else if(cp == '\'')
+            entity = "&apos;";
+        else if(cp == '<')
+            entity = "&lt;";
+        else if(cp == '>')
+            entity = "&gt;";
+
+        if(entity)
+        {
+            out << entity;
+            continue;
+        }
+
+        // mapUTF8 yields 0 bytes for values it cannot encode, so nothing is written for those
+        char utf8[4];
+        const int n = mapUTF8(cp, utf8, 4);
+        out.write(utf8, n);
+    }
+}
+
+/*
+static void writeHEX(ostream & out, char c)
+{
+ static const char * hexchars = "0123456789abcdef";
+ out << hexchars[(c>>4)&0xf] << hexchars[c&0xf];
+}
+
+void writeURL(ostream & out, const string & s)
+{
+ static char * dont_escape = nullptr;
+ if(!dont_escape)
+ {
+ dont_escape = new char [256];
+ memset(dont_escape, 0, 256 * sizeof(char));
+ / *
+ * http://tools.ietf.org/html/rfc3986#section-2
+ *
+ * Also includes '%', in case that the original url has been escaped
+ * /
+ const char * no_escape_chars = ":/?#[]@!$&'()*+,;="
+ "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+ "abcdefghijklmnopqrstuvwxyz"
+ "0123456789"
+ "-._~"
+ "%";
+ while(*no_escape_chars)
+ dont_escape[(int)*(no_escape_chars++)] = 1;
+ }
+
+ for (auto iter = s.begin(); iter != s.end(); ++iter)
+ {
+ char c = *iter;
+ if(dont_escape[(int)c])
+ out << c;
+ else
+ {
+ out << '%';
+ writeHEX(out, c);
+ }
+ }
+}
+*/
+
+/*
+ * Write s to out escaped for use inside a JSON string literal
+ * (the surrounding double quotes are NOT written).
+ *
+ * - backslash, double quote and '/' get backslash escapes
+ * - \b \f \n \r \t use their short escapes
+ * - an apostrophe is written as \u0027: "\'" is not a legal JSON escape,
+ *   while \u0027 is and still keeps a raw apostrophe out of the output
+ * - any other control character below 0x20 is written as \u00XX,
+ *   as JSON forbids raw control characters inside strings
+ * - all other bytes are copied unchanged
+ */
+void writeJSON(ostream & out, const string & s)
+{
+    static const char * const hexchars = "0123456789abcdef";
+    for(auto c : s)
+    {
+        switch (c)
+        {
+            case '\\': out << "\\\\"; break;
+            case '"': out << "\\\""; break;
+            case '\'': out << "\\u0027"; break;
+            case '/': out << "\\/"; break;
+            case '\b': out << "\\b"; break;
+            case '\f': out << "\\f"; break;
+            case '\n': out << "\\n"; break;
+            case '\r': out << "\\r"; break;
+            case '\t': out << "\\t"; break;
+            default:
+                if((unsigned char)c < 0x20)
+                    out << "\\u00" << hexchars[(c >> 4) & 0xf] << hexchars[c & 0xf];
+                else
+                    out << c;
+                break;
+        }
+    }
+}
+
+void writeAttribute(std::ostream & out, const std::string & s)
+{
+    // Copy s to out, replacing characters that are unsafe inside an HTML
+    // attribute value with character references.
+    for (auto ch : s)
+    {
+        const char * replacement = nullptr;
+
+        if (ch == '&')
+            replacement = "&amp;";
+        else if (ch == '\"')
+            replacement = "&quot;";
+        else if (ch == '\'')
+            replacement = "&apos;";
+        else if (ch == '<')
+            replacement = "&lt;";
+        else if (ch == '>')
+            replacement = "&gt;";
+        else if (ch == '`') // for IE: http://html5sec.org/#59
+            replacement = "&#96;";
+
+        if (replacement)
+            out << replacement;
+        else
+            out << ch;
+    }
+}
+
+} //namespace pdf2htmlEX
diff --git a/src/util/encoding.h b/src/util/encoding.h
new file mode 100644
index 0000000..c4d7732
--- /dev/null
+++ b/src/util/encoding.h
@@ -0,0 +1,41 @@
+/*
+ * Encodings used in HTML
+ *
+ * by WangLu
+ * 2013.02.15
+ */
+
+#ifndef ENCODING_H__
+#define ENCODING_H__
+
+#include <string>
+#include <iostream>
+
+#include <CharTypes.h>
+
+namespace pdf2htmlEX {
+
+/*
+ * Escape necessary characters, and map Unicode to UTF-8
+ */
+void writeUnicodes(std::ostream & out, const Unicode * u, int uLen);
+
+
+/*
+ * URL escaping
+ */
+//void writeURL(std::ostream & out, const std::string & s);
+
+/*
+ * JSON escaping
+ */
+void writeJSON(std::ostream & out, const std::string & s);
+
+/*
+ * HTML tag attribute escaping
+ */
+void writeAttribute(std::ostream & out, const std::string & s);
+
+} // namespace pdf2htmlEX
+
+#endif //ENCODING_H__
diff --git a/src/util/ffw.c b/src/util/ffw.c
new file mode 100644
index 0000000..b88efce
--- /dev/null
+++ b/src/util/ffw.c
@@ -0,0 +1,485 @@
+/*
+ * ffw.c: Fontforge wrapper
+ *
+ * Processing fonts using Fontforge
+ *
+ * Copyright (C) 2012-2014 Lu Wang <coolwanglu@gmail.com>
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdarg.h>
+#include <assert.h>
+#include <math.h>
+
+#include <fontforge.h>
+#include <baseviews.h>
+
+#include "ffw.h"
+
+static real EPS=1e-6;
+
+static inline int min(int a, int b)
+{
+    /* smaller of the two arguments */
+    if(a < b)
+        return a;
+    return b;
+}
+
+static FontViewBase * cur_fv = NULL;
+static Encoding * original_enc = NULL;
+static Encoding * unicodefull_enc = NULL;
+static Encoding * enc_head = NULL;
+
+/*
+ * Print a printf-style message to stderr, then terminate the process
+ * with exit(-1). Never returns to the caller.
+ */
+static void err(const char * format, ...)
+{
+ va_list al;
+ va_start(al, format);
+ vfprintf(stderr, format, al);
+ va_end(al);
+ exit(-1);
+}
+/*
+ * strdup() wrapper: NULL in gives NULL out; a failed duplication
+ * terminates the program through err(). The result must be free()d.
+ */
+static char * strcopy(const char * str)
+{
+    char * dup;
+
+    if(str == NULL)
+        return NULL;
+
+    dup = strdup(str);
+    if(dup == NULL)
+        err("Not enough memory");
+    return dup;
+}
+
+/* No-op; installed by ffw_init() as ui_interface->logwarning when debug is off */
+static void dumb_logwarning(const char * format, ...) { }
+
+/* No-op; installed by ffw_init() as ui_interface->post_error when debug is off */
+static void dumb_post_error(const char * title, const char * error, ...) { }
+
+/*
+ * Set up FontForge for use through this wrapper.
+ *
+ * - runs InitSimpleStuff() and makes sure default_encoding is set
+ * - when debug is zero, replaces FontForge's logwarning/post_error
+ *   callbacks with no-ops
+ * - caches the "original" and "UnicodeFull" encodings used by
+ *   ffw_reencode_glyph_order() / ffw_reencode_unicode_full()
+ * - sets the "DetectDiagonalStems" preference to 1
+ */
+void ffw_init(int debug)
+{
+ InitSimpleStuff();
+ if ( default_encoding==NULL )
+ default_encoding=FindOrMakeEncoding("ISO8859-1");
+ if ( default_encoding==NULL )
+ default_encoding=&custom; /* In case iconv is broken */
+
+ if(!debug)
+ {
+ //disable error output of Fontforge
+ ui_interface->logwarning = &dumb_logwarning;
+ ui_interface->post_error = &dumb_post_error;
+ }
+
+ original_enc = FindOrMakeEncoding("original");
+ unicodefull_enc = FindOrMakeEncoding("UnicodeFull");
+
+ {
+ Val v;
+ v.type = v_int;
+ v.u.ival = 1;
+ SetPrefs("DetectDiagonalStems", &v, NULL);
+ }
+}
+
+/*
+ * Free every Encoding created by ffw_reencode_raw()/ffw_reencode_raw2()
+ * (the list headed by enc_head), including its name, unicode table and
+ * glyph-name table. enc_head is NULL afterwards.
+ */
+void ffw_finalize(void)
+{
+    Encoding * next;
+    int i;
+
+    for(; enc_head != NULL; enc_head = next)
+    {
+        next = enc_head->next;
+
+        free((void*)enc_head->enc_name);
+        free(enc_head->unicode);
+        if(enc_head->psnames)
+        {
+            for(i = 0; i < enc_head->char_cnt; ++i)
+                free((void*)enc_head->psnames[i]);
+            free(enc_head->psnames);
+        }
+        free(enc_head);
+    }
+}
+
+/* Return the FONTFORGE_VERSIONDATE_RAW value this wrapper was compiled against */
+long ffw_get_version(void)
+{
+ return FONTFORGE_VERSIONDATE_RAW;
+}
+
+/*
+ * Create a new, empty font and make it the current one (cur_fv).
+ * Any previous font must already have been released with ffw_close() (asserted).
+ */
+void ffw_new_font()
+{
+ assert((cur_fv == NULL) && "Previous font is not destroyed");
+ cur_fv = FVAppend(_FontViewCreate(SplineFontNew()));
+}
+
+/*
+ * Load the font stored at filename and make it the current one (cur_fv).
+ * Any previous font must already have been released with ffw_close() (asserted).
+ * Terminates through err() when LoadSplineFont() returns NULL.
+ */
+void ffw_load_font(const char * filename)
+{
+ assert((cur_fv == NULL) && "Previous font is not destroyed");
+
+ /* LoadSplineFont() is handed a private, non-const copy of the name */
+ char * _filename = strcopy(filename);
+ SplineFont * font = LoadSplineFont(_filename, 1);
+
+ free(_filename);
+
+ if(!font)
+ err("Cannot load font %s\n", filename);
+
+ /* attach a font view when the loaded font does not have one yet */
+ if(!font->fv)
+ FVAppend(_FontViewCreate(font));
+
+ assert(font->fv);
+
+ cur_fv = font->fv;
+}
+
+/*
+ * Fight against dirty stuff
+ */
+void ffw_prepare_font(void)
+{
+    SplineFont * sf;
+    int gid;
+
+    /* mark every encoding slot as selected, then drop horizontal and vertical kerning */
+    memset(cur_fv->selected, 1, cur_fv->map->enccount);
+    FVRemoveKerns(cur_fv);
+    FVRemoveVKerns(cur_fv);
+
+    /*
+     * Drop alternate Unicode values from every glyph.
+     * They are never used because a forced encoding is applied later.
+     */
+    sf = cur_fv->sf;
+    for(gid = 0; gid < sf->glyphcnt; ++gid)
+    {
+        SplineChar * glyph = sf->glyphs[gid];
+        if(glyph != NULL && glyph->altuni != NULL)
+        {
+            AltUniFree(glyph->altuni);
+            glyph->altuni = NULL;
+        }
+    }
+
+    /*
+     * Replace the font name with an empty string:
+     * browsers may reject fonts with malformed font names.
+     */
+    free(sf->fontname);
+    sf->fontname = strcopy("");
+}
+
+/*
+ * Write the current font to filename through GenerateScript().
+ * Terminates through err() when GenerateScript() returns 0.
+ */
+void ffw_save(const char * filename)
+{
+ /* GenerateScript() is given non-const, heap-allocated copies of both strings */
+ char * _filename = strcopy(filename);
+ char * _ = strcopy("");
+
+ int r = GenerateScript(cur_fv->sf, _filename
+ , _, -1, -1, NULL, NULL, cur_fv->map, NULL, ly_fore);
+
+ free(_);
+ free(_filename);
+
+ if(!r)
+ err("Cannot save font to %s\n", filename);
+}
+/* Close the current font view and clear cur_fv so a new font can be created or loaded */
+void ffw_close(void)
+{
+ FontViewClose(cur_fv);
+ cur_fv = NULL;
+}
+
+/*
+ * Switch the current font to the given encoding.
+ *
+ * force != 0: SFForceEncoding() is applied to the existing map.
+ * force == 0: the existing map is freed and rebuilt with EncMapFromEncoding().
+ *
+ * Afterwards the cached "normal" map (if any) is discarded, the BDF
+ * encoding properties are refreshed, and the selection array is
+ * reallocated, zero-filled, to match the new map's enccount.
+ */
+static void ffw_do_reencode(Encoding * encoding, int force)
+{
+ assert(encoding);
+
+ if(force)
+ {
+ SFForceEncoding(cur_fv->sf, cur_fv->map, encoding);
+ }
+ else
+ {
+ EncMapFree(cur_fv->map);
+ cur_fv->map = EncMapFromEncoding(cur_fv->sf, encoding);
+ }
+ if(cur_fv->normal)
+ {
+ EncMapFree(cur_fv->normal);
+ cur_fv->normal = NULL;
+ }
+
+ SFReplaceEncodingBDFProps(cur_fv->sf, cur_fv->map);
+
+ /* the selection array is sized by enccount, which may have changed */
+ free(cur_fv->selected);
+ cur_fv->selected = calloc(cur_fv->map->enccount, sizeof(char));
+}
+
+/* Re-encode the current font with the "original" encoding cached by ffw_init() (not forced) */
+void ffw_reencode_glyph_order(void)
+{
+ ffw_do_reencode(original_enc, 0);
+}
+
+/* Re-encode the current font with the "UnicodeFull" encoding cached by ffw_init() (not forced) */
+void ffw_reencode_unicode_full(void)
+{
+ ffw_do_reencode(unicodefull_enc, 0);
+}
+
+/*
+ * Re-encode the current font with the encoding named encname.
+ * Terminates through err() when FindOrMakeEncoding() does not return an encoding.
+ * force is passed on to ffw_do_reencode().
+ */
+void ffw_reencode(const char * encname, int force)
+{
+ Encoding * enc = FindOrMakeEncoding(encname);
+ if(!enc)
+ err("Unknown encoding %s\n", encname);
+
+ ffw_do_reencode(enc, force);
+}
+
+/*
+ * Re-encode the current font with an ad-hoc encoding built from a table
+ * of Unicode values.
+ *
+ * mapping     - mapping_len Unicode values, indexed by encoding slot
+ * mapping_len - number of entries in mapping; the encoding always has at
+ *               least 256 slots, missing ones are filled with -1
+ * force       - passed on to ffw_do_reencode()
+ *
+ * The Encoding is linked into enc_head and released by ffw_finalize().
+ * Terminates through err() when memory cannot be allocated.
+ */
+void ffw_reencode_raw(int32 * mapping, int mapping_len, int force)
+{
+    /* a negative count would turn into a huge size_t in memcpy() below */
+    if(mapping_len < 0)
+        mapping_len = 0;
+
+    Encoding * enc = calloc(1, sizeof(Encoding));
+    if(!enc)
+        err("Not enough memory");
+    enc->only_1byte = enc->has_1byte = true;
+
+    int len = (mapping_len < 256) ? 256 : mapping_len;
+    enc->char_cnt = len;
+    enc->unicode = (int32_t*)malloc(len * sizeof(int32_t));
+    if(!enc->unicode)
+        err("Not enough memory");
+    if(mapping_len > 0)
+        memcpy(enc->unicode, mapping, mapping_len * sizeof(int32_t));
+
+    /* len exceeds mapping_len only when padding up to 256 slots */
+    int i;
+    for(i = mapping_len; i < len; ++i)
+        enc->unicode[i] = -1;
+
+    enc->enc_name = strcopy("");
+
+    enc->next = enc_head;
+    enc_head = enc;
+
+    ffw_do_reencode(enc, force);
+}
+
+/*
+ * Re-encode the current font with an ad-hoc encoding built from a table
+ * of glyph names.
+ *
+ * mapping     - mapping_len glyph names indexed by encoding slot; a NULL
+ *               entry leaves the slot unmapped (-1)
+ * mapping_len - number of entries in mapping
+ * force       - passed on to ffw_do_reencode()
+ *
+ * The Encoding is linked into enc_head and released by ffw_finalize().
+ * Terminates through err() when memory cannot be allocated.
+ */
+void ffw_reencode_raw2(char ** mapping, int mapping_len, int force)
+{
+    Encoding * enc = calloc(1, sizeof(Encoding));
+    if(!enc)
+        err("Not enough memory");
+    enc->enc_name = strcopy("");
+    enc->char_cnt = mapping_len;
+    enc->unicode = (int32_t*)malloc(mapping_len * sizeof(int32_t));
+    enc->psnames = (char**)calloc(mapping_len, sizeof(char*));
+    /* a zero-sized request may legitimately return NULL, so only check real ones */
+    if((mapping_len > 0) && (!enc->unicode || !enc->psnames))
+        err("Not enough memory");
+
+    int i;
+    for(i = 0; i < mapping_len; ++i)
+    {
+        if(mapping[i])
+        {
+            enc->unicode[i] = UniFromName(mapping[i], ui_none, &custom);
+            enc->psnames[i] = strcopy(mapping[i]);
+        }
+        else
+        {
+            enc->unicode[i] = -1;
+        }
+    }
+
+    enc->next = enc_head;
+    enc_head = enc;
+
+    ffw_do_reencode(enc, force);
+}
+
+/*
+ * Flatten the current CID-keyed font via SFFlatten() on its cidmaster.
+ * For a font without a cidmaster a message is printed to stderr and nothing else happens.
+ */
+void ffw_cidflatten(void)
+{
+ if(!cur_fv->sf->cidmaster)
+ {
+ fprintf(stderr, "Cannot flatten a non-CID font\n");
+ return;
+ }
+ SFFlatten(cur_fv->sf->cidmaster);
+}
+
+/*
+ * There is no check if a glyph with the same unicode exists!
+ * TODO: let FontForge fill in the standard glyph name <- or maybe this might cause collision?
+ */
+void ffw_add_empty_char(int32_t unicode, int width)
+{
+ /* the new glyph is created at slot index enccount of the current map */
+ SplineChar * sc = SFMakeChar(cur_fv->sf, cur_fv->map, cur_fv->map->enccount);
+ /* scratch space for StdGlyphName() */
+ char buffer[400];
+ /* NOTE(review): the strcopy() result is handed to SCSetMetaData(); confirm it takes ownership, otherwise this leaks */
+ SCSetMetaData(sc,
+ strcopy(StdGlyphName(buffer, unicode,
+ cur_fv->sf->uni_interp, cur_fv->sf->for_new_glyphs)),
+ unicode, sc->comment);
+ SCSynchronizeWidth(sc, width, sc->width, cur_fv);
+}
+
+/* Em size of the current font: the sum of its ascent and descent fields */
+int ffw_get_em_size(void)
+{
+ return cur_fv->sf->ascent + cur_fv->sf->descent;
+}
+
+/* Measure ascent/descent with ffw_get_metric() and write them back with ffw_set_metric() */
+void ffw_fix_metric()
+{
+ double ascent, descent;
+ ffw_get_metric(&ascent, &descent);
+ ffw_set_metric(ascent, descent);
+}
+
+/*
+ * Report the vertical extent of the current font's bounding box as
+ * fractions of the em size: *ascent = maxy / em, *descent = miny / em.
+ * Both are set to 0 when the em size is not positive.
+ */
+void ffw_get_metric(double * ascent, double * descent)
+{
+    SplineFont * sf = cur_fv->sf;
+    DBounds bb;
+    int em;
+
+    SplineFontFindBounds(sf, &bb);
+    em = sf->ascent + sf->descent;
+
+    if (em <= 0)
+    {
+        *ascent = *descent = 0;
+        return;
+    }
+
+    *ascent = ((double)bb.maxy) / em;
+    *descent = ((double)bb.miny) / em;
+}
+
+/*
+ * Store one consistent ascent/descent pair in all three metric sets
+ * (OS/2 win, OS/2 typo, hhea) of the current font and zero the line gaps.
+ *
+ * ascent / descent are fractions of the em size; they are scaled by em,
+ * rounded to the nearest integer, and clamped so that ascent >= 0 and
+ * descent <= 0.
+ */
+void ffw_set_metric(double ascent, double descent)
+{
+ SplineFont * sf = cur_fv->sf;
+ struct pfminfo * info = &sf->pfminfo;
+
+ SFDefaultOS2Info(info, sf, sf->fontname);
+ info->pfmset = 1;
+ sf->changed = 1;
+
+ int em = sf->ascent + sf->descent;
+ /* floor(v + 0.5): round to nearest */
+ int a = floor(ascent * em + 0.5);
+ int d = floor(descent * em + 0.5);
+
+ if(a < 0) a = 0;
+ if(d > 0) d = 0;
+
+ /*
+ sf->ascent = min(a, em);
+ sf->descent = em - bb.maxy;
+ */
+
+ /*
+ * The embedded fonts are likely to have inconsistent values for the 3 sets of ascent/descent
+ * PDF viewers don't care, since they don't even use these values
+ * But have to unify them, for different browsers on different platforms
+ * Things may become easier when there are CSS rules for baseline-based positioning.
+ */
+ info->os2_winascent = a;
+ info->os2_typoascent = a;
+ info->hhead_ascent = a;
+ info->winascent_add = 0;
+ info->typoascent_add = 0;
+ info->hheadascent_add = 0;
+
+ /* the win descent is stored with the opposite sign of the typo/hhea descents */
+ info->os2_windescent = -d;
+ info->os2_typodescent = d;
+ info->hhead_descent = d;
+ info->windescent_add = 0;
+ info->typodescent_add = 0;
+ info->hheaddescent_add = 0;
+
+ info->os2_typolinegap = 0;
+ info->linegap = 0;
+}
+
+/*
+ * TODO:bitmap, reference have not been considered in this function
+ */
+/*
+ * Set the advance width of encoded glyphs.
+ *
+ * width_list     - target width per encoding slot; -1 leaves the slot alone
+ * mapping_len    - number of entries in width_list; slots at or beyond
+ *                  min(mapping_len, enccount) are ignored
+ * stretch_narrow - non-zero: horizontally scale a glyph whose current
+ *                  width is below the target
+ * squeeze_wide   - non-zero: horizontally scale a glyph whose current
+ *                  width is above the target
+ *
+ * Glyphs whose current width is not above EPS are never scaled.
+ */
+void ffw_set_widths(int * width_list, int mapping_len,
+ int stretch_narrow, int squeeze_wide)
+{
+ SplineFont * sf = cur_fv->sf;
+
+ if(sf->onlybitmaps
+ && cur_fv->active_bitmap != NULL
+ && sf->bitmaps != NULL)
+ {
+ printf("TODO: width vs bitmap\n");
+ }
+
+ EncMap * map = cur_fv->map;
+ int i;
+ int imax = min(mapping_len, map->enccount);
+ for(i = 0; i < imax; ++i)
+ {
+ /*
+ * Don't mess with it if the glyphs is not used.
+ */
+ if(width_list[i] == -1)
+ {
+ continue;
+ }
+
+ /* j: glyph index for encoding slot i, -1 when the slot is unmapped */
+ int j = map->map[i];
+ if(j == -1) continue;
+
+ SplineChar * sc = sf->glyphs[j];
+ if(sc == NULL)
+ {
+ /* NOTE(review): j is a glyph index, while the other SFMakeChar() calls in this file pass an encoding slot - confirm whether i was intended */
+ sc = SFMakeChar(cur_fv->sf, cur_fv->map, j);
+ }
+ else if(((sc->width > EPS)
+ && (((sc->width > width_list[i] + EPS) && (squeeze_wide))
+ || ((sc->width < width_list[i] - EPS) && (stretch_narrow)))))
+ {
+ /* scale x by target/current width, leave y untouched, no translation */
+ real transform[6];
+ transform[0] = ((double)width_list[i]) / (sc->width);
+ transform[3] = 1.0;
+ transform[1] = transform[2] = transform[4] = transform[5] = 0;
+ FVTrans(cur_fv, sc, transform, NULL, fvt_alllayers | fvt_dontmovewidth);
+ }
+
+ SCSynchronizeWidth(sc, width_list[i], sc->width, cur_fv);
+ }
+}
+
+/*
+ * Import the SVG file at filename as the outline of the glyph for code.
+ *
+ * ox, oy - offsets as fractions of the em size; the imported outline is
+ *          translated by (-ox * em, -oy * em + descent)
+ * width  - advance width as a fraction of the em size
+ *
+ * Returns silently when SFFindSlot() finds no slot for code; terminates
+ * through err() when the import fails.
+ */
+void ffw_import_svg_glyph(int code, const char * filename, double ox, double oy, double width)
+{
+ int enc = SFFindSlot(cur_fv->sf, cur_fv->map, code, "");
+ if(enc == -1)
+ return;
+
+ SplineChar * sc = SFMakeChar(cur_fv->sf, cur_fv->map, enc);
+
+ /* select only the target slot before calling FVImportImages() */
+ memset(cur_fv->selected, 0, cur_fv->map->enccount);
+ cur_fv->selected[enc] = 1;
+ int ok = FVImportImages(cur_fv, (char*)filename, fv_svg, 0, -1);
+ if(!ok)
+ err("Import SVG glyph failed");
+
+ // correct origin and width
+ {
+ int a = cur_fv->sf->ascent;
+ int d = cur_fv->sf->descent;
+ real transform[6];
+ /* pure translation: identity scale, no shear */
+ transform[0] = 1.0;
+ transform[3] = 1.0;
+ transform[1] = transform[2] = 0.0;
+ transform[4] = -ox * (a+d);
+ transform[5] = -oy * (a+d) + d;
+ FVTrans(cur_fv, sc, transform, NULL, fvt_alllayers | fvt_dontmovewidth);
+
+ /* width is a fraction of the em size; round to the nearest unit */
+ SCSynchronizeWidth(sc, floor(width * (a+d) + 0.5), sc->width, cur_fv);
+ }
+}
+
+/*
+ * Auto-hint and auto-instruct the current font.
+ * A font whose foreground layer is not order2 is converted first;
+ * every encoding slot is then selected before the FVAuto* calls.
+ */
+void ffw_auto_hint(void)
+{
+ // convert to quadratic
+ if(!(cur_fv->sf->layers[ly_fore].order2))
+ {
+ SFCloseAllInstrs(cur_fv->sf);
+ SFConvertToOrder2(cur_fv->sf);
+ }
+ memset(cur_fv->selected, 1, cur_fv->map->enccount);
+ FVAutoHint(cur_fv);
+ FVAutoInstr(cur_fv);
+}
+
+/* Set the fstype field in the current font's pfminfo to 0 and mark the font as changed */
+void ffw_override_fstype(void)
+{
+ /* written through an int16 pointer to the field's address */
+ *(int16 *)(&cur_fv->sf->pfminfo.fstype) = 0;
+ cur_fv->sf->pfminfo.pfmset = true;
+ cur_fv->sf->changed = true;
+}
diff --git a/src/util/ffw.h b/src/util/ffw.h
new file mode 100644
index 0000000..a01ed79
--- /dev/null
+++ b/src/util/ffw.h
@@ -0,0 +1,74 @@
+/*
+ * ffw.h : Fontforge Wrapper
+ *
+ * Processing fonts using Fontforge
+ *
+ * fontforge.h cannot be included in C++
+ * So this wrapper in C publishes several functions we need
+ *
+ * by WangLu
+ * 2012.09.03
+ */
+
+
+#ifdef __cplusplus
+#include <cstdint>
+namespace pdf2htmlEX {
+extern "C" {
+#else
+#include <stdint.h>
+#endif
+
+////////////////////////
+// global
+void ffw_init(int debug);
+void ffw_finalize(void);
+long ffw_get_version(void);
+
+////////////////////////
+// load & save
+void ffw_new_font();
+void ffw_load_font(const char * filename);
+void ffw_prepare_font(void);
+
+void ffw_save(const char * filename);
+void ffw_close(void);
+
+////////////////////////
+// encoding
+void ffw_reencode_glyph_order(void);
+void ffw_reencode_unicode_full(void);
+void ffw_reencode_raw(int32_t * mapping, int mapping_len, int force);
+void ffw_reencode_raw2(char ** mapping, int mapping_len, int force);
+
+void ffw_cidflatten(void);
+// add a new empty char into the font
+void ffw_add_empty_char(int32_t unicode, int width);
+
+////////////////////////
+// metrics
+int ffw_get_em_size(void);
+// manipulate ascent and descent
+// ascent is between 0 and 1
+// descent is between -1 and 0
+void ffw_fix_metric();
+// get ascent/descent based on the shape
+void ffw_get_metric(double * ascent, double * descent);
+// set corresponding fields
+void ffw_set_metric(double ascent, double descent);
+
+void ffw_set_widths(int * width_list, int mapping_len,
+ int stretch_narrow, int squeeze_wide);
+
+////////////////////////
+// others
+// (ox,oy) is the position of the true origin, fractions related to em_size
+// also true for glyph_width
+void ffw_import_svg_glyph(int code, const char * filename, double ox, double oy, double glyph_width);
+void ffw_auto_hint(void);
+void ffw_override_fstype(void);
+
+#ifdef __cplusplus
+}
+}
+#endif
diff --git a/src/util/math.cc b/src/util/math.cc
new file mode 100644
index 0000000..1ddabce
--- /dev/null
+++ b/src/util/math.cc
@@ -0,0 +1,90 @@
+#include <cstring>
+#include <limits>
+#include <algorithm>
+
+#include "math.h"
+
+using std::min;
+using std::max;
+
+namespace pdf2htmlEX {
+
+// Apply the 2x3 affine matrix tm (a, b, c, d, e, f) to the point (x, y) in place.
+// With is_delta set only the linear part is applied, i.e. the translation
+// (tm[4], tm[5]) is skipped.
+void tm_transform(const double * tm, double & x, double & y, bool is_delta)
+{
+    const double in_x = x;
+    const double in_y = y;
+
+    x = tm[0] * in_x + tm[2] * in_y;
+    y = tm[1] * in_x + tm[3] * in_y;
+
+    if(is_delta)
+        return;
+
+    x += tm[4];
+    y += tm[5];
+}
+
+// In-place matrix product: tm_left = tm_left * tm_right, both 2x3 affine
+// matrices stored as (a, b, c, d, e, f).
+void tm_multiply(double * tm_left, const double * tm_right)
+{
+    // snapshot of the linear part, which is overwritten below
+    const double a = tm_left[0];
+    const double b = tm_left[1];
+    const double c = tm_left[2];
+    const double d = tm_left[3];
+
+    tm_left[0] = a * tm_right[0] + c * tm_right[1];
+    tm_left[1] = b * tm_right[0] + d * tm_right[1];
+    tm_left[2] = a * tm_right[2] + c * tm_right[3];
+    tm_left[3] = b * tm_right[2] + d * tm_right[3];
+    tm_left[4] += a * tm_right[4] + c * tm_right[5];
+    tm_left[5] += b * tm_right[4] + d * tm_right[5];
+}
+
+// Replace bbox (x1, y1, x2, y2) with the axis-aligned bounding box of its
+// four corners after transforming each of them by tm.
+void tm_transform_bbox(const double * tm, double * bbox)
+{
+    double & x1 = bbox[0];
+    double & y1 = bbox[1];
+    double & x2 = bbox[2];
+    double & y2 = bbox[3];
+
+    // the four corners, captured before the outputs are reset
+    double _[4][2];
+    _[0][0] = _[1][0] = x1;
+    _[0][1] = _[2][1] = y1;
+    _[2][0] = _[3][0] = x2;
+    _[1][1] = _[3][1] = y2;
+
+    x1 = y1 = std::numeric_limits<double>::max();
+    // lowest(), not min(): for floating-point types min() is the smallest
+    // POSITIVE value, which would win over any all-negative set of corners
+    x2 = y2 = std::numeric_limits<double>::lowest();
+    for(int i = 0; i < 4; ++i)
+    {
+        auto & x = _[i][0];
+        auto & y = _[i][1];
+        tm_transform(tm, x, y);
+        if(x < x1) x1 = x;
+        if(x > x2) x2 = x;
+        if(y < y1) y1 = y;
+        if(y > y2) y2 = y;
+    }
+}
+
+// Intersect two boxes given as (x0, y0, x1, y1); the corners of either box
+// may be in any order. Returns false, leaving result untouched, when the
+// overlap is empty or degenerate; otherwise writes the normalized overlap
+// to result (if non-null) and returns true.
+bool bbox_intersect(const double * bbox1, const double * bbox2, double * result)
+{
+    const double left  = std::max(std::min(bbox1[0], bbox1[2]), std::min(bbox2[0], bbox2[2]));
+    const double right = std::min(std::max(bbox1[0], bbox1[2]), std::max(bbox2[0], bbox2[2]));
+    if (!(left < right))
+        return false;
+
+    const double low  = std::max(std::min(bbox1[1], bbox1[3]), std::min(bbox2[1], bbox2[3]));
+    const double high = std::min(std::max(bbox1[1], bbox1[3]), std::max(bbox2[1], bbox2[3]));
+    if (!(low < high))
+        return false;
+
+    // everything was read above, so result may alias either input
+    if (result != nullptr)
+    {
+        result[0] = left;
+        result[1] = low;
+        result[2] = right;
+        result[3] = high;
+    }
+    return true;
+}
+
+} //namespace pdf2htmlEX
+
diff --git a/src/util/math.h b/src/util/math.h
new file mode 100644
index 0000000..8302a93
--- /dev/null
+++ b/src/util/math.h
@@ -0,0 +1,59 @@
+/*
+ * Math functions
+ *
+ * by WangLu
+ * 2012.11.29
+ */
+
+#ifndef MATH_H__
+#define MATH_H__
+
+#include <cmath>
+
+#include "const.h"
+
+namespace pdf2htmlEX {
+
+// Despite the name this does not round to an integer: it returns 0.0 when |x| <= EPS and x unchanged otherwise.
+static inline double round(double x) { return (std::abs(x) > EPS) ? x : 0.0; }
+// True when x and y differ by at most EPS.
+static inline bool equal(double x, double y) { return std::abs(x-y) <= EPS; }
+// True when x is greater than EPS, i.e. positive beyond the tolerance.
+static inline bool is_positive(double x) { return x > EPS; }
+// True when the first `size` entries of tm1 and tm2 are pairwise equal()
+// (within EPS). A non-positive size compares nothing and yields true.
+static inline bool tm_equal(const double * tm1, const double * tm2, int size = 6)
+{
+    int i = 0;
+    while((i < size) && equal(tm1[i], tm2[i]))
+        ++i;
+    return i >= size;
+}
+
+// Set the 6-element matrix tm to the identity transform (1, 0, 0, 1, 0, 0).
+static inline void tm_init(double * tm)
+{
+    static const double identity[6] = { 1, 0, 0, 1, 0, 0 };
+    for(int i = 0; i < 6; ++i)
+        tm[i] = identity[i];
+}
+
+// result = m1 * m2 for 2x3 affine matrices stored as (a, b, c, d, e, f).
+// result must not alias m1 or m2: entries of result are written while
+// entries of m1 and m2 are still being read.
+static inline void tm_multiply(double * result, const double * m1, const double * m2)
+{
+ result[0] = m1[0] * m2[0] + m1[2] * m2[1];
+ result[1] = m1[1] * m2[0] + m1[3] * m2[1];
+ result[2] = m1[0] * m2[2] + m1[2] * m2[3];
+ result[3] = m1[1] * m2[2] + m1[3] * m2[3];
+ // translation: m1's linear part applied to m2's translation, plus m1's own translation
+ result[4] = m1[0] * m2[4] + m1[2] * m2[5] + m1[4];
+ result[5] = m1[1] * m2[4] + m1[3] * m2[5] + m1[5];
+}
+
+// Euclidean length of (x, y), computed directly as sqrt(x*x + y*y).
+static inline double hypot(double x, double y) { return std::sqrt(x*x+y*y); }
+
+void tm_transform(const double * tm, double & x, double & y, bool is_delta = false);
+void tm_multiply(double * tm_left, const double * tm_right);
+void tm_transform_bbox(const double * tm, double * bbox);
+/**
+ * Calculate the intersection of 2 boxes.
+ * If they are intersecting, store the result to result (if not null) and return true.
+ * Otherwise return false, and result is not touched.
+ * Param result can be same as one of bbox1 and bbox2.
+ * Data in boxes are expected in the order of (x0, y0, x1, y1).
+ */
+bool bbox_intersect(const double * bbox1, const double * bbox2, double * result = nullptr);
+
+} //namespace pdf2htmlEX
+#endif //MATH_H__
diff --git a/src/util/mingw.cc b/src/util/mingw.cc
new file mode 100644
index 0000000..5d75be0
--- /dev/null
+++ b/src/util/mingw.cc
@@ -0,0 +1,64 @@
+/*
+ * Win32 specific functions
+ *
+ * by MarcSanfacon
+ * 2014.01.13
+ */
+
+#ifdef __MINGW32__
+
+#include <string>
+#include <iostream>
+#include <cstdio>
+#include <cstdlib>
+#include <limits.h>
+#include <libgen.h>
+
+#include "mingw.h"
+
+using namespace std;
+
+// Replacement for mkdtemp() under MinGW: fills in the template with
+// mktemp() and creates the directory with _mkdir().
+// Returns the generated name, or nullptr when temp is null, no name could
+// be generated, or the directory could not be created.
+char* mkdtemp(char* temp)
+{
+    if (temp == nullptr)
+        return nullptr;
+
+    char * name = mktemp(temp);
+    if (name == nullptr)
+        return nullptr;
+
+    return (_mkdir(temp) == 0) ? name : nullptr;
+}
+
+namespace pdf2htmlEX {
+// Return "<directory of dir>/data". When dirname() yields "." the current
+// working directory is used instead; if that cannot be determined the
+// result falls back to "./data".
+// NOTE: dir is passed to dirname(), which may modify its argument.
+string get_exec_dir(char *dir)
+{
+    // Under Windows, the default data_dir is under /data in the pdf2htmlEX directory
+    string s = dirname(dir);
+    if (s == ".") {
+        char* wd(getcwd(nullptr, PATH_MAX));
+        // getcwd() returns NULL on failure; assigning that to a string is undefined
+        if (wd != nullptr) {
+            s = wd;
+            free(wd);
+        }
+    }
+    s += "/data";
+    return s;
+}
+
+// Temp directory with a trailing '/': taken from the TMP environment
+// variable, then TEMP; "/" when neither is set.
+string get_tmp_dir()
+{
+    // Under Windows, the temp path is not under /tmp, find it.
+    const char * dir = getenv("TMP");
+    if (dir == nullptr)
+        dir = getenv("TEMP");
+
+    if (dir == nullptr)
+        return "/";
+    return string(dir) + "/";
+}
+
+} // namespace pdf2htmlEX;
+
+#endif //__MINGW32__
+
diff --git a/src/util/mingw.h b/src/util/mingw.h
new file mode 100644
index 0000000..89abf8a
--- /dev/null
+++ b/src/util/mingw.h
@@ -0,0 +1,29 @@
+/*
+ * Win32 specific functions
+ *
+ * by MarcSanfacon
+ * 2014.01.13
+ */
+
+#ifndef MINGW_H__
+#define MINGW_H__
+
+#ifdef __MINGW32__
+
+// std::string appears in the declarations below; include it here so the
+// header does not depend on what the including file pulled in earlier
+#include <string>
+
+#include <io.h>
+
+// Defined in mingw.cc: creates a directory from a mktemp()-style template
+char *mkdtemp(char *temp);
+
+#include <direct.h>
+// Map the POSIX spellings used by the rest of the code onto the MinGW ones
+#define mkdir(A, B) _mkdir(A)
+#define stat _stat
+
+namespace pdf2htmlEX {
+    // "<directory of dir>/data", see mingw.cc
+    std::string get_exec_dir(char *dir);
+    // temp directory with a trailing '/', see mingw.cc
+    std::string get_tmp_dir();
+} // namespace pdf2htmlEX
+
+#endif //__MINGW32__
+
+#endif //MINGW_H__
+
diff --git a/src/util/misc.cc b/src/util/misc.cc
new file mode 100644
index 0000000..e2572c0
--- /dev/null
+++ b/src/util/misc.cc
@@ -0,0 +1,66 @@
+/*
+ * Misc functions
+ *
+ *
+ * by WangLu
+ * 2012.08.10
+ */
+
+#include <map>
+
+#include "misc.h"
+
+using std::cerr;
+using std::endl;
+using std::string;
+using std::map;
+using std::ostream;
+
+namespace pdf2htmlEX {
+
+// One axis of css_fix_rectangle_border_width(): convert the span [lo, hi]
+// whose edges run through the middle of a border into a CSS start position,
+// content length and border width.
+static void css_fix_border_one_axis(double lo, double hi, double border_width,
+                                    double & pos, double & len, double & fixed_border_width)
+{
+    len = hi - lo;
+    if(len > border_width)
+    {
+        len -= border_width;
+        fixed_border_width = border_width;
+    }
+    else
+    {
+        // span not larger than the border: no content, the border absorbs half of the span
+        fixed_border_width = border_width + len/2;
+        len = 0;
+    }
+    pos = lo - border_width / 2;
+}
+
+void css_fix_rectangle_border_width(double x1, double y1,
+        double x2, double y2,
+        double border_width,
+        double & x, double & y, double & w, double & h,
+        double & border_top_bottom_width,
+        double & border_left_right_width)
+{
+    // horizontal extent gives x, w and the left/right border width
+    css_fix_border_one_axis(x1, x2, border_width, x, w, border_left_right_width);
+    // vertical extent gives y, h and the top/bottom border width
+    css_fix_border_one_axis(y1, y2, border_width, y, h, border_top_bottom_width);
+}
+
+// Print rgb as a CSS colour, "rgb(r,g,b)", with decimal byte components.
+// The stream's format flags are restored before returning.
+ostream & operator << (ostream & out, const GfxRGB & rgb)
+{
+    const auto saved_flags = out.flags();
+
+    out << std::dec;
+    out << "rgb(" << (int)colToByte(rgb.r);
+    out << "," << (int)colToByte(rgb.g);
+    out << "," << (int)colToByte(rgb.b) << ")";
+
+    out.flags(saved_flags);
+    return out;
+}
+
+} // namespace pdf2htmlEX
diff --git a/src/util/misc.h b/src/util/misc.h
new file mode 100644
index 0000000..9032e4e
--- /dev/null
+++ b/src/util/misc.h
@@ -0,0 +1,39 @@
+/*
+ * Help classes and Functions
+ *
+ * by WangLu
+ * 2012.08.10
+ */
+
+
+#ifndef UTIL_H__
+#define UTIL_H__
+
+#include <iostream>
+
+#include <GfxState.h>
+
+#include "util/const.h"
+
+namespace pdf2htmlEX {
+
+// Combine both fields of a Ref into one integer key: num is shifted left
+// by the bit width of gen, and gen is OR-ed into the result.
+static inline long long hash_ref(const Ref * id)
+{
+    const long long shifted_num = ((long long)(id->num)) << (sizeof(id->gen)*8);
+    return shifted_num | (id->gen);
+}
+
+/*
+ * In PDF, edges of the rectangle are in the middle of the borders
+ * In HTML, edges are completely outside the rectangle
+ */
+void css_fix_rectangle_border_width(double x1, double y1, double x2, double y2,
+ double border_width,
+ double & x, double & y, double & w, double & h,
+ double & border_top_bottom_width,
+ double & border_left_right_width);
+
+std::ostream & operator << (std::ostream & out, const GfxRGB & rgb);
+
+} // namespace pdf2htmlEX
+
+#endif //UTIL_H__
diff --git a/src/util/namespace.h b/src/util/namespace.h
new file mode 100644
index 0000000..46dcd0f
--- /dev/null
+++ b/src/util/namespace.h
@@ -0,0 +1,21 @@
+/*
+ * namespace.h
+ *
+ * specifying common used namespace
+ *
+ * by WangLu
+ */
+
+#ifndef NAMESPACE_H__
+#define NAMESPACE_H__
+
+using std::hex;
+using std::dec;
+using std::string;
+using std::endl;
+using std::make_pair;
+using std::ifstream;
+using std::ofstream;
+
+#endif // NAMESPACE_H__
+
diff --git a/src/util/path.cc b/src/util/path.cc
new file mode 100644
index 0000000..5abc7a5
--- /dev/null
+++ b/src/util/path.cc
@@ -0,0 +1,141 @@
+/*
+ * Functions manipulating filenames and paths
+ *
+ * by WangLu
+ * 2012.11.29
+ */
+
+#include <errno.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <cstring>
+
+#include "path.h"
+
+#ifdef __MINGW32__
+#include "util/mingw.h"
+#endif
+
+using std::string;
+
+namespace pdf2htmlEX {
+
+// Create path and all missing parent directories (mode S_IRWXU).
+// An empty path is a no-op and an already existing directory is accepted.
+// Throws a std::string message when a component cannot be created.
+void create_directories(const string & path)
+{
+    if(path.empty()) return;
+
+    // everything before the last '/' has to exist first
+    const size_t slash = path.rfind('/');
+    if(slash != string::npos)
+        create_directories(path.substr(0, slash));
+
+    if(mkdir(path.c_str(), S_IRWXU) == 0)
+        return;
+
+    if(errno == EEXIST)
+    {
+        // something is already there: only a directory counts as success
+        struct stat info;
+        if((stat(path.c_str(), &info) == 0) && S_ISDIR(info.st_mode))
+            return;
+    }
+
+    throw string("Cannot create directory: ") + path;
+}
+
+// Keep the first "%<digits>d" specifier in filename and turn every other
+// '%' into "%%". filename is rewritten only when such a specifier exists;
+// the return value says whether one was found.
+bool sanitize_filename(string & filename)
+{
+    string out;
+    bool have_format = false;
+    const size_t len = filename.size();
+
+    for(size_t pos = 0; pos < len; pos++)
+    {
+        const char ch = filename[pos];
+        if(ch != '%')
+        {
+            out.push_back(ch);
+            continue;
+        }
+
+        if(!have_format)
+        {
+            // Candidate specifier: '%', a run of digits, and the first non-digit after it
+            size_t scan = pos;
+            string candidate(1, '%');
+            while(++scan < len)
+            {
+                candidate.push_back(filename[scan]);
+                if(!strchr("0123456789", filename[scan]))
+                    break;
+            }
+
+            if('d' == candidate[candidate.size()-1])
+            {
+                // valid integer format: copy it and resume after the 'd'
+                out.append(candidate);
+                have_format = true;
+                pos = scan;
+                continue;
+            }
+        }
+
+        // either a specifier was already taken or this one is not valid:
+        // protect the '%' and carry on with the character after it
+        out.append("%%");
+    }
+
+    // Only sanitize if it is a valid format.
+    if(have_format)
+        filename.assign(out);
+
+    return have_format;
+}
+
+// True for the suffixes ".ttf", ".ttc" and ".otf" (exact, case-sensitive match).
+bool is_truetype_suffix(const string & suffix)
+{
+    static const char * const known[] = { ".ttf", ".ttc", ".otf" };
+    for(const char * candidate : known)
+    {
+        if(suffix == candidate)
+            return true;
+    }
+    return false;
+}
+
+// Part of path after the last '/': the whole path when it contains no
+// '/', and an empty string when the path ends with '/'.
+string get_filename (const string & path)
+{
+    const size_t slash = path.rfind('/');
+    if(slash == string::npos)
+        return path;
+    // for a trailing '/', slash + 1 == size() and substr() yields ""
+    return path.substr(slash + 1);
+}
+
+// Lower-cased suffix of the file name in path, including the leading '.'
+// (e.g. "dir/Font.TTF" gives ".ttf"); empty when the file name has no '.'.
+string get_suffix(const string & path)
+{
+    string fn = get_filename(path);
+    size_t idx = fn.rfind('.');
+    if(idx == string::npos)
+        return "";
+    else
+    {
+        string s = fn.substr(idx);
+        for(auto & c : s)
+            // tolower() is undefined for negative values, which plain char
+            // takes for non-ASCII bytes, so go through unsigned char
+            c = (char)tolower((unsigned char)c);
+        return s;
+    }
+}
+
+
+} //namespace pdf2htmlEX
diff --git a/src/util/path.h b/src/util/path.h
new file mode 100644
index 0000000..2a2a685
--- /dev/null
+++ b/src/util/path.h
@@ -0,0 +1,33 @@
+/*
+ * Function handling filenames and paths
+ *
+ * by WangLu
+ * 2012.11.29
+ */
+
+#ifndef PATH_H__
+#define PATH_H__
+
+#include <string>
+
+namespace pdf2htmlEX {
+
+void create_directories(const std::string & path);
+
+bool is_truetype_suffix(const std::string & suffix);
+
+std::string get_filename(const std::string & path);
+std::string get_suffix(const std::string & path);
+
+/**
+ * Sanitize all occurrences of '%' except for the first valid format specifier. Filename
+ * is only sanitized if a formatter is found, and the function returns true.
+ *
+ * @param filename the filename to be sanitized. Value will be modified.
+ *
+ * @return true if a format specifier was found, false otherwise.
+ */
+bool sanitize_filename(std::string & filename);
+
+} //namespace pdf2htmlEX
+#endif //PATH_H__
diff --git a/src/util/unicode.cc b/src/util/unicode.cc
new file mode 100644
index 0000000..4a2a034
--- /dev/null
+++ b/src/util/unicode.cc
@@ -0,0 +1,70 @@
+/*
+ * Unicode manipulation functions
+ *
+ * Copyright (C) 2012-2014 Lu Wang <coolwanglu@gmail.com>
+ */
+
+#include <iostream>
+
+#include <GlobalParams.h>
+
+#include "pdf2htmlEX-config.h"
+
+#include "unicode.h"
+
+namespace pdf2htmlEX {
+
+using std::cerr;
+using std::endl;
+using std::ostream;
+
+// Map a character code into the Unicode private use ranges: first
+// code + 0xE000; results above 0xF8FF continue from 0xF0000, and results
+// above 0xFFFFD continue from 0x100000. A warning is printed once the
+// result exceeds 0x10FFFD; the value is returned regardless.
+Unicode map_to_private(CharCode code)
+{
+    Unicode mapped = (Unicode)(code + 0xE000);
+    if(mapped <= 0xF8FF)
+        return mapped;
+
+    mapped = (Unicode)((mapped - 0xF8FF) + 0xF0000);
+    if(mapped <= 0xFFFFD)
+        return mapped;
+
+    mapped = (Unicode)((mapped - 0xFFFFD) + 0x100000);
+    if(mapped > 0x10FFFD)
+    {
+        cerr << "Warning: all private use unicode are used" << endl;
+    }
+    return mapped;
+}
+
+// Derive a Unicode value for code from the font itself: for a non-CID font
+// the glyph name is looked up and mapped through globalParams; whenever
+// that does not give a legal value, the private-use mapping is returned.
+Unicode unicode_from_font (CharCode code, GfxFont * font)
+{
+    if(font->isCIDFont())
+        return map_to_private(code);
+
+    char * glyph_name = dynamic_cast<Gfx8BitFont*>(font)->getCharName(code);
+    // may be untranslated ligature
+    if(glyph_name)
+    {
+        Unicode from_name = globalParams->mapNameToUnicodeText(glyph_name);
+        if(!is_illegal_unicode(from_name))
+            return from_name;
+    }
+
+    return map_to_private(code);
+}
+
+// Pick the Unicode value to emit for code:
+//  - no value supplied (len == 0): private-use mapping
+//  - exactly one legal value: that value
+//  - anything else: whatever unicode_from_font() derives from the font
+Unicode check_unicode(Unicode * u, int len, CharCode code, GfxFont * font)
+{
+    if(len == 0)
+        return map_to_private(code);
+
+    if((len == 1) && !is_illegal_unicode(*u))
+        return *u;
+
+    return unicode_from_font(code, font);
+}
+
+} //namespace pdf2htmlEX
diff --git a/src/util/unicode.h b/src/util/unicode.h
new file mode 100644
index 0000000..2100695
--- /dev/null
+++ b/src/util/unicode.h
@@ -0,0 +1,84 @@
+/*
+ * Unicode manipulation functions
+ *
+ * by WangLu
+ * 2012.11.29
+ */
+
+#ifndef UNICODE_H__
+#define UNICODE_H__
+
+#include <GfxFont.h>
+#include <CharTypes.h>
+
+namespace pdf2htmlEX {
+
+/**
+ * Check whether a unicode character is illegal for the output HTML.
+ * Unlike PDF readers, browsers have special treatment for such characters (normally treated as
+ * zero-width space), regardless of metrics and glyphs provided by fonts. So these characters
+ * should be mapped to unicode private area to "cheat" browsers, at the cost of losing actual
+ * unicode values in the HTML.
+ *
+ * The following chart shows illegal characters in HTML by webkit, mozilla, and pdf2htmlEX (p2h).
+ * pdf2htmlEX's illegal character set is the union of webkit's and mozilla's, plus illegal unicode
+ * characters. "[" and ")" surrounding ranges denote "inclusive" and "exclusive", respectively.
+ *
+ * 00(NUL)--09(\t)--0A(\n)--0D(\r)--20(SP)--7F(DEL)--9F(APC)--A0(NBSP)--AD(SHY)--061C(ALM)--1361(Ethiopic word space)
+ * webkit: [--------------------------------) [------------------) [-]
+ * moz: [--------------------------------) [---------] [-]
+ * p2h: [--------------------------------) [------------------] [-] [-] [-]
+ *
+ * 200B(ZWSP)--200C(ZWNJ)--200D(ZWJ)--200E(LRM)--200F(RLM)--2028(LSEP)--2029(PSEP)--202A(LRE)--202E(RLO)--2066(LRI)--2069(PDI)
+ * webkit: [-----------------------------------------------] [----------]
+ * moz: [-] [----------] [-] [-] [----------] [------------]
+ * p2h: [-----------------------------------------------] [-] [-] [----------] [------------]
+ *
+ * D800(surrogate)--DFFF(surrogate)--FEFF(ZWNBSP)--FFFC(ORC)--FFFE(non-char)--FFFF(non-char)
+ * webkit: [-] [-]
+ * moz:
+ * p2h: [------------------] [-] [-] [-----------------]
+ *
+ * Note: 0xA0 (no-break space) affects word-spacing; and if "white-space:pre" is specified,
+ * \n and \r can break line, \t can shift text, so they are considered illegal.
+ *
+ * Resources (retrieved at 2015-03-16)
+ * * webkit
+ * * Avoid querying the font cache for the zero-width space glyph ( https://bugs.webkit.org/show_bug.cgi?id=90673 )
+ * * treatAsZeroWidthSpace( https://github.com/WebKit/webkit/blob/17bbff7400393e9389b40cc84ce005f7cc954680/Source/WebCore/platform/graphics/FontCascade.h#L272 )
+ * * mozilla
+ * * IsInvalidChar( http://mxr.mozilla.org/mozilla-central/source/gfx/thebes/gfxTextRun.cpp#1973 )
+ * * IsBidiControl( http://mxr.mozilla.org/mozilla-central/source/intl/unicharutil/util/nsBidiUtils.h#114 )
+ * * Character encodings in HTML ( http://en.wikipedia.org/wiki/Character_encodings_in_HTML#HTML_character_references )
+ * * CSS Text Spec ( http://dev.w3.org/csswg/css-text/ )
+ * * unicode table ( http://unicode-table.com )
+ *
+ * TODO Web specs? IE?
+ *
+ */
+inline bool is_illegal_unicode(Unicode c)
+{
+    // Isolated code points that are rejected on their own.
+    switch(c)
+    {
+        case 0xAD:   // SHY
+        case 0x061C: // ALM
+        case 0x1361: // Ethiopic word space
+        case 0x2028: // LSEP
+        case 0x2029: // PSEP
+        case 0xFEFF: // ZWNBSP
+        case 0xFFFC: // ORC
+        case 0xFFFE: // non-character
+        case 0xFFFF: // non-character
+            return true;
+        default:
+            break;
+    }
+
+    // Everything below SP, including \t, \n and \r.
+    if(c < 0x20)
+        return true;
+
+    // DEL up to and including NBSP.
+    if(c >= 0x7F && c <= 0xA0)
+        return true;
+
+    // ZWSP .. RLM
+    if(c >= 0x200B && c <= 0x200F)
+        return true;
+
+    // LRE .. RLO
+    if(c >= 0x202A && c <= 0x202E)
+        return true;
+
+    // LRI .. PDI
+    if(c >= 0x2066 && c <= 0x2069)
+        return true;
+
+    // Surrogates are the only remaining illegal range.
+    return (c >= 0xD800 && c <= 0xDFFF);
+}
+
+/*
+ * Map a character code into the Unicode private use areas.
+ * The code is offset from U+E000; values running past U+F8FF continue in the
+ * 0xF0000.. range, and past U+FFFFD in the 0x100000.. range. If U+10FFFD is
+ * exceeded as well, a warning is printed to stderr and the out-of-range value
+ * is still returned.
+ */
+Unicode map_to_private(CharCode code);
+
+/*
+ * Try to determine the Unicode value directly from the information in the font.
+ * Falls back to map_to_private(code) when the font yields no usable value.
+ */
+Unicode unicode_from_font (CharCode code, GfxFont * font);
+
+/*
+ * We have to use a single Unicode value to re-encode fonts.
+ * If we got multiple Unicode values, it might be an expanded ligature; try to restore it.
+ * If we cannot figure it out in the end, use a private mapping.
+ *
+ * u, len: the Unicode values available for the character (len may be 0)
+ * code:   the character code, used for the font lookup and the private mapping
+ * font:   the font the character belongs to
+ *
+ * With len == 0 the private mapping is returned directly; with len == 1 the
+ * value *u is returned if it is legal for HTML output; in all other cases the
+ * result of unicode_from_font(code, font) is returned.
+ */
+Unicode check_unicode(Unicode * u, int len, CharCode code, GfxFont * font);
+
+
+} // namespace pdf2htmlEX
+
+#endif //UNICODE_H__