Import pdf2htmlex_0.14.6+ds.orig.tar.gz

[dgit import orig pdf2htmlex_0.14.6+ds.orig.tar.gz]
author: Johannes Schauer <josch@debian.org> 2015-07-27 16:07:02 +0200
committer: Johannes Schauer <josch@debian.org> 2015-07-27 16:07:02 +0200
commit: 385b4eca34c290f112d90e74925ba1963a4e0a94 (patch)
tree: 5b23566049318adbdd0d26c82735fa9b4072aae5 /src
60 files changed, 9571 insertions, 0 deletions
diff --git a/src/ArgParser.cc b/src/ArgParser.cc
new file mode 100644
index 0000000..19dcf32
--- /dev/null
+++ b/src/ArgParser.cc
@@ -0,0 +1,176 @@
+/*
+ * A wrapper of getopt
+ *
+ * by WangLu
+ * 2012.09.10
+ */
+
+#include <iostream>
+#include <unordered_map>
+#include <cassert>
+
+#include <getopt.h>
+
+#include "ArgParser.h"
+
+namespace pdf2htmlEX {
+
+using std::ostream;
+using std::cerr;
+using std::endl;
+using std::string;
+using std::vector;
+using std::unordered_map;
+using std::make_pair;
+using std::ostringstream;
+
+bool read_value(const char * arg, char * location) // char overload: arg must be exactly one character long
+{
+    *location = arg[0]; // stored even when the function reports failure
+    return (arg[0] != 0) && (arg[1] == 0); // check arg[0] first: for "" the byte at arg[1] lies past the terminator
+}
+
+bool read_value(const char * arg, std::string * location) // string overload: any text is accepted as-is
+{
+    location->assign(arg);
+    return true;
+}
+
+void dump_value(std::ostream & out, const std::string & v) // string overload: writes v wrapped in double quotes
+{
+    out << '"' << v << '"';
+}
+
+ArgParser & ArgParser::add(const char * optname, const char * description, ArgParserCallBack callback, bool need_arg)
+{
+    // ArgEntry must never be handed nullptr for its name or description, so both are normalised here.
+    if(optname && optname[0])
+    {
+        const char * usage_text = (description ? description : "");
+        arg_entries.emplace_back(new ArgEntry<string, string>(optname, usage_text, callback, need_arg));
+    }
+    else
+    {
+        // A null or empty optname goes to optional_arg_entries; its description is dropped.
+        optional_arg_entries.emplace_back(new ArgEntry<string, string>("", "", callback, need_arg));
+    }
+    return *this;
+}
+
+void ArgParser::parse(int argc, char ** argv) const
+{
+    //prepare optstring and longopts
+    vector<char> optstring;
+    optstring.reserve(2*arg_entries.size() + 1);
+    vector<struct option> longopts;
+    longopts.reserve(arg_entries.size() + 1);
+
+    unordered_map<int, const ArgEntryBase*> opt_map;
+
+    for(auto iter = arg_entries.begin(); iter != arg_entries.end(); ++iter)
+    {
+        const auto * p = iter->get();
+        if(p->shortname != 0)
+        {
+            optstring.push_back(p->shortname);
+            if(p->need_arg)
+                optstring.push_back(':');
+
+            int v = p->shortname;
+            if(!(opt_map.insert(make_pair(v, p)).second))
+            {
+                cerr << "Warning: duplicated shortname: " << v << endl;
+            }
+        }
+
+        if(p->name != "")
+        {
+            int v = (256 + (iter - arg_entries.begin()));
+            longopts.resize(longopts.size() + 1);
+            {
+                auto & cur = longopts.back();
+                cur.name = p->name.c_str();
+                cur.has_arg = ((p->need_arg) ? required_argument : no_argument);
+                cur.flag = nullptr;
+                cur.val = v;
+            }
+            if(!(opt_map.insert(make_pair(v, p)).second))
+            {
+                cerr << "Warning: duplicated long name: " << (p->name) << endl;
+            }
+        }
+    }
+
+    optstring.push_back(0);
+    longopts.resize(longopts.size() + 1);
+    {
+        auto & cur = longopts.back();
+        cur.name = 0;
+        cur.has_arg = 0;
+        cur.flag = 0;
+        cur.val = 0;
+    }
+
+    {
+        opterr = 1;
+        int r;
+        int idx;
+        while(true)
+        {
+            r = getopt_long(argc, argv, &optstring.front(), &longopts.front(), &idx); 
+            if(r == -1)
+                break;
+            assert(r != ':');
+            if(r == '?')
+            {
+                throw "";
+            }    
+
+            auto iter = opt_map.find(r);
+            assert(iter != opt_map.end());
+            iter->second->parse(optarg);
+        }
+    }
+
+    {
+        auto iter = optional_arg_entries.begin();
+        while((optind < argc) && (iter != optional_arg_entries.end())) 
+        {
+            (*(iter++))->parse(argv[optind++]);
+        }
+    }
+}
+
+void ArgParser::show_usage(ostream & out) const
+{
+    // Only the entries in arg_entries are listed; optional_arg_entries is not visited.
+    // Each entry formats its own usage line.
+    for(auto it = arg_entries.begin(); it != arg_entries.end(); ++it)
+        (*it)->show_usage(out);
+}
+
+template<> const char * ArgParser::get_type_name<int>    (void) { return "int";    } // labels printed as " <label>" by ArgEntry::show_usage
+template<> const char * ArgParser::get_type_name<double> (void) { return "fp";     }
+template<> const char * ArgParser::get_type_name<string> (void) { return "string"; } // any other T falls back to the primary template
+
+ArgParser::ArgEntryBase::ArgEntryBase(const char * name, const char * description, bool need_arg)
+    : shortname(0), name(name), description(description), need_arg(need_arg)
+{
+    // Split an optional ",<char>" suffix off the option name, e.g. "help,h" -> name "help", shortname 'h'.
+    const size_t comma = this->name.rfind(',');
+    if(comma == string::npos)
+        return;
+
+    if(comma + 2 != this->name.size())
+    {
+        // Anything but exactly one character after the last comma: warn and leave the name untouched.
+        cerr << "Warning: argument '" << this->name << "' cannot be parsed as a short option" << endl;
+        return;
+    }
+
+    shortname = this->name.back();
+    this->name.erase(comma);
+}
+
+const int ArgParser::arg_col_width = 31;
+
+} // namespace pdf2htmlEX
diff --git a/src/ArgParser.h b/src/ArgParser.h
new file mode 100644
index 0000000..c0f8cde
--- /dev/null
+++ b/src/ArgParser.h
@@ -0,0 +1,219 @@
+/*
+ * A wrapper of getopt
+ *
+ * by WangLu
+ * 2012.09.10
+ */
+
+
+#ifndef ARGPARSER_H__
+#define ARGPARSER_H__
+
+#include <string>
+#include <vector>
+#include <ostream>
+#include <sstream>
+#include <memory>
+
+#ifndef nullptr
+#define nullptr (NULL)
+#endif
+
+namespace pdf2htmlEX {
+
+// Generic helper: true only if extracting a T from arg succeeds and the stream has then reached end-of-input.
+template<class T>
+bool read_value(const char * arg, T * location)
+{
+    std::istringstream parser(arg);
+    return !(parser >> (*location)).fail() && parser.eof();
+}
+
+extern bool read_value(const char * arg, char * location);
+extern bool read_value(const char * arg, std::string * location);
+
+template<class T> // generic form: writes v with T's own operator<<
+void dump_value(std::ostream & out, const T & v)
+{
+    out << v;
+}
+
+extern void dump_value(std::ostream & out, const std::string & v);
+
+class ArgParser
+{
+public:
+    typedef void (*ArgParserCallBack) (const char * arg);
+
+    /*
+     * Two ways to register an argument: the 1st overload invokes a callback (typically for flags),
+     * the 2nd stores the parsed value into a variable.
+     *
+     * optname:
+     *  - if not nullptr or "", the option name in the format "<long name>[,<short char>]", e.g. "help,h"
+     *  - if nullptr or "", the entry is positional: parse() feeds it the words left after the options, in registration order; description is ignored
+     * description:
+     *  - if description is nullptr or "", the argument won't be shown in show_usage()
+     *
+     * location:
+     *  - if not nullptr, it is set to default_value at registration and later receives the parsed value
+     *  - if nullptr, the option takes no value
+     */
+    ArgParser & add(const char * optname, const char * description, ArgParserCallBack callback, bool need_arg = false);
+    template <class T, class Tv>
+    ArgParser & add(const char * optname, T * location, const Tv & default_value, const char * description, bool dont_show_default = false);
+
+    void parse(int argc, char ** argv) const; // runs getopt_long() over argv and dispatches to the registered entries
+    void show_usage(std::ostream & out) const; // one line per named entry that has a description
+
+private:
+    // type names helper: label used for T in usage lines; specialised for int, double and std::string
+    template<class> 
+    static const char * get_type_name(void) { return "unknown"; }
+
+    struct ArgEntryBase
+    {
+        /* name or description cannot be nullptr */
+        ArgEntryBase(const char * name, const char * description, bool need_arg);
+        virtual ~ArgEntryBase() { }
+        char shortname; // 0 when the entry has no short option
+        std::string name; // long name, with any ",<short char>" suffix already removed
+        std::string description;
+        bool need_arg; // whether the option takes a value
+        virtual void parse (const char * arg) const = 0;
+        virtual void show_usage (std::ostream & out) const = 0;
+    };
+
+    template <class T, class Tv>
+    struct ArgEntry : public ArgEntryBase
+    {
+        ArgEntry(const char * name, 
+                const char * description,
+                ArgParserCallBack callback,
+                bool need_arg);
+
+        ArgEntry(const char * name, 
+                T * location, const Tv & default_value, 
+                const char * description, bool dont_show_default);
+
+        virtual void parse (const char * arg) const;
+        virtual void show_usage (std::ostream & out) const;
+
+    private:
+        T * location; // nullptr for callback entries
+        T default_value;
+        ArgParserCallBack callback; // nullptr for entries bound to a variable
+        bool dont_show_default;
+    };
+
+    std::vector<std::unique_ptr<ArgEntryBase>> arg_entries, optional_arg_entries; // named options / positional arguments
+    static const int arg_col_width; // width the option column of a usage line is padded to
+};
+
+template<class T, class Tv>
+ArgParser & ArgParser::add(const char * optname, T * location, const Tv & default_value, const char * description, bool dont_show_default)
+{
+    // ArgEntry must never be handed nullptr for its name or description, so both are normalised here.
+    if(optname && optname[0])
+    {
+        const char * usage_text = (description ? description : "");
+        arg_entries.emplace_back(new ArgEntry<T, Tv>(optname, location, default_value, usage_text, dont_show_default));
+    }
+    else
+    {
+        // A null or empty optname goes to optional_arg_entries; its description is dropped.
+        optional_arg_entries.emplace_back(new ArgEntry<T, Tv>("", location, default_value, "", dont_show_default));
+    }
+    return *this;
+}
+
+// Known types
+template<> const char * ArgParser::get_type_name<int>         (void);
+template<> const char * ArgParser::get_type_name<double>      (void);
+template<> const char * ArgParser::get_type_name<std::string> (void);
+
+template<class T, class Tv> // callback flavour: no storage location, and the default is never shown in usage
+ArgParser::ArgEntry<T, Tv>::ArgEntry(const char * name, const char * description, ArgParserCallBack callback, bool need_arg)
+    : ArgEntryBase(name, description, need_arg)
+    , location(nullptr)
+    , default_value()
+    , callback(callback)
+    , dont_show_default(true)
+{
+}
+    
+template<class T, class Tv> // variable flavour: the option takes a value exactly when location is non-null
+ArgParser::ArgEntry<T, Tv>::ArgEntry(const char * name, T * location, const Tv & default_value, const char * description, bool dont_show_default)
+    : ArgEntryBase(name, description, (location != nullptr))
+    , location(location)
+    , default_value(default_value)
+    , callback(nullptr)
+    , dont_show_default(dont_show_default)
+{ 
+    if(need_arg) // here need_arg is (location != nullptr), so the write below is safe
+        *location = T(default_value); // the target holds the default until parse() overwrites it
+}
+
+template<class T, class Tv>
+void ArgParser::ArgEntry<T, Tv>::parse(const char * arg) const
+{
+    // An option that takes a value must receive one, and it must convert cleanly into *location.
+    if(need_arg && !arg)
+        throw std::string("Missing argument of option: --") + name;
+
+    const bool has_target = need_arg && (location != nullptr);
+    if(has_target && !read_value(arg, location))
+        throw std::string("Invalid argument: ") + arg;
+
+    // The callback, when present, always fires; arg may be null for options that take no value.
+    if(callback != nullptr)
+        (*callback)(arg);
+}
+
+template<class T, class Tv>
+void ArgParser::ArgEntry<T, Tv>::show_usage(std::ostream & out) const
+{
+    // Entries without a description are hidden from the usage text.
+    if(description.empty())
+        return;
+
+    // Left column, e.g. "  -s,--long <type>"; it is assembled in a separate
+    // stream so that its width is known before padding.
+    const bool has_short = (shortname != 0);
+    const bool has_long = !name.empty();
+
+    std::ostringstream column;
+    column << "  ";
+    if(has_short)
+        column << "-" << shortname;
+    if(has_short && has_long)
+        column << ",";
+    if(has_long)
+        column << "--" << name;
+    if(need_arg)
+        column << " <" << get_type_name<T>() << ">";
+
+    const std::string left = column.str();
+    out << left;
+
+    // Pad the left column up to arg_col_width; a longer column gets no padding.
+    int pad = arg_col_width - static_cast<int>(left.size());
+    while(pad-- > 0)
+        out << ' ';
+
+    out << " " << description;
+
+    // The default is shown only for value-taking options that did not opt out.
+    if(need_arg && !dont_show_default)
+    {
+        out << " (default: ";
+        dump_value(out, default_value);
+        out << ")";
+    }
+
+    out << std::endl;
+}
+
+} // namespace pdf2htmlEX
+
+#endif //ARGPARSER_H__
diff --git a/src/BackgroundRenderer/BackgroundRenderer.cc b/src/BackgroundRenderer/BackgroundRenderer.cc
new file mode 100644
index 0000000..dbd7137
--- /dev/null
+++ b/src/BackgroundRenderer/BackgroundRenderer.cc
@@ -0,0 +1,130 @@
+/*
+ * Background renderer
+ * Render all those things not supported as Image
+ *
+ * Copyright (C) 2013 Lu Wang <coolwanglu@gmail.com>
+ */
+
+#include <poppler-config.h>
+
+#include "HTMLRenderer/HTMLRenderer.h"
+#include "Param.h"
+
+#include "BackgroundRenderer.h"
+#include "SplashBackgroundRenderer.h"
+#if ENABLE_SVG
+#include "CairoBackgroundRenderer.h"
+#endif
+
+namespace pdf2htmlEX {
+
+std::unique_ptr<BackgroundRenderer> BackgroundRenderer::getBackgroundRenderer(const std::string & format, HTMLRenderer * html_renderer, const Param & param) // factory: format name -> renderer
+{
+#ifdef ENABLE_LIBPNG
+    if(format == "png") // only recognised when built with ENABLE_LIBPNG
+    {
+        return std::unique_ptr<BackgroundRenderer>(new SplashBackgroundRenderer(format, html_renderer, param));
+    }
+#endif
+#ifdef ENABLE_LIBJPEG
+    if(format == "jpg") // only recognised when built with ENABLE_LIBJPEG
+    {
+        return std::unique_ptr<BackgroundRenderer>(new SplashBackgroundRenderer(format, html_renderer, param));
+    }
+#endif
+#if ENABLE_SVG
+    if (format == "svg") // only recognised when ENABLE_SVG is non-zero
+    {
+        return std::unique_ptr<BackgroundRenderer>(new CairoBackgroundRenderer(html_renderer, param));
+    }
+#endif
+
+    return nullptr; // unknown format, or support for it was compiled out
+}
+
+std::unique_ptr<BackgroundRenderer> BackgroundRenderer::getFallbackBackgroundRenderer(HTMLRenderer * html_renderer, const Param & param)
+{
+    if (param.bg_format != "svg" || param.svg_node_count_limit < 0)
+        return nullptr; // a fallback is only provided for svg backgrounds with a non-negative node count limit
+    return std::unique_ptr<BackgroundRenderer>(new SplashBackgroundRenderer("", html_renderer, param));
+}
+
+void BackgroundRenderer::proof_begin_text_object(GfxState *state, OutputDev * dev)
+{
+    if (!proof_state) // created lazily on first use, with the page geometry and resolution of the given state
+    {
+        PDFRectangle rect(0, 0, state->getPageWidth(), state->getPageHeight());
+        proof_state.reset(new GfxState(state->getHDPI(), state->getVDPI(), &rect, state->getRotate(), dev->upsideDown()));
+        proof_state->setFillColorSpace(new GfxDeviceRGBColorSpace());
+        proof_state->setStrokeColorSpace(new GfxDeviceRGBColorSpace());
+    }
+
+    // Save the original render mode in proof_state; it is restored in proof_end_text_object().
+    // This is needed because poppler's OutputDev::updateRender() actually has no effect, so we have to
+    // modify state directly, see proof_begin_string().
+    proof_state->setRender(state->getRender());
+}
+
+void BackgroundRenderer::proof_begin_string(GfxState *state, OutputDev * dev)
+{
+    int render = proof_state->getRender(); // the mode saved by proof_begin_text_object() / proof_update_render()
+    if (render == 3) // hidden
+        return;
+
+    double lx = state->getFontSize() / 70, ly = lx; // proof line width starts from 1/70 of the font size
+    tm_transform(state->getTextMat(), lx, ly, true); // NOTE(review): tm_transform is defined elsewhere; presumably maps (lx, ly) through the text matrix
+    proof_state->setLineWidth(sqrt(lx * lx + ly * ly));
+
+    static const Color red(1, 0, 0), green(0, 1, 0), blue(0, 0, 1), yellow(1, 1, 0), white(1, 1, 1);
+    Color fc, sc; // the real fill / stroke colors of the text
+    const Color *pfc, *psc; // the proof colors chosen below
+    state->getFillRGB(&fc.rgb);
+    state->getStrokeRGB(&sc.rgb);
+
+    if (render == 0 || render == 2) //has fill
+        pfc = fc.distance(red) >  0.4 ? &red : &green; // red, or green when the real fill is within 0.4 of red
+    else
+        pfc = &red;
+
+    if (render == 1 || render == 2) // has stroke
+        psc = sc.distance(blue) >  0.4 ?  &blue : &yellow; // blue, or yellow when the real stroke is within 0.4 of blue
+    else if(render == 0) // fill only
+        psc = &white;
+    else
+        psc = &blue;
+
+    GfxColor gfc, gsc;
+    pfc->get_gfx_color(gfc);
+    psc->get_gfx_color(gsc);
+    proof_state->setFillColor(&gfc);
+    proof_state->setStrokeColor(&gsc);
+
+    if (state->getFillColorSpace()->getMode() != csDeviceRGB) // proof_state was set up with DeviceRGB color spaces
+        dev->updateFillColorSpace(proof_state.get());
+    if (state->getStrokeColorSpace()->getMode() != csDeviceRGB)
+        dev->updateStrokeColorSpace(proof_state.get());
+
+    dev->updateLineWidth(proof_state.get()); // push the proof settings to the device
+    dev->updateFillColor(proof_state.get());
+    dev->updateStrokeColor(proof_state.get());
+
+    state->setRender(2); // fill & stroke
+}
+
+void BackgroundRenderer::proof_end_text_object(GfxState *state, OutputDev * dev)
+{
+    state->setRender(proof_state->getRender()); // put back the render mode that proof_begin_string() overrode
+    dev->updateLineWidth(state); // re-apply the real state's settings to the device
+    dev->updateFillColorSpace(state);
+    dev->updateStrokeColorSpace(state);
+    dev->updateFillColor(state);
+    dev->updateStrokeColor(state);
+}
+
+void BackgroundRenderer::proof_update_render(GfxState *state, OutputDev * dev) // dev is not used here
+{
+    // Save the render mode in proof_state in case it is changed inside a text object
+    proof_state->setRender(state->getRender());
+}
+
+} // namespace pdf2htmlEX
diff --git a/src/BackgroundRenderer/BackgroundRenderer.h b/src/BackgroundRenderer/BackgroundRenderer.h
new file mode 100644
index 0000000..2927484
--- /dev/null
+++ b/src/BackgroundRenderer/BackgroundRenderer.h
@@ -0,0 +1,52 @@
+/*
+ * Background renderer
+ * Render all those things not supported as Image
+ *
+ * Copyright (C) 2012,2013 Lu Wang <coolwanglu@gmail.com>
+ */
+
+
+#ifndef BACKGROUND_RENDERER_H__
+#define BACKGROUND_RENDERER_H__
+
+#include <string>
+#include <memory>
+
+class PDFDoc;
+class GfxState;
+class OutputDev;
+
+namespace pdf2htmlEX {
+
+class Param;
+class HTMLRenderer;
+class BackgroundRenderer 
+{
+public:
+    // Returns a renderer for the given format name; returns nullptr upon failure (format unknown or not compiled in)
+    static std::unique_ptr<BackgroundRenderer> getBackgroundRenderer(const std::string & format, HTMLRenderer * html_renderer, const Param & param);
+    // Return a fallback bg renderer according to param.bg_format, or nullptr when none applies.
+    // Currently only svg bg format might need a bitmap fallback.
+    static std::unique_ptr<BackgroundRenderer> getFallbackBackgroundRenderer(HTMLRenderer * html_renderer, const Param & param);
+
+    BackgroundRenderer() {}
+    virtual ~BackgroundRenderer() {}
+
+    virtual void init(PDFDoc * doc) = 0;
+    //return true on success, false otherwise (e.g. need a fallback)
+    virtual bool render_page(PDFDoc * doc, int pageno) = 0;
+    virtual void embed_image(int pageno) = 0;
+
+    // for proof output: helpers that derived renderers call from their text-related OutputDev hooks
+protected:
+    void proof_begin_text_object(GfxState * state, OutputDev * dev);
+    void proof_begin_string(GfxState * state, OutputDev * dev);
+    void proof_end_text_object(GfxState * state, OutputDev * dev);
+    void proof_update_render(GfxState * state, OutputDev * dev);
+private:
+    std::unique_ptr<GfxState> proof_state; // created on first proof_begin_text_object(); holds proof colors and the saved render mode
+};
+
+} // namespace pdf2htmlEX
+
+#endif //BACKGROUND_RENDERER_H__
diff --git a/src/BackgroundRenderer/CairoBackgroundRenderer.cc b/src/BackgroundRenderer/CairoBackgroundRenderer.cc
new file mode 100644
index 0000000..1ce6eac
--- /dev/null
+++ b/src/BackgroundRenderer/CairoBackgroundRenderer.cc
@@ -0,0 +1,311 @@
+/*
+ * CairoBackgroundRenderer.cc
+ *
+ * Copyright (C) 2012,2013 Lu Wang <coolwanglu@gmail.com>
+ */
+
+#include <string>
+#include <fstream>
+
+
+#include "pdf2htmlEX-config.h"
+
+#include "Base64Stream.h"
+
+#if ENABLE_SVG
+
+#include "CairoBackgroundRenderer.h"
+#include "SplashBackgroundRenderer.h"
+
+namespace pdf2htmlEX {
+
+using std::string;
+using std::ifstream;
+using std::ofstream;
+using std::vector;
+using std::unordered_map;
+
+CairoBackgroundRenderer::CairoBackgroundRenderer(HTMLRenderer * html_renderer, const Param & param) // surface is created per page in render_page()
+    : CairoOutputDev()
+    , html_renderer(html_renderer)
+    , param(param)
+    , surface(nullptr), drawn_char_count(0) // drawn_char_count now has a defined value before the first render_page()
+{ }
+
+CairoBackgroundRenderer::~CairoBackgroundRenderer()
+{
+    // Bitmap files whose reference count stayed at 0 are registered with html_renderer->tmp_files.
+    for(auto it = bitmaps_ref_count.begin(); it != bitmaps_ref_count.end(); ++it)
+    {
+        if (it->second != 0)
+            continue;
+        html_renderer->tmp_files.add(this->build_bitmap_path(it->first));
+    }
+}
+
+void CairoBackgroundRenderer::drawChar(GfxState *state, double x, double y,
+        double dx, double dy,
+        double originX, double originY,
+        CharCode code, int nBytes, Unicode *u, int uLen)
+{
+    // draw characters as image when
+    // - in fallback or proof mode
+    // - OR there is special filling method
+    // - OR using a writing mode font
+    // - OR using a Type 3 font while param.process_type3 is not enabled
+    // - OR the text is used as path
+    if((param.fallback || param.proof)
+        || ( (state->getFont())
+            && ( (state->getFont()->getWMode())
+                 || ((state->getFont()->getType() == fontType3) && (!param.process_type3))
+                 || (state->getRender() >= 4)
+               )
+          )
+      )
+    {
+        CairoOutputDev::drawChar(state,x,y,dx,dy,originX,originY,code,nBytes,u,uLen);
+    }
+    // If a char is treated as image, it is not subject to cover test
+    // (see HTMLRenderer::drawString), so don't increase drawn_char_count.
+    else if (param.correct_text_visibility) {
+        if (html_renderer->is_char_covered(drawn_char_count)) // covered chars are drawn into the background
+            CairoOutputDev::drawChar(state,x,y,dx,dy,originX,originY,code,nBytes,u,uLen);
+        drawn_char_count++; // counts every char that went through the cover test, drawn or not
+    }
+}
+
+void CairoBackgroundRenderer::beginTextObject(GfxState *state) // proof hook first (only when param.proof == 2), then the base class
+{
+    if (param.proof == 2)
+        proof_begin_text_object(state, this);
+    CairoOutputDev::beginTextObject(state);
+}
+
+void CairoBackgroundRenderer::beginString(GfxState *state, GooString * str) // proof hook first (only when param.proof == 2), then the base class
+{
+    if (param.proof == 2)
+        proof_begin_string(state, this);
+    CairoOutputDev::beginString(state, str);
+}
+
+void CairoBackgroundRenderer::endTextObject(GfxState *state) // proof hook first (only when param.proof == 2), then the base class
+{
+    if (param.proof == 2)
+        proof_end_text_object(state, this);
+    CairoOutputDev::endTextObject(state);
+}
+
+void CairoBackgroundRenderer::updateRender(GfxState *state) // proof hook first (only when param.proof == 2), then the base class
+{
+    if (param.proof == 2)
+        proof_update_render(state, this);
+    CairoOutputDev::updateRender(state);
+}
+
+void CairoBackgroundRenderer::init(PDFDoc * doc) // BackgroundRenderer::init: forwards the document to startDoc()
+{
+    startDoc(doc);
+}
+
+static GBool annot_cb(Annot *, void * pflag) { // pflag points at a bool; the result is gTrue exactly when it is set
+    return *static_cast<bool*>(pflag) ? gTrue : gFalse;
+}
+
+bool CairoBackgroundRenderer::render_page(PDFDoc * doc, int pageno) // returns false when the SVG exceeds param.svg_node_count_limit; throws std::string on a cairo error
+{
+    drawn_char_count = 0; // the cover-test index used by drawChar() restarts on every page
+    double page_width;
+    double page_height;
+    if(param.use_cropbox)
+    {
+        page_width = doc->getPageCropWidth(pageno);
+        page_height = doc->getPageCropHeight(pageno);
+    }
+    else
+    {
+        page_width = doc->getPageMediaWidth(pageno);
+        page_height = doc->getPageMediaHeight(pageno);
+    }
+
+    if (doc->getPageRotate(pageno) == 90 || doc->getPageRotate(pageno) == 270) // quarter-turn pages: width and height trade places
+        std::swap(page_height, page_width);
+
+    string fn = (char*)html_renderer->str_fmt("%s/bg%x.svg", (param.embed_image ? param.tmp_dir : param.dest_dir).c_str(), pageno); // note: page number in hex
+    if(param.embed_image) // the SVG is read back by embed_image(), so it lives in tmp_dir and is registered with tmp_files
+        html_renderer->tmp_files.add(fn);
+
+    surface = cairo_svg_surface_create(fn.c_str(), page_width * param.h_dpi / DEFAULT_DPI, page_height * param.v_dpi / DEFAULT_DPI);
+    cairo_svg_surface_restrict_to_version(surface, CAIRO_SVG_VERSION_1_2);
+    cairo_surface_set_fallback_resolution(surface, param.h_dpi, param.v_dpi);
+
+    cairo_t * cr = cairo_create(surface);
+    setCairo(cr);
+
+    bitmaps_in_current_page.clear(); // refilled by setMimeData() while the page is displayed
+
+    bool process_annotation = param.process_annotation; // local copy whose address is passed to annot_cb below
+    doc->displayPage(this, pageno, param.h_dpi, param.v_dpi,
+            0, 
+            (!(param.use_cropbox)),
+            false, 
+            false,
+            nullptr, nullptr, &annot_cb, &process_annotation);
+
+    setCairo(nullptr);
+    
+    {
+        auto status = cairo_status(cr); // read the status before the context is destroyed
+        cairo_destroy(cr);
+        if(status)
+            throw string("Cairo error: ") + cairo_status_to_string(status);
+    }
+
+    cairo_surface_finish(surface);
+    {
+        auto status = cairo_surface_status(surface); // likewise for the surface
+        cairo_surface_destroy(surface);
+        surface = nullptr;
+        if(status)
+            throw string("Error in cairo: ") + cairo_status_to_string(status);
+    }
+
+    //check node count in the svg file, fall back to bitmap_renderer if necessary.
+    if (param.svg_node_count_limit >= 0)
+    {
+        int n = 0;
+        char c;
+        ifstream svgfile(fn);
+        //count of '<' in the file should be an approximation of node count.
+        while(svgfile >> c)
+        {
+            if (c == '<')
+                ++n;
+            if (n > param.svg_node_count_limit)
+            {
+                html_renderer->tmp_files.add(fn); // the rejected SVG is registered with tmp_files
+                return false;
+            }
+        }
+    }
+
+    // the svg file is actually used, so add its bitmaps' ref count.
+    for (auto id : bitmaps_in_current_page)
+        ++bitmaps_ref_count[id];
+
+    return true;
+}
+
+void CairoBackgroundRenderer::embed_image(int pageno) // writes the tag referencing page pageno's SVG into html_renderer->f_curpage
+{
+    auto & f_page = *(html_renderer->f_curpage);
+    
+    // SVGs introduced by <img> or background-image can't have external resources;
+    // SVGs introduced by <embed> and <object> can, but they are more expensive for browsers.
+    // So we use <img> if the SVG contains no external bitmaps, and use <embed> otherwise.
+    // See also:
+    //   https://developer.mozilla.org/en-US/docs/Web/SVG/SVG_as_an_Image
+    //   http://stackoverflow.com/questions/4476526/do-i-use-img-object-or-embed-for-svg-files
+
+    if (param.svg_embed_bitmap || bitmaps_in_current_page.empty()) // bitmaps_in_current_page is the list built by the most recent render_page()
+        f_page << "<img";
+    else
+        f_page << "<embed";
+
+    f_page << " class=\"" << CSS::FULL_BACKGROUND_IMAGE_CN
+        << "\" alt=\"\" src=\"";
+
+    if(param.embed_image)
+    {
+        auto path = html_renderer->str_fmt("%s/bg%x.svg", param.tmp_dir.c_str(), pageno); // same name render_page() wrote to
+        ifstream fin((char*)path, ifstream::binary);
+        if(!fin)
+            throw string("Cannot read background image ") + (char*)path;
+        f_page << "data:image/svg+xml;base64," << Base64Stream(fin); // inline the file as a base64 data URI
+    }
+    else
+    {
+        f_page << (char*)html_renderer->str_fmt("bg%x.svg", pageno); // bare file name, no directory part
+    }
+    f_page << "\"/>";
+}
+
+string CairoBackgroundRenderer::build_bitmap_path(int id) // path of the dumped bitmap with the given id, inside param.dest_dir
+{
+    // "o" for "PDF Object"
+    return string(html_renderer->str_fmt("%s/o%d.jpg", param.dest_dir.c_str(), id));
+}
+// Override CairoOutputDev::setMimeData() and dump the bitmaps used by the SVG to external files.
+void CairoBackgroundRenderer::setMimeData(Stream *str, Object *ref, cairo_surface_t *image)
+{
+    if (param.svg_embed_bitmap) // bitmaps stay inside the SVG: defer to the base class
+    {
+        CairoOutputDev::setMimeData(str, ref, image);
+        return;
+    }
+
+    // TODO dump bitmaps in other formats.
+    if (str->getKind() != strDCT)
+        return;
+
+    // TODO inline image?
+    if (ref == nullptr || !ref->isRef())
+        return;
+
+    // We only dump rgb or gray jpeg without /Decode array.
+    //
+    // Although jpeg support CMYK, PDF readers do color conversion incompatibly with most other
+    // programs (including browsers): other programs invert CMYK color if 'Adobe' marker (app14) presents
+    // in a jpeg file; while PDF readers don't, they solely rely on /Decode array to invert color.
+    // It's a bit complicated to decide whether a CMYK jpeg is safe to dump, so we don't dump at all.
+    // See also:
+    //   JPEG file embedded in PDF (CMYK) https://forums.adobe.com/thread/975777
+    //   http://stackoverflow.com/questions/3123574/how-to-convert-from-cmyk-to-rgb-in-java-correctly
+    //
+    // In PDF, jpeg stream objects can also specify other color spaces like DeviceN and Separation,
+    // It is also not safe to dump them directly.
+    Object obj;
+    str->getDict()->lookup("ColorSpace", &obj);
+    if (!obj.isName() || (strcmp(obj.getName(), "DeviceRGB") && strcmp(obj.getName(), "DeviceGray")) ) // neither DeviceRGB nor DeviceGray
+    {
+        obj.free();
+        return;
+    }
+    obj.free();
+    str->getDict()->lookup("Decode", &obj);
+    if (obj.isArray()) // a /Decode array is present
+    {
+        obj.free();
+        return;
+    }
+    obj.free();
+
+    int imgId = ref->getRef().num; // the object number is used as the bitmap id
+    auto uri = strdup((char*) html_renderer->str_fmt("o%d.jpg", imgId)); // heap copy, passed to cairo below with free as its destroy callback
+    auto st = cairo_surface_set_mime_data(image, CAIRO_MIME_TYPE_URI,
+        (unsigned char*) uri, strlen(uri), free, uri);
+    if (st) // non-zero status: the copy is released here
+    {
+        free(uri);
+        return;
+    }
+    bitmaps_in_current_page.push_back(imgId);
+
+    if(bitmaps_ref_count.find(imgId) != bitmaps_ref_count.end()) // this id already has an entry, so it was handled before
+        return;
+
+    bitmaps_ref_count[imgId] = 0; // render_page() raises the count once the page's SVG is kept
+
+    char *strBuffer;
+    int len;
+    if (getStreamData(str->getNextStream(), &strBuffer, &len))
+    {
+        ofstream imgfile(build_bitmap_path(imgId), ofstream::binary);
+        imgfile.write(strBuffer, len);
+        free(strBuffer);
+    }
+}
+
+} // namespace pdf2htmlEX
+
+#endif // ENABLE_SVG
+
diff --git a/src/BackgroundRenderer/CairoBackgroundRenderer.h b/src/BackgroundRenderer/CairoBackgroundRenderer.h
new file mode 100644
index 0000000..4ed9c86
--- /dev/null
+++ b/src/BackgroundRenderer/CairoBackgroundRenderer.h
@@ -0,0 +1,75 @@
+/*
+ * Cairo Background renderer
+ * Render all those things not supported as Image, with Cairo
+ *
+ * Copyright (C) 2012,2013 Lu Wang <coolwanglu@gmail.com>
+ */
+
+
+#ifndef CAIRO_BACKGROUND_RENDERER_H__
+#define CAIRO_BACKGROUND_RENDERER_H__
+
+#include <CairoOutputDev.h>
+#include <cairo.h>
+#include <cairo-svg.h>
+#include <unordered_map>
+#include <vector>
+#include <string>
+
+#include "pdf2htmlEX-config.h"
+
+#include "Param.h"
+#include "HTMLRenderer/HTMLRenderer.h"
+
+namespace pdf2htmlEX {
+
+// A BackgroundRenderer that draws through poppler's CairoOutputDev (a private base: no access specifier is given for it)
+class CairoBackgroundRenderer : public BackgroundRenderer, CairoOutputDev 
+{
+public:
+  CairoBackgroundRenderer(HTMLRenderer * html_renderer, const Param & param);
+
+  virtual ~CairoBackgroundRenderer();
+
+  virtual void init(PDFDoc * doc);
+  virtual bool render_page(PDFDoc * doc, int pageno);
+  virtual void embed_image(int pageno);
+
+  // Does this device use beginType3Char/endType3Char?  Otherwise,
+  // text in Type 3 fonts will be drawn with drawChar/drawString.
+  virtual GBool interpretType3Chars() { return !param.process_type3; }
+
+  virtual void drawChar(GfxState *state, double x, double y,
+      double dx, double dy,
+      double originX, double originY,
+      CharCode code, int nBytes, Unicode *u, int uLen);
+
+  //for proof: each of these calls the matching BackgroundRenderer::proof_* helper when param.proof == 2
+  void beginTextObject(GfxState *state);
+  void beginString(GfxState *state, GooString * str);
+  void endTextObject(GfxState *state);
+  void updateRender(GfxState *state);
+
+protected:
+  virtual void setMimeData(Stream *str, Object *ref, cairo_surface_t *image);
+
+protected:
+  HTMLRenderer * html_renderer;
+  const Param & param;
+  cairo_surface_t * surface; // non-null only while render_page() is producing a page
+
+private:
+  // convert bitmap stream id to bitmap file name. No pageno prefix,
+  // because a bitmap may be shared by multiple pages.
+  std::string build_bitmap_path(int id);
+  // map<id_of_bitmap_stream, usage_count_in_all_svgs>
+  // note: if a svg bg fallbacks to bitmap bg, its bitmaps are not taken into account.
+  std::unordered_map<int, int> bitmaps_ref_count;
+  // id of bitmaps' stream used by current page
+  std::vector<int> bitmaps_in_current_page;
+  int drawn_char_count; // chars put through the cover test on the current page; reset by render_page()
+};
+
+}
+
+#endif //CAIRO_BACKGROUND_RENDERER_H__
diff --git a/src/BackgroundRenderer/SplashBackgroundRenderer.cc b/src/BackgroundRenderer/SplashBackgroundRenderer.cc
new file mode 100644
index 0000000..55b5322
--- /dev/null
+++ b/src/BackgroundRenderer/SplashBackgroundRenderer.cc
@@ -0,0 +1,261 @@
+/*
+ * SplashBackgroundRenderer.cc
+ *
+ * Copyright (C) 2012,2013 Lu Wang <coolwanglu@gmail.com>
+ */
+
+#include <fstream>
+#include <vector>
+#include <memory>
+
+#include <poppler-config.h>
+#include <PDFDoc.h>
+#include <goo/ImgWriter.h>
+#include <goo/PNGWriter.h>
+#include <goo/JpegWriter.h>
+
+#include "Base64Stream.h"
+#include "util/const.h"
+
+#include "SplashBackgroundRenderer.h"
+
+namespace pdf2htmlEX {
+
+using std::string;
+using std::ifstream;
+using std::vector;
+using std::unique_ptr;
+
+const SplashColor SplashBackgroundRenderer::white = {255,255,255};
+
+SplashBackgroundRenderer::SplashBackgroundRenderer(const string & imgFormat, HTMLRenderer * html_renderer, const Param & param)
+    : SplashOutputDev(splashModeRGB8, 4, gFalse, (SplashColorPtr)(&white)) // splashModeRGB8 output; the class-level `white` is the color argument
+    , html_renderer(html_renderer)
+    , param(param)
+    , format(imgFormat)
+{
+    bool supported = false; // stays false when `format` matches no compiled-in writer
+#ifdef ENABLE_LIBPNG
+    if (format.empty()) // "" asks for the default; PNG wins whenever PNG support is compiled in
+        format = "png";
+    supported = supported || format == "png";
+#endif
+#ifdef ENABLE_LIBJPEG
+    if (format.empty()) // still empty here only when the PNG section above was compiled out
+        format = "jpg";
+    supported = supported || format == "jpg";
+#endif
+    if (!supported)
+    {
+        throw string("Image format not supported: ") + format; // thrown as a std::string, not a std::exception
+    }
+}
+
+/*
+ * SplashOutputDev::startPage paints the whole page with the background color,
+ * which sets the modified region to the whole page area.
+ * We do not want that, so the modified region is cleared right after the base call.
+ */
+void SplashBackgroundRenderer::startPage(int pageNum, GfxState *state, XRef *xrefA)
+{
+    SplashOutputDev::startPage(pageNum, state, xrefA);
+    clearModRegion();
+}
+
+void SplashBackgroundRenderer::drawChar(GfxState *state, double x, double y,
+  double dx, double dy,
+  double originX, double originY,
+  CharCode code, int nBytes, Unicode *u, int uLen)
+{
+    // Draw the character into the background image when
+    // - param.fallback or param.proof is set
+    // - OR the font has a non-zero writing mode (getWMode())
+    // - OR the font is Type 3 while param.process_type3 is not enabled
+    // - OR the render mode is >= 4 (the text is used as path)
+    // (the last three tests are only evaluated when the state has a font)
+    if((param.fallback || param.proof)
+       || ( (state->getFont()) 
+            && ( (state->getFont()->getWMode())
+                 || ((state->getFont()->getType() == fontType3) && (!param.process_type3))
+                 || (state->getRender() >= 4)
+               )
+          )
+      )
+    {
+        SplashOutputDev::drawChar(state,x,y,dx,dy,originX,originY,code,nBytes,u,uLen);
+    }
+    // If a char is treated as image, it is not subject to the cover test
+    // (see HTMLRenderer::drawString), so drawn_char_count is not increased for it.
+    else if (param.correct_text_visibility) {
+        if (html_renderer->is_char_covered(drawn_char_count)) // covered chars are rasterized into the background too
+            SplashOutputDev::drawChar(state,x,y,dx,dy,originX,originY,code,nBytes,u,uLen);
+        drawn_char_count++; // advances for every char reaching this branch, covered or not
+    }
+}
+
+void SplashBackgroundRenderer::beginTextObject(GfxState *state)
+{
+    // When param.proof == 2 the proof hook runs first; the stock Splash handling always follows.
+    if (2 == param.proof) { proof_begin_text_object(state, this); }
+    SplashOutputDev::beginTextObject(state);
+}
+
+void SplashBackgroundRenderer::beginString(GfxState *state, GooString * str)
+{
+    // When param.proof == 2 the proof hook runs first; note that it is not handed `str`.
+    if (2 == param.proof) { proof_begin_string(state, this); }
+    SplashOutputDev::beginString(state, str);
+}
+
+void SplashBackgroundRenderer::endTextObject(GfxState *state)
+{
+    // When param.proof == 2 the proof hook runs first; the stock Splash handling always follows.
+    if (2 == param.proof) { proof_end_text_object(state, this); }
+    SplashOutputDev::endTextObject(state);
+}
+
+void SplashBackgroundRenderer::updateRender(GfxState *state)
+{
+    // When param.proof == 2 the proof hook runs first; the stock Splash handling always follows.
+    if (2 == param.proof) { proof_update_render(state, this); }
+    SplashOutputDev::updateRender(state);
+}
+
+void SplashBackgroundRenderer::init(PDFDoc * doc)
+{
+    startDoc(doc); // not declared in this class; presumably the inherited SplashOutputDev::startDoc - confirm
+}
+
+static GBool annot_cb(Annot *, void * pflag) {
+    return *static_cast<bool*>(pflag) ? gTrue : gFalse; // mirrors the bool that pflag points to; the Annot is ignored
+}
+
+bool SplashBackgroundRenderer::render_page(PDFDoc * doc, int pageno)
+{
+    drawn_char_count = 0; // restart the per-page char index consumed by drawChar()
+    bool process_annotation = param.process_annotation; // local copy; annot_cb reads it through the void* passed below
+    doc->displayPage(this, pageno, param.h_dpi, param.v_dpi,
+            0, 
+            (!(param.use_cropbox)),
+            false, false,
+            nullptr, nullptr, &annot_cb, &process_annotation);
+    return true; // unconditionally reports success
+}
+
+void SplashBackgroundRenderer::embed_image(int pageno)
+{
+    // modified region of the bitmap, in pixels; y grows downwards (see the bottom offset computed below)
+    int xmin, xmax, ymin, ymax;
+    getModRegion(&xmin, &ymin, &xmax, &ymax);
+
+    // dump the background image only when it is not empty
+    if((xmin <= xmax) && (ymin <= ymax))
+    {
+        {
+            auto fn = html_renderer->str_fmt("%s/bg%x.%s", (param.embed_image ? param.tmp_dir : param.dest_dir).c_str(), pageno, format.c_str()); // page number is formatted in hex
+            if(param.embed_image)
+                html_renderer->tmp_files.add((char*)fn); // an embedded image only passes through a file in tmp_dir
+
+            dump_image((char*)fn, xmin, ymin, xmax, ymax);
+        }
+
+        double h_scale = html_renderer->text_zoom_factor() * DEFAULT_DPI / param.h_dpi; // scale applied to pixel x values before they are installed below
+        double v_scale = html_renderer->text_zoom_factor() * DEFAULT_DPI / param.v_dpi; // same for pixel y values
+
+        auto & f_page = *(html_renderer->f_curpage);
+        auto & all_manager = html_renderer->all_manager;
+        
+        f_page << "<img class=\"" << CSS::BACKGROUND_IMAGE_CN 
+            << " " << CSS::LEFT_CN      << all_manager.left.install(((double)xmin) * h_scale)
+            << " " << CSS::BOTTOM_CN    << all_manager.bottom.install(((double)getBitmapHeight() - 1 - ymax) * v_scale)
+            << " " << CSS::WIDTH_CN     << all_manager.width.install(((double)(xmax - xmin + 1)) * h_scale)
+            << " " << CSS::HEIGHT_CN    << all_manager.height.install(((double)(ymax - ymin + 1)) * v_scale)
+            << "\" alt=\"\" src=\"";
+
+        if(param.embed_image)
+        {
+            auto path = html_renderer->str_fmt("%s/bg%x.%s", param.tmp_dir.c_str(), pageno, format.c_str()); // same name that was built for dump_image above
+            ifstream fin((char*)path, ifstream::binary);
+            if(!fin)
+                throw string("Cannot read background image ") + (char*)path;
+
+            auto iter = FORMAT_MIME_TYPE_MAP.find(format);
+            if(iter == FORMAT_MIME_TYPE_MAP.end())
+                throw string("Image format not supported: ") + format;
+
+            string mime_type = iter->second;
+            f_page << "data:" << mime_type << ";base64," << Base64Stream(fin); // inline the file as a data: URI
+        }
+        else
+        {
+            f_page << (char*)html_renderer->str_fmt("bg%x.%s", pageno, format.c_str()); // bare file name of the image written into dest_dir
+        }
+        f_page << "\"/>";
+    }
+}
+
+// Write the pixel rectangle [x1,x2] x [y1,y2] of the Splash bitmap to `filename` using the writer for `format`.
+void SplashBackgroundRenderer::dump_image(const char * filename, int x1, int y1, int x2, int y2)
+{
+    int width = x2 - x1 + 1;
+    int height = y2 - y1 + 1;
+    if((width <= 0) || (height <= 0))
+        throw "Bad metric for background image";
+
+    unique_ptr<FILE, int (*)(FILE *)> file(fopen(filename, "wb"), &fclose); // RAII guard: closed on every exit path, including the throws below
+    if(!file)
+        throw string("Cannot open file for background image " ) + filename;
+
+    // Declared after `file`, so the writer is destroyed before the file is closed.
+    unique_ptr<ImgWriter> writer;
+
+    if(false) { }
+#ifdef ENABLE_LIBPNG
+    else if(format == "png")
+    {
+        writer = unique_ptr<ImgWriter>(new PNGWriter);
+    }
+#endif
+#ifdef ENABLE_LIBJPEG
+    else if(format == "jpg")
+    {
+        writer = unique_ptr<ImgWriter>(new JpegWriter);
+    }
+#endif
+    else
+    {
+        throw string("Image format not supported: ") + format;
+    }
+
+    if(!writer->init(file.get(), width, height, param.h_dpi, param.v_dpi))
+        throw "Cannot initialize image writer";
+        
+    auto * bitmap = getBitmap();
+    assert(bitmap->getMode() == splashModeRGB8);
+
+    SplashColorPtr data = bitmap->getDataPtr();
+    int row_size = bitmap->getRowSize();
+
+    vector<unsigned char*> pointers; // one pointer per output row, each starting at column x1
+    pointers.reserve(height);
+    SplashColorPtr p = data + y1 * row_size + x1 * 3; // 3 bytes per pixel, as asserted by the RGB8 mode check above
+    for(int i = 0; i < height; ++i)
+    {
+        pointers.push_back(p);
+        p += row_size;
+    }
+    
+    if(!writer->writePointers(pointers.data(), height)) 
+    {
+        throw "Cannot write background image";
+    }
+
+    if(!writer->close())
+    {
+        throw "Cannot finish background image";
+    }
+
+    // No explicit fclose(): the guard declared above closes the file when the function returns.
+}
+
+} // namespace pdf2htmlEX
diff --git a/src/BackgroundRenderer/SplashBackgroundRenderer.h b/src/BackgroundRenderer/SplashBackgroundRenderer.h
new file mode 100644
index 0000000..067de28
--- /dev/null
+++ b/src/BackgroundRenderer/SplashBackgroundRenderer.h
@@ -0,0 +1,65 @@
+/*
+ * Splash Background renderer
+ * Render all those things not supported as Image, with Splash
+ *
+ * Copyright (C) 2012,2013 Lu Wang <coolwanglu@gmail.com>
+ */
+
+
+#ifndef SPLASH_BACKGROUND_RENDERER_H__
+#define SPLASH_BACKGROUND_RENDERER_H__
+
+#include <string>
+
+#include <splash/SplashBitmap.h>
+#include <SplashOutputDev.h>
+
+#include "pdf2htmlEX-config.h"
+
+#include "Param.h"
+#include "HTMLRenderer/HTMLRenderer.h"
+
+namespace pdf2htmlEX {
+
+// Based on BackgroundRenderer from poppler; note that SplashOutputDev is inherited privately (no access specifier).
+class SplashBackgroundRenderer : public BackgroundRenderer, SplashOutputDev 
+{
+public:
+  static const SplashColor white; // defined as {255,255,255} in the .cc and handed to the SplashOutputDev constructor
+  // format: "png" or "jpg", or "" for a default format; the constructor throws a std::string when it is unsupported
+  SplashBackgroundRenderer(const std::string & format, HTMLRenderer * html_renderer, const Param & param);
+
+  virtual ~SplashBackgroundRenderer() { }
+
+  virtual void init(PDFDoc * doc);
+  virtual bool render_page(PDFDoc * doc, int pageno);
+  virtual void embed_image(int pageno);
+
+  // Does this device use beginType3Char/endType3Char?  Otherwise,
+  // text in Type 3 fonts will be drawn with drawChar/drawString.
+  virtual GBool interpretType3Chars() { return !param.process_type3; }
+
+  virtual void startPage(int pageNum, GfxState *state, XRef *xrefA);
+  
+  virtual void drawChar(GfxState *state, double x, double y,
+      double dx, double dy,
+      double originX, double originY,
+      CharCode code, int nBytes, Unicode *u, int uLen);
+
+  // for proof: each hook calls its proof_* counterpart when param.proof == 2, then the SplashOutputDev version
+  void beginTextObject(GfxState *state);
+  void beginString(GfxState *state, GooString * str);
+  void endTextObject(GfxState *state);
+  void updateRender(GfxState *state);
+
+protected:
+  void dump_image(const char * filename, int x1, int y1, int x2, int y2); // writes the inclusive pixel rectangle to a file
+  HTMLRenderer * html_renderer;
+  const Param & param; // reference member: the Param object must outlive this renderer
+  std::string format; // "png" or "jpg" once the constructor has resolved an empty request
+  int drawn_char_count; // not set by the constructor; render_page() zeroes it before each page
+};
+
+} // namespace pdf2htmlEX
+
+#endif // SPLASH_BACKGROUND_RENDERER_H__
diff --git a/src/Base64Stream.cc b/src/Base64Stream.cc
new file mode 100644
index 0000000..5d02aae
--- /dev/null
+++ b/src/Base64Stream.cc
@@ -0,0 +1,42 @@
+#include "Base64Stream.h"
+
+namespace pdf2htmlEX {
+
+using std::ostream;
+
+ostream & Base64Stream::dumpto(ostream & out)
+{
+    // Encode full 3-byte groups first; a short final read leaves 0..2 bytes in `chunk`.
+    unsigned char chunk[3];
+    while(in->read((char*)chunk, 3))
+    {
+        out << base64_encoding[chunk[0] >> 2]
+            << base64_encoding[((chunk[0] & 0x03) << 4) | (chunk[1] >> 4)]
+            << base64_encoding[((chunk[1] & 0x0f) << 2) | (chunk[2] >> 6)]
+            << base64_encoding[chunk[2] & 0x3f];
+    }
+
+    const auto leftover = in->gcount();
+    if(leftover <= 0)
+        return out;
+
+    // Zero the missing bytes so the shifts below see well-defined values.
+    for(int i = leftover; i < 3; ++i)
+        chunk[i] = 0;
+
+    out << base64_encoding[chunk[0] >> 2]
+        << base64_encoding[((chunk[0] & 0x03) << 4) | (chunk[1] >> 4)];
+
+    // Two leftover bytes yield a third symbol; a single leftover byte is padded twice.
+    if(leftover > 1)
+        out << base64_encoding[(chunk[1] & 0x0f) << 2];
+    else
+        out << '=';
+    out << '=';
+
+    return out;
+}
+
+const char * Base64Stream::base64_encoding = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
+
+} //namespace pdf2htmlEX
diff --git a/src/Base64Stream.h b/src/Base64Stream.h
new file mode 100644
index 0000000..759515f
--- /dev/null
+++ b/src/Base64Stream.h
@@ -0,0 +1,34 @@
+/*
+ * Base64 Encoding
+ *
+ * by WangLu
+ * 2012.11.29
+ */
+
+#ifndef BASE64STREAM_H__
+#define BASE64STREAM_H__
+
+#include <iostream>
+
+namespace pdf2htmlEX {
+
+class Base64Stream
+{
+public:
+    Base64Stream(std::istream & in) : in(&in) { } // keeps only the address: `in` must outlive this object
+
+    std::ostream & dumpto(std::ostream & out); // reads `in` to its end and writes the Base64 text to `out`
+
+private:
+    std::istream * in; // non-owning pointer to the source stream
+    static const char * base64_encoding; // 64-symbol table indexed by a 6-bit value
+};
+
+inline 
+std::ostream & operator << (std::ostream & out, Base64Stream bs)
+{
+    return bs.dumpto(out); // `bs` is taken by value, which lets a temporary Base64Stream(stream) be streamed directly
+}
+
+} //namespace pdf2htmlEX
+#endif //BASE64STREAM_H__
diff --git a/src/Color.cc b/src/Color.cc
new file mode 100644
index 0000000..6a344e5
--- /dev/null
+++ b/src/Color.cc
@@ -0,0 +1,51 @@
+#include <cmath>
+
+#include "Color.h"
+
+#include "util/misc.h"
+
+namespace pdf2htmlEX {
+
+using std::ostream;
+
+Color::Color()
+{
+    memset(this, 0, sizeof(Color)); // zero-fills the whole object, i.e. both `transparent` and `rgb`
+}
+
+Color::Color(double r, double g, double b, bool transparent)
+    :transparent(transparent)
+{
+    rgb.b = static_cast<GfxColorComp>(b * gfxColorComp1); // each channel is scaled by gfxColorComp1,
+    rgb.g = static_cast<GfxColorComp>(g * gfxColorComp1); // then converted to GfxColorComp
+    rgb.r = static_cast<GfxColorComp>(r * gfxColorComp1);
+}
+
+Color::Color(const GfxRGB& rgb)
+    :transparent(false), rgb(rgb) { } // an opaque color holding a copy of `rgb`
+
+ostream & operator << (ostream & out, const Color & color)
+{
+    if(!color.transparent)
+        out << color.rgb; // opaque: defer to the stream operator for GfxRGB
+    else
+        out << "transparent"; // the rgb value is ignored for transparent colors
+    return out;
+}
+
+void Color::get_gfx_color(GfxColor & gc) const
+{
+    gc.c[0] = rgb.r; // only the first three components of `gc` are written;
+    gc.c[1] = rgb.g; // any further components keep whatever the caller had in them,
+    gc.c[2] = rgb.b; // and `transparent` is not represented at all
+}
+
+double Color::distance(const Color & other) const
+{
+    const double dr = (double)rgb.r - other.rgb.r; // per-channel differences, taken in double
+    const double dg = (double)rgb.g - other.rgb.g;
+    const double db = (double)rgb.b - other.rgb.b;
+    return sqrt((dr * dr + dg * dg + db * db) / (3.0 * gfxColorComp1 * gfxColorComp1));
+}
+
+} // namespace pdf2htmlEX
diff --git a/src/Color.h b/src/Color.h
new file mode 100644
index 0000000..a2d2415
--- /dev/null
+++ b/src/Color.h
@@ -0,0 +1,38 @@
+/*
+ * Header file for Color 
+ * Copyright (C) 2013 Lu Wang <coolwanglu@gmail.com>
+ */
+
+#ifndef COLOR_H__
+#define COLOR_H__
+
+#include <ostream>
+
+#include <GfxState.h>
+
+namespace pdf2htmlEX {
+
+struct Color
+{
+    bool transparent; // when true, `rgb` is ignored by operator== and by the stream operator
+    GfxRGB rgb;
+    Color(); // zero-fills the object (see Color.cc)
+    Color(double r, double g, double b, bool transparent = false); // each channel is multiplied by gfxColorComp1
+    Color(const GfxRGB& rgb); // opaque color; non-explicit, so a GfxRGB converts implicitly
+    bool operator == (const Color & c) const {
+        if(transparent != c.transparent)
+            return false;
+        if(transparent) // two transparent colors compare equal whatever their rgb values
+            return true;
+        return ((rgb.r == c.rgb.r) && (rgb.g == c.rgb.g) && (rgb.b == c.rgb.b));
+    }
+    void get_gfx_color(GfxColor & gc) const; // copies r, g, b into gc.c[0..2]
+    // Color distance, [0,1]. Only the rgb values are compared; `transparent` is not consulted.
+    double distance(const Color & other) const;
+};
+
+std::ostream & operator << (std::ostream & out, const Color & color);
+
+} // namespace pdf2htmlEX
+
+#endif // COLOR_H__
diff --git a/src/CoveredTextDetector.cc b/src/CoveredTextDetector.cc
new file mode 100644
index 0000000..e109b3f
--- /dev/null
+++ b/src/CoveredTextDetector.cc
@@ -0,0 +1,51 @@
+/*
+ * CoveredTextDetector.cc
+ *
+ *  Created on: 2014-6-14
+ *      Author: duanyao
+ */
+
+#include "CoveredTextDetector.h"
+
+#include "util/math.h"
+
+namespace pdf2htmlEX {
+
+void CoveredTextDetector::reset()
+{
+    chars_covered.clear(); // both vectors are emptied together so that
+    char_bboxes.clear();   // flag i keeps describing the bbox stored at [4*i, 4*i+3]
+}
+
+void CoveredTextDetector::add_char_bbox(double * bbox)
+{
+    char_bboxes.insert(char_bboxes.end(), bbox, bbox + 4); // copies exactly four doubles from `bbox`
+    chars_covered.push_back(false); // a freshly drawn char starts out uncovered
+}
+
+void CoveredTextDetector::add_char_bbox_clipped(double * bbox, bool patially)
+{
+    char_bboxes.insert(char_bboxes.end(), bbox, bbox + 4); // copies exactly four doubles from `bbox`
+    chars_covered.push_back(true); // a clipped char is recorded as covered from the start
+    if (patially)
+        add_non_char_bbox(bbox, chars_covered.size() - 1); // its bbox is then tested against the chars recorded before it
+}
+
+void CoveredTextDetector::add_non_char_bbox(double * bbox, int index)
+{
+    // A negative index means "drawn after every char recorded so far".
+    const int limit = (index < 0) ? (int)chars_covered.size() : index;
+    for (int i = 0; i < limit; ++i)
+    {
+        double * char_box = &char_bboxes[i * 4];
+        if (!chars_covered[i] && bbox_intersect(char_box, bbox))
+        {
+            // A newly covered char is treated as a non-char itself, so its own bbox
+            // is tested against the chars that were drawn before it.
+            chars_covered[i] = true;
+            add_non_char_bbox(char_box, i);
+        }
+    }
+}
+
+}
diff --git a/src/CoveredTextDetector.h b/src/CoveredTextDetector.h
new file mode 100644
index 0000000..bee6c17
--- /dev/null
+++ b/src/CoveredTextDetector.h
@@ -0,0 +1,61 @@
+/*
+ * CoveredTextDetector.h
+ *
+ *  Created on: 2014-6-14
+ *      Author: duanyao
+ */
+
+#ifndef COVEREDTEXTDETECTOR_H__
+#define COVEREDTEXTDETECTOR_H__
+
+#include <vector>
+
+namespace pdf2htmlEX {
+
+/**
+ * Detect characters that are covered by non-char graphics on a page.
+ */
+class CoveredTextDetector
+{
+public:
+
+    /**
+     * Reset to the initial state. Should be called when starting to draw a page.
+     */
+    void reset();
+
+    /**
+     * Add a drawn character's bounding box; the char starts out as not covered.
+     * @param bbox (x0, y0, x1, y1)
+     */
+    void add_char_bbox(double * bbox);
+
+    void add_char_bbox_clipped(double * bbox, bool patially); // records the char as covered; if `patially`, its bbox is also fed to add_non_char_bbox()
+
+    /**
+     * Add a drawn non-char graphics' bounding box.
+     * If it intersects any previously drawn char's bbox, the char is marked as covered
+     * and treated as a non-char.
+     * @param bbox (x0, y0, x1, y1)
+     * @param index this graphics' drawing order: assume it is drawn after the (index-1)th
+     *   char. -1 means after the last char.
+     */
+    void add_non_char_bbox(double * bbox, int index = -1);
+
+    /**
+     * An array of flags indicating whether a char is covered by any non-char graphics.
+     * Indexed by the order in which the chars were added.
+     * This vector grows as add_char_bbox() or add_char_bbox_clipped() is called, so its size
+     * is the count of currently recorded chars.
+     */
+    const std::vector<bool> & get_chars_covered() { return chars_covered; }
+
+private:
+    std::vector<bool> chars_covered;
+    // x00, y00, x01, y01; x10, y10, x11, y11;...
+    std::vector<double> char_bboxes;
+};
+
+}
+
+#endif /* COVEREDTEXTDETECTOR_H__ */
diff --git a/src/DrawingTracer.cc b/src/DrawingTracer.cc
new file mode 100644
index 0000000..ffabad0
--- /dev/null
+++ b/src/DrawingTracer.cc
@@ -0,0 +1,400 @@
+/*
+ * DrawingTracer.cc
+ *
+ *  Created on: 2014-6-15
+ *      Author: duanyao
+ */
+
+#include "GfxFont.h"
+
+#include "util/math.h"
+#include "DrawingTracer.h"
+
+#if !ENABLE_SVG
+#warning "Cairo is disabled because ENABLE_SVG is off, --correct-text-visibility has limited functionality."
+#endif
+
+static constexpr bool DT_DEBUG = false;
+
+namespace pdf2htmlEX
+{
+
+DrawingTracer::DrawingTracer(const Param & param): param(param)
+#if ENABLE_SVG
+, cairo(nullptr) // no cairo context exists until reset() creates one
+#endif
+{
+}
+
+DrawingTracer::~DrawingTracer()
+{
+    finish(); // destroys the cairo context if one is still alive
+}
+
+void DrawingTracer::reset(GfxState *state)
+{
+    if (!param.correct_text_visibility)
+        return;
+    finish(); // drop the cairo context left over from the previous call, if any
+#if ENABLE_SVG
+    // pbox is defined in device space, which is affected by zooming;
+    // We want to trace in page space which is stable, so invert pbox by ctm.
+    double pbox[] { 0, 0, state->getPageWidth(), state->getPageHeight() };
+    Matrix ctm, ictm;
+    state->getCTM(&ctm);
+    ctm.invertTo(&ictm);
+    tm_transform_bbox(ictm.m, pbox);
+    cairo_rectangle_t page_box { pbox[0], pbox[1], pbox[2] - pbox[0], pbox[3] - pbox[1] };
+    cairo_surface_t * surface = cairo_recording_surface_create(CAIRO_CONTENT_COLOR_ALPHA, &page_box);
+    cairo = cairo_create(surface);
+    cairo_surface_destroy(surface); // cairo_create() took its own reference; releasing ours stops one surface leaking per reset
+    if (DT_DEBUG)
+        printf("DrawingTracer::reset:page bbox:[%f,%f,%f,%f]\n",pbox[0], pbox[1], pbox[2], pbox[3]);
+#endif
+}
+
+void DrawingTracer::finish()
+{
+#if ENABLE_SVG
+    if (!cairo)
+        return; // nothing to release
+
+    cairo_destroy(cairo);
+    cairo = nullptr; // makes a repeated call harmless
+#endif
+}
+
+// Poppler won't inform us of its initial CTM, and the initial CTM is affected by the zoom level.
+// OutputDev::clip() may be called before OutputDev::updateCTM(), so we can't rely on GfxState::getCTM(),
+// and should trace CTM changes ourselves (via cairo). The `state` parameter is not used here.
+void DrawingTracer::update_ctm(GfxState *state, double m11, double m12, double m21, double m22, double m31, double m32)
+{
+    if (!param.correct_text_visibility)
+        return;
+
+#if ENABLE_SVG
+    cairo_matrix_t matrix;
+    matrix.xx = m11;
+    matrix.yx = m12;
+    matrix.xy = m21;
+    matrix.yy = m22;
+    matrix.x0 = m31;
+    matrix.y0 = m32;
+    cairo_transform(cairo, &matrix); // applied on top of the matrix already held by the cairo context
+
+    if (DT_DEBUG)
+    {
+        cairo_matrix_t mat;
+        cairo_get_matrix(cairo, &mat);
+        printf("DrawingTracer::update_ctm:ctm:[%f,%f,%f,%f,%f,%f]\n", mat.xx, mat.yx, mat.xy, mat.yy, mat.x0, mat.y0);
+    }
+#endif
+}
+
+void DrawingTracer::clip(GfxState * state, bool even_odd)
+{
+    if (!param.correct_text_visibility)
+        return;
+#if ENABLE_SVG
+    do_path(state, state->getPath()); // replay the current path into the cairo context
+    cairo_set_fill_rule(cairo, even_odd? CAIRO_FILL_RULE_EVEN_ODD : CAIRO_FILL_RULE_WINDING);
+    cairo_clip (cairo); // the replayed path becomes part of the tracer's clip
+
+    if (DT_DEBUG)
+    {
+        double cbox[4];
+        cairo_clip_extents(cairo, cbox, cbox + 1, cbox + 2, cbox + 3);
+        printf("DrawingTracer::clip:extents:[%f,%f,%f,%f]\n", cbox[0],cbox[1],cbox[2],cbox[3]);
+    }
+#endif
+}
+
+void DrawingTracer::clip_to_stroke_path(GfxState * state)
+{
+    if (!param.correct_text_visibility)
+        return;
+    // TODO cairo_stroke_to_path() ? Not implemented: this function currently leaves the traced clip unchanged.
+}
+
+void DrawingTracer::save()
+{
+#if ENABLE_SVG
+    if (param.correct_text_visibility)
+    {
+        cairo_save(cairo); // cairo_save() pushes the context state; restore() pops it again
+        if (DT_DEBUG) { printf("DrawingTracer::save\n"); }
+    }
+#endif
+}
+void DrawingTracer::restore()
+{
+#if ENABLE_SVG
+    if (param.correct_text_visibility)
+    {
+        cairo_restore(cairo); // pops the context state pushed by the matching save()
+        if (DT_DEBUG) { printf("DrawingTracer::restore\n"); }
+    }
+#endif
+}
+
+void DrawingTracer::do_path(GfxState * state, GfxPath * path)
+{
+#if ENABLE_SVG
+    // Copied from CairoOutputDev::doPath: replays `path` into the cairo context. `state` is not used.
+    GfxSubpath *subpath;
+    int i, j;
+    double x, y;
+    cairo_new_path(cairo); // discard whatever path the context held before
+    if (DT_DEBUG)
+        printf("DrawingTracer::do_path:new_path\n");
+    for (i = 0; i < path->getNumSubpaths(); ++i) {
+        subpath = path->getSubpath(i);
+        if (subpath->getNumPoints() > 0) {
+            x = subpath->getX(0);
+            y = subpath->getY(0);
+            cairo_move_to(cairo, x, y);
+            if (DT_DEBUG)
+                printf("DrawingTracer::do_path:move_to[%f,%f]\n",x,y);
+            j = 1;
+            while (j < subpath->getNumPoints()) {
+                if (subpath->getCurve(j)) {
+                    x = subpath->getX(j+2); // a curve consumes three points: j, j+1 and the end point j+2
+                    y = subpath->getY(j+2);
+                    cairo_curve_to(cairo,
+                        subpath->getX(j), subpath->getY(j),
+                        subpath->getX(j+1), subpath->getY(j+1),
+                        x, y);
+                    if (DT_DEBUG)
+                        printf("DrawingTracer::do_path:curve_to[%f,%f]\n",x,y);
+                    j += 3;
+                } else {
+                    x = subpath->getX(j);
+                    y = subpath->getY(j);
+                    cairo_line_to(cairo, x, y);
+                    if (DT_DEBUG)
+                        printf("DrawingTracer::do_path:line_to[%f,%f]\n",x,y);
+                    ++j;
+                }
+            }
+            if (subpath->isClosed()) {
+                cairo_close_path (cairo);
+                if (DT_DEBUG)
+                    printf("DrawingTracer::do_path:close\n");
+            }
+        }
+    }
+#endif
+}
+
+void DrawingTracer::stroke(GfxState * state)
+{
+#if ENABLE_SVG
+    if (!param.correct_text_visibility)
+        return;
+
+    if (DT_DEBUG)
+        printf("DrawingTracer::stroke\n");
+
+    cairo_set_line_width(cairo, state->getLineWidth());
+
+    // GfxPath is broken into steps; each step makes up a cairo path and its bbox is used for the covering test.
+    // TODO
+    // 1. path steps that are not vertical or horizontal lines may still falsely "cover" many chars,
+    // can we slice those steps further?
+    // 2. if the line width is small, can we just ignore the path?
+    // 3. the line join feature can't be retained. We use line-cap-square to minimize the problem that
+    //   some chars actually covered by a line join are missed. However chars covered by an acute angle
+    //   with line-join-miter may still be recognized as not covered.
+    cairo_set_line_cap(cairo, CAIRO_LINE_CAP_SQUARE);
+    GfxPath * path = state->getPath();
+    for (int i = 0; i < path->getNumSubpaths(); ++i) {
+        GfxSubpath * subpath = path->getSubpath(i);
+        if (subpath->getNumPoints() <= 0)
+            continue;
+        double x = subpath->getX(0); // (x, y) is the start point of the next step
+        double y = subpath->getY(0);
+        //p: loop cursor; j: next point index
+        int p =1, j = 1;
+        int n = subpath->getNumPoints();
+        while (p <= n) {
+            cairo_new_path(cairo); // every step is a fresh one-segment path starting at (x, y)
+            cairo_move_to(cairo, x, y);
+            if (subpath->getCurve(j)) {
+                x = subpath->getX(j+2);
+                y = subpath->getY(j+2);
+                cairo_curve_to(cairo,
+                    subpath->getX(j), subpath->getY(j),
+                    subpath->getX(j+1), subpath->getY(j+1),
+                    x, y);
+                p += 3;
+            } else {
+                x = subpath->getX(j);
+                y = subpath->getY(j);
+                cairo_line_to(cairo, x, y);
+                ++p;
+            }
+
+            if (DT_DEBUG)
+                printf("DrawingTracer::stroke:new box:\n");
+            double sbox[4];
+            cairo_stroke_extents(cairo, sbox, sbox + 1, sbox + 2, sbox + 3);
+            if (sbox[0] != sbox[2] && sbox[1] != sbox[3]) // skip extents with zero width or zero height
+                draw_non_char_bbox(state, sbox);
+            else if (DT_DEBUG)
+                printf("DrawingTracer::stroke:zero box!\n");
+
+            if (p == n)
+            {
+                if (subpath->isClosed())
+                    j = 0; // if sub path is closed, go back to starting point
+                else
+                    break;
+            }
+            else
+                j = p;
+        }
+    }
+#endif
+}
+
+void DrawingTracer::fill(GfxState * state, bool even_odd)
+{
+    if (!param.correct_text_visibility)
+        return;
+
+#if ENABLE_SVG
+    do_path(state, state->getPath());
+    // cairo_fill_extents doesn't take the fill rule into account, so `even_odd` is currently unused:
+    //cairo_set_fill_rule (cairo, even_odd? CAIRO_FILL_RULE_EVEN_ODD : CAIRO_FILL_RULE_WINDING);
+    double fbox[4];
+    cairo_fill_extents(cairo, fbox, fbox + 1, fbox + 2, fbox + 3);
+    draw_non_char_bbox(state, fbox); // the whole fill is reported as one bounding box
+#endif
+}
+
+void DrawingTracer::draw_non_char_bbox(GfxState * state, double * bbox)
+{
+#if ENABLE_SVG
+    double cbox[4];
+    cairo_clip_extents(cairo, cbox, cbox + 1, cbox + 2, cbox + 3);
+    if(bbox_intersect(cbox, bbox, bbox)) // `bbox` is passed as the third argument too, so the caller's array may be overwritten
+#endif
+    { // without ENABLE_SVG the `if` above is compiled out and this block always runs
+        transform_bbox_by_ctm(bbox, state); // modifies the caller's array in place
+        if (DT_DEBUG)
+            printf("DrawingTracer::draw_non_char_bbox:[%f,%f,%f,%f]\n", bbox[0],bbox[1],bbox[2],bbox[3]);
+        if (on_non_char_drawn)
+            on_non_char_drawn(bbox);
+    }
+}
+
+void DrawingTracer::draw_char_bbox(GfxState * state, double * bbox)
+{
+#if ENABLE_SVG
+    // Note: even if the 4 corners of the char are all in or all out of the clip area,
+    // it could still be partially clipped.
+    // TODO better solution?
+    int pt_in = 0; // number of bbox corners that lie inside the current clip
+    if (cairo_in_clip(cairo, bbox[0], bbox[1]))
+        ++pt_in;
+    if (cairo_in_clip(cairo, bbox[2], bbox[3]))
+        ++pt_in;
+    if (cairo_in_clip(cairo, bbox[2], bbox[1]))
+        ++pt_in;
+    if (cairo_in_clip(cairo, bbox[0], bbox[3]))
+        ++pt_in;
+
+    if (pt_in == 0) // no corner inside: reported as clipped with patially == false
+    {
+        transform_bbox_by_ctm(bbox);
+        if(on_char_clipped)
+            on_char_clipped(bbox, false);
+    }
+    else
+    {
+        if (pt_in < 4) // some corners outside: intersect the bbox with the clip extents first
+        {
+            double cbox[4];
+            cairo_clip_extents(cairo, cbox, cbox + 1, cbox + 2, cbox + 3);
+            bbox_intersect(cbox, bbox, bbox);
+        }
+        transform_bbox_by_ctm(bbox);
+        if (pt_in < 4)
+        {
+            if(on_char_clipped)
+                on_char_clipped(bbox, true);
+        }
+        else // all four corners inside: reported as a normally drawn char
+        {
+            if (on_char_drawn)
+                on_char_drawn(bbox);
+        }
+    }
+#else
+    transform_bbox_by_ctm(bbox, state); // no clip tracking without cairo: every char is reported as drawn
+    if (on_char_drawn)
+        on_char_drawn(bbox);
+#endif
+    if (DT_DEBUG)
+        printf("DrawingTracer::draw_char_bbox:[%f,%f,%f,%f]\n",bbox[0],bbox[1],bbox[2],bbox[3]);
+}
+
+void DrawingTracer::draw_image(GfxState *state)
+{
+    if (!param.correct_text_visibility)
+        return;
+    double bbox[4] {0, 0, 1, 1}; // unit square; presumably the area an image occupies in user space - confirm
+    draw_non_char_bbox(state, bbox); // reported like any other non-char graphics
+}
+
+void DrawingTracer::draw_char(GfxState *state, double x, double y, double ax, double ay)
+{
+    if (!param.correct_text_visibility)
+        return;
+
+    Matrix tm, itm;
+    memcpy(tm.m, state->getTextMat(), sizeof(tm.m)); // local copy of the text matrix
+
+    double cx = state->getCurX(), cy = state->getCurY(), fs = state->getFontSize(),
+            ry = state->getRise(), h = state->getHorizScaling();
+
+    // cx and cy have been transformed by the text matrix, we need to reverse that.
+    tm.invertTo(&itm);
+    double char_cx, char_cy;
+    itm.transform(cx, cy, &char_cx, &char_cy);
+
+    //TODO Vertical? Currently vertical/type3 chars are treated as non-chars.
+    double char_m[6] {fs * h, 0, 0, fs, char_cx + x, char_cy + y + ry}; // scale by font size (and horizontal scaling), then move to the glyph position plus rise
+
+    double final_m[6];
+    tm_multiply(final_m, tm.m, char_m); // combines the text matrix with char_m; argument order as tm_multiply defines it
+
+    auto font = state->getFont();
+    double bbox[4] {0, 0, ax, ay}; // starts as the advance box
+    double desc = font->getDescent(), asc = font->getAscent();
+    if (font->getWMode() == 0)
+    {
+        bbox[1] += desc; // extend the box vertically by the font's descent and ascent
+        bbox[3] += asc;
+    }
+    else
+    {//TODO Vertical?
+    }
+    tm_transform_bbox(final_m, bbox);
+    draw_char_bbox(state, bbox);
+}
+
+
+void DrawingTracer::transform_bbox_by_ctm(double * bbox, GfxState * state)
+{
+#if ENABLE_SVG
+    cairo_matrix_t mat;
+    cairo_get_matrix(cairo, &mat); // use the matrix traced by the cairo context; `state` is ignored in this branch
+    double mat_a[6] {mat.xx, mat.yx, mat.xy, mat.yy, mat.x0, mat.y0};
+    tm_transform_bbox(mat_a, bbox);
+#else
+    tm_transform_bbox(state->getCTM(), bbox); // dereferences `state`, so it must not be null in this branch
+#endif
+}
+
+} /* namespace pdf2htmlEX */
diff --git a/src/DrawingTracer.h b/src/DrawingTracer.h
new file mode 100644
index 0000000..2e3159d
--- /dev/null
+++ b/src/DrawingTracer.h
@@ -0,0 +1,79 @@
+/*
+ * DrawingTracer.h
+ *
+ *  Created on: 2014-6-15
+ *      Author: duanyao
+ */
+
+#ifndef DRAWINGTRACER_H__
+#define DRAWINGTRACER_H__
+
+#include <functional>
+
+#include <GfxState.h>
+
+#include "pdf2htmlEX-config.h"
+
+#if ENABLE_SVG
+#include <cairo.h>
+#endif
+
+#include "Param.h"
+
+namespace pdf2htmlEX
+{
+
+class DrawingTracer
+{
+public:
+    /*
+     * The callbacks that receive drawn events; an unset callback is simply skipped.
+     * bbox in device space.
+     */
+    // a non-char graphics is drawn
+    std::function<void(double * bbox)> on_non_char_drawn;
+    // a char is drawn in the clip area
+    std::function<void(double * bbox)> on_char_drawn;
+    // a char is drawn out of/partially in the clip area
+    std::function<void(double * bbox, bool patially)> on_char_clipped;
+
+    DrawingTracer(const Param & param);
+    virtual ~DrawingTracer();
+    void reset(GfxState * state); // starts a fresh trace; does nothing unless param.correct_text_visibility is set
+
+    /*
+     * A character is being drawn
+     * x, y: glyph-drawing position, in PDF text object space.
+     * ax, ay: glyph advance, in glyph space.
+     */
+    void draw_char(GfxState * state, double x, double y, double ax, double ay);
+    /*
+     * An image is being drawn
+     */
+    void draw_image(GfxState * state);
+    void update_ctm(GfxState * state, double m11, double m12, double m21, double m22, double m31, double m32);
+    void clip(GfxState * state, bool even_odd = false);
+    void clip_to_stroke_path(GfxState * state); // currently a stub (see the TODO in the .cc)
+    void fill(GfxState * state, bool even_odd = false);
+    void stroke(GfxState * state);
+    void save();
+    void restore();
+
+private:
+    void finish(); // destroys the cairo context, if any
+    // Following methods operate in user space (just before CTM is applied)
+    void do_path(GfxState * state, GfxPath * path);
+    void draw_non_char_bbox(GfxState * state, double * bbox);
+    void draw_char_bbox(GfxState * state, double * bbox);
+    // If cairo is available, parameter state is ignored
+    void transform_bbox_by_ctm(double * bbox, GfxState * state = nullptr);
+
+    const Param & param; // reference member: the Param object must outlive this tracer
+
+#if ENABLE_SVG
+    cairo_t * cairo; // NOTE(review): raw pointer destroyed in finish() and no copy constructor is declared - confirm the tracer is never copied
+#endif
+};
+
+} /* namespace pdf2htmlEX */
+#endif /* DRAWINGTRACER_H__ */
diff --git a/src/HTMLRenderer/HTMLRenderer.h b/src/HTMLRenderer/HTMLRenderer.h
new file mode 100644
index 0000000..18e395d
--- /dev/null
+++ b/src/HTMLRenderer/HTMLRenderer.h
@@ -0,0 +1,348 @@
+/*
+ * HTMLRenderer.h
+ *
+ * Copyright (C) 2012,2013 Lu Wang <coolwanglu@gmail.com>
+ */
+
+#ifndef HTMLRENDERER_H_
+#define HTMLRENDERER_H_
+
+#include <unordered_map>
+#include <cstdint>
+#include <fstream>
+#include <memory>
+
+#include <OutputDev.h>
+#include <GfxState.h>
+#include <Stream.h>
+#include <PDFDoc.h>
+#include <goo/gtypes.h>
+#include <Object.h>
+#include <GfxFont.h>
+#include <Annot.h>
+
+// for form.cc
+#include <Page.h>
+#include <Form.h>
+
+#include "pdf2htmlEX-config.h"
+
+#include "Param.h"
+#include "Preprocessor.h"
+#include "StringFormatter.h"
+#include "TmpFiles.h"
+#include "Color.h"
+#include "StateManager.h"
+#include "HTMLTextPage.h"
+
+#include "BackgroundRenderer/BackgroundRenderer.h"
+#include "CoveredTextDetector.h"
+#include "DrawingTracer.h"
+
+#include "util/const.h"
+#include "util/misc.h"
+
+
+namespace pdf2htmlEX {
+
+struct HTMLRenderer : OutputDev // output device that receives the PDF drawing callbacks and keeps the state needed to emit HTML/CSS
+{
+    HTMLRenderer(const Param & param);
+    virtual ~HTMLRenderer();
+
+    void process(PDFDoc * doc); // entry point for converting a document (defined outside this header)
+
+    ////////////////////////////////////////////////////
+    // OutputDev interface
+    ////////////////////////////////////////////////////
+    
+    // Does this device use upside-down coordinates?
+    // (Upside-down means (0,0) is the top left corner of the page.)
+    virtual GBool upsideDown() { return gFalse; }
+
+    // Does this device use drawChar() or drawString()?
+    virtual GBool useDrawChar() { return gFalse; } // gFalse: text is received through drawString() below
+
+    // Does this device use functionShadedFill(), axialShadedFill(), and
+    // radialShadedFill()?  If this returns false, these shaded fills
+    // will be reduced to a series of other drawing operations.
+    virtual GBool useShadedFills(int type) { return (type == 2) ? gTrue: gFalse; } // only type 2 is accepted; axialShadedFill() is the only shaded fill overridden here
+
+    // Does this device use beginType3Char/endType3Char?  Otherwise,
+    // text in Type 3 fonts will be drawn with drawChar/drawString.
+    virtual GBool interpretType3Chars() { return gFalse; }
+
+    // Does this device need non-text content?
+    virtual GBool needNonText() { return (param.process_nontext) ? gTrue: gFalse; }
+
+    // Does this device need to clip pages to the crop box even when the
+    // box is the crop box?
+    virtual GBool needClipToCropBox() { return gTrue; }
+
+    virtual void setDefaultCTM(double *ctm);
+
+    // Start a page.
+    virtual void startPage(int pageNum, GfxState *state, XRef * xref);
+
+    // End a page.
+    virtual void endPage();
+
+    /*
+     * To reduce false alarms,
+     * the hooks below just mark a state as changed; whether it really changed is re-checked when we are about to output a new string
+     */
+
+    virtual void restoreState(GfxState * state);
+
+    virtual void saveState(GfxState *state);
+
+    virtual void updateAll(GfxState * state);
+
+    virtual void updateRise(GfxState * state);
+    virtual void updateTextPos(GfxState * state);
+    virtual void updateTextShift(GfxState * state, double shift);
+
+    virtual void updateFont(GfxState * state);
+    virtual void updateCTM(GfxState * state, double m11, double m12, double m21, double m22, double m31, double m32);
+    virtual void updateTextMat(GfxState * state);
+    virtual void updateHorizScaling(GfxState * state);
+
+    virtual void updateCharSpace(GfxState * state);
+    virtual void updateWordSpace(GfxState * state);
+
+    virtual void updateRender(GfxState * state);
+
+    virtual void updateFillColorSpace(GfxState * state);
+    virtual void updateStrokeColorSpace(GfxState * state);
+    virtual void updateFillColor(GfxState * state);
+    virtual void updateStrokeColor(GfxState * state);
+
+
+    /*
+     * Rendering
+     */
+
+    virtual void clip(GfxState * state);
+    virtual void eoClip(GfxState * state);
+    virtual void clipToStrokePath(GfxState * state);
+    
+    virtual void drawString(GfxState * state, GooString * s);
+
+    virtual void drawImage(GfxState * state, Object * ref, Stream * str, int width, int height, GfxImageColorMap * colorMap, GBool interpolate, int *maskColors, GBool inlineImg);
+
+    virtual void drawSoftMaskedImage(GfxState *state, Object *ref, Stream *str,
+                       int width, int height,
+                       GfxImageColorMap *colorMap,
+                       GBool interpolate,
+                       Stream *maskStr,
+                       int maskWidth, int maskHeight,
+                       GfxImageColorMap *maskColorMap,
+                       GBool maskInterpolate);
+
+    virtual void stroke(GfxState *state); 
+    virtual void fill(GfxState *state);
+    virtual void eoFill(GfxState *state);
+    virtual GBool axialShadedFill(GfxState *state, GfxAxialShading *shading, double tMin, double tMax);
+
+    virtual void processLink(AnnotLink * al);
+
+    /*
+     * Covered text handling.
+     */
+    // Is a char (actually a glyph) covered by non-chars? The index is in drawing order within the current page.
+    // Does not fail on out-of-bound conditions, but returns false.
+    bool is_char_covered(int index);
+    // Number of chars (glyphs) drawn so far in the current page.
+    int get_char_count() { return (int)covered_text_detector.get_chars_covered().size(); }
+
+protected:
+    ////////////////////////////////////////////////////
+    // misc
+    ////////////////////////////////////////////////////
+    void pre_process(PDFDoc * doc);
+    void post_process(void);
+
+    void process_outline(void);
+    void process_outline_items(GooList * items);
+
+    void process_form(std::ofstream & out);
+    
+    void set_stream_flags (std::ostream & out);
+
+    void dump_css(void);
+
+    // convert a LinkAction to a string that our Javascript code can understand
+    std::string get_linkaction_str(LinkAction *, std::string & detail);
+
+    ////////////////////////////////////////////////////
+    /*
+     * manage fonts
+     *
+     * In PDF: (install_*)
+     * embedded font: fonts embedded in PDF
+     * external font: fonts that have only names provided in PDF, the viewer should find a local font to match with
+     *
+     * In HTML: (export_*)
+     * remote font: to be retrieved from the web server
+     * remote default font: fallback styles for invalid fonts
+     * local font: to be substituted with a local (client side) font
+     */
+    ////////////////////////////////////////////////////
+    std::string dump_embedded_font(GfxFont * font, FontInfo & info); // returns the path of the dumped font file
+    std::string dump_type3_font(GfxFont * font, FontInfo & info); // same, for Type 3 fonts (requires info.is_type3)
+    void embed_font(const std::string & filepath, GfxFont * font, FontInfo & info, bool get_metric_only = false); // get_metric_only: only fill in the metrics of info, then stop
+    const FontInfo * install_font(GfxFont * font);
+    void install_embedded_font(GfxFont * font, FontInfo & info);
+    void install_external_font (GfxFont * font, FontInfo & info);
+    void export_remote_font(const FontInfo & info, const std::string & suffix, GfxFont * font);
+    void export_remote_default_font(long long fn_id);
+    void export_local_font(const FontInfo & info, GfxFont * font, const std::string & original_font_name, const std::string & cssfont);
+
+    // depending on --embed***, to embed the content or add a link to it
+    // "type": specify the file type, usually it's the suffix, in which case this parameter could be ""
+    // "copy": indicates whether to copy the file into dest_dir, if not embedded
+    void embed_file(std::ostream & out, const std::string & path, const std::string & type, bool copy);
+
+    ////////////////////////////////////////////////////
+    // state tracking 
+    ////////////////////////////////////////////////////
+    // reset all states
+    void reset_state();
+    // reset all ***_changed flags
+    void reset_state_change();
+    // check updated states, and determine new_line_state
+    // make sure this function can be called several times consecutively without problem
+    void check_state_change(GfxState * state);
+    // prepare the line context, (close old tags, open new tags)
+    // make sure the current HTML style is consistent with PDF
+    void prepare_text_line(GfxState * state);
+
+    ////////////////////////////////////////////////////
+    // PDF stuff
+    ////////////////////////////////////////////////////
+    
+    XRef * xref;
+    PDFDoc * cur_doc;
+    Catalog * cur_catalog;
+    int pageNum;
+
+    double default_ctm[6];
+
+    /*
+     * The content of each page is first scaled up by factor1 (>=1), then scaled back by factor2 (<=1)
+     *
+     * factor1 is multiplied into all metrics (height/width/font-size...), in order to improve accuracy
+     * factor2 is applied through a css transform, and is exposed to Javascript
+     *
+     * factor1 & factor2 are determined according to zoom and font-size-multiplier
+     *
+     */
+    double text_zoom_factor (void) const { return text_scale_factor1 * text_scale_factor2; }
+    double text_scale_factor1;
+    double text_scale_factor2;
+
+    // 1px on screen should be printed as print_scale()pt
+    double print_scale (void) const { return 96.0 / DEFAULT_DPI / text_zoom_factor(); }
+
+
+    const Param & param; // held by reference: the Param passed to the constructor must outlive the renderer
+
+    ////////////////////////////////////////////////////
+    // PDF states
+    ////////////////////////////////////////////////////
+    // track the original (unscaled) values to determine scaling and merge lines
+    // current position
+    double cur_tx, cur_ty; // real text position, in text coords
+    double cur_font_size;
+    // this is CTM * TextMAT in PDF
+    // as we'll calculate the position of the origin separately
+    double cur_text_tm[6]; // unscaled
+
+    bool all_changed; // the *_changed flags below are all cleared by reset_state_change() and examined by check_state_change()
+    bool ctm_changed;
+    bool rise_changed;
+    bool font_changed;
+    bool text_pos_changed; 
+    bool text_mat_changed;
+    bool fill_color_changed;
+    bool hori_scale_changed;
+    bool word_space_changed;
+    bool letter_space_changed;
+    bool stroke_color_changed;
+    bool clip_changed;
+
+    ////////////////////////////////////////////////////
+    // HTML states
+    ////////////////////////////////////////////////////
+
+    // optimize for web
+    // we try to render the final font size directly
+    // to reduce the effect of ctm as much as possible
+    
+    // the actual tm used is `real tm in PDF` scaled by 1/draw_text_scale, 
+    // so everything rendered should be multiplied by draw_text_scale
+    double draw_text_scale; 
+
+    // the position of next char, in text coords
+    // this is actual position (in HTML), which might be different from cur_tx/ty (in PDF)
+    // also keep in mind that they are not the final position, as they will be transformed by CTM (also true for cur_tx/ty)
+    double draw_tx, draw_ty; 
+
+
+    ////////////////////////////////////////////////////
+    // styles & resources
+    ////////////////////////////////////////////////////
+    // managers store values actually used in HTML (i.e. scaled)
+    std::unordered_map<long long, FontInfo> font_info_map;
+    AllStateManager all_manager;
+    HTMLTextState cur_text_state;
+    HTMLLineState cur_line_state;
+    HTMLClipState cur_clip_state;
+
+    HTMLTextPage html_text_page;
+
+    enum NewLineState
+    {
+        NLS_NONE,
+        NLS_NEWSTATE, 
+        NLS_NEWLINE,
+        NLS_NEWCLIP
+    } new_line_state; // determined by check_state_change()
+    
+    // for font reencoding
+    std::vector<int32_t> cur_mapping; // embed_font(): code (or GID) -> Unicode, -1 when unmapped
+    std::vector<char*> cur_mapping2; // embed_font(): char code -> glyph name, nullptr when unused
+    std::vector<int> width_list; // width of each char
+
+    Preprocessor preprocessor;
+
+    // manage temporary files
+    TmpFiles tmp_files;
+
+    // for string formatting
+    StringFormatter str_fmt;
+
+    // render background image
+    friend class SplashBackgroundRenderer; // ugly!
+#if ENABLE_SVG
+    friend class CairoBackgroundRenderer; // ugly!
+#endif
+
+    std::unique_ptr<BackgroundRenderer> bg_renderer, fallback_bg_renderer;
+
+    struct {
+        std::ofstream fs;
+        std::string path;
+    } f_outline, f_pages, f_css; // one output stream, together with its path, per kind of output
+    std::ofstream * f_curpage;
+    std::string cur_page_filename;
+
+    static const std::string MANIFEST_FILENAME;
+
+    CoveredTextDetector covered_text_detector;
+    DrawingTracer tracer; // receives the drawing operations forwarded from the hooks above (see draw.cc)
+};
+
+} //namespace pdf2htmlEX
+
+#endif /* HTMLRENDERER_H_ */
diff --git a/src/HTMLRenderer/draw.cc b/src/HTMLRenderer/draw.cc
new file mode 100644
index 0000000..6529418
--- /dev/null
+++ b/src/HTMLRenderer/draw.cc
@@ -0,0 +1,65 @@
+/*
+ * Draw.cc
+ *
+ * Handling path drawing
+ *
+ * by WangLu
+ * 2012.10.01
+ */
+
+#include <algorithm>
+#include <cmath>
+#include <sstream>
+#include <vector>
+#include <iostream>
+
+#include "HTMLRenderer.h"
+#include "util/misc.h"
+#include "util/math.h"
+#include "util/namespace.h"
+
+namespace pdf2htmlEX {
+
+using std::swap;
+using std::min;
+using std::max;
+using std::acos;
+using std::asin;
+using std::ostringstream;
+using std::sqrt;
+using std::vector;
+using std::ostream;
+
+void HTMLRenderer::restoreState(GfxState * state) // OutputDev hook: the graphics state has been restored
+{
+    updateAll(state); // any tracked state may differ after a restore, so let updateAll() handle all of them
+    tracer.restore(); // forward the restore to the drawing tracer as well
+}
+
+void HTMLRenderer::saveState(GfxState *state) // OutputDev hook: the graphics state is being saved (state itself is not used)
+{
+    tracer.save(); // counterpart of the tracer.restore() call in restoreState()
+}
+
+void HTMLRenderer::stroke(GfxState * state) // OutputDev hook: a path is stroked
+{
+    tracer.stroke(state); // nothing is emitted here; the stroke is only forwarded to the drawing tracer
+}
+
+void HTMLRenderer::fill(GfxState * state) // OutputDev hook: a path is filled
+{
+    tracer.fill(state); // forwarded to the drawing tracer with its default even_odd = false
+}
+
+void HTMLRenderer::eoFill(GfxState * state) // OutputDev hook: a path is filled using the even-odd rule
+{
+    tracer.fill(state, true); // same as fill(), but with even_odd = true
+}
+
+GBool HTMLRenderer::axialShadedFill(GfxState *state, GfxAxialShading *shading, double tMin, double tMax) // OutputDev hook for axial shadings, the only shading type accepted by useShadedFills()
+{
+    tracer.fill(state); //TODO correct? The shading is reported to the tracer as a plain fill; shading, tMin and tMax are not looked at.
+    return gTrue; // GBool constant, as used by the other OutputDev overrides; presumably tells the caller the fill has been handled
+}
+
+} // namespace pdf2htmlEX
diff --git a/src/HTMLRenderer/font.cc b/src/HTMLRenderer/font.cc
new file mode 100644
index 0000000..10ff215
--- /dev/null
+++ b/src/HTMLRenderer/font.cc
@@ -0,0 +1,1089 @@
+/*
+ * font.cc
+ *
+ * Font processing
+ *
+ * Copyright (C) 2012,2013 Lu Wang <coolwanglu@gmail.com>
+ */
+
+#include <iostream>
+#include <cmath>
+#include <algorithm>
+#include <sstream>
+#include <cctype>
+#include <unordered_set>
+
+#include <GlobalParams.h>
+#include <fofi/FoFiTrueType.h>
+#include <CharCodeToUnicode.h>
+
+#include "Param.h"
+#include "HTMLRenderer.h"
+#include "Base64Stream.h"
+
+#include "pdf2htmlEX-config.h"
+
+#include "util/namespace.h"
+#include "util/math.h"
+#include "util/misc.h"
+#include "util/ffw.h"
+#include "util/path.h"
+#include "util/unicode.h"
+#include "util/css_const.h"
+
+#if ENABLE_SVG
+#include <cairo.h>
+#include <cairo-ft.h>
+#include <cairo-svg.h>
+#include "CairoFontEngine.h"
+#include "CairoOutputDev.h"
+#include <Gfx.h>
+#endif
+
+namespace pdf2htmlEX {
+
+using std::min;
+using std::unordered_set;
+using std::cerr;
+using std::endl;
+
+string HTMLRenderer::dump_embedded_font (GfxFont * font, FontInfo & info) // copies the font file embedded in the PDF into tmp_dir; returns its path, or "" if the dump was abandoned
+{
+    if(info.is_type3)
+        return dump_type3_font(font, info); // Type 3 fonts go through their own dumping routine
+
+    Object obj, obj1, obj2;
+    Object font_obj, font_obj2, fontdesc_obj;
+    string suffix;
+    string filepath;
+
+    long long fn_id = info.id;
+
+    try
+    {
+        // inspired by mupdf; `throw 0` below abandons the dump and falls through to the cleanup after the catch
+        string subtype;
+
+        auto * id = font->getID();
+
+        Object ref_obj; // temporary reference used to fetch the font object through xref
+        ref_obj.initRef(id->num, id->gen);
+        ref_obj.fetch(xref, &font_obj);
+        ref_obj.free();
+
+        if(!font_obj.isDict())
+        {
+            cerr << "Font object is not a dictionary" << endl;
+            throw 0;
+        }
+
+        Dict * dict = font_obj.getDict();
+        if(dict->lookup("DescendantFonts", &font_obj2)->isArray()) // if present, the FontDescriptor is looked up in the first descendant font instead
+        {
+            if(font_obj2.arrayGetLength() == 0)
+            {
+                cerr << "Warning: empty DescendantFonts array" << endl;
+            }
+            else
+            {
+                if(font_obj2.arrayGetLength() > 1)
+                    cerr << "TODO: multiple entries in DescendantFonts array" << endl;
+
+                if(font_obj2.arrayGet(0, &obj2)->isDict())
+                {
+                    dict = obj2.getDict(); // dict now points into obj2, which is only freed at the end of the function
+                }
+            }
+        }
+
+        if(!dict->lookup("FontDescriptor", &fontdesc_obj)->isDict())
+        {
+            cerr << "Cannot find FontDescriptor " << endl;
+            throw 0;
+        }
+
+        dict = fontdesc_obj.getDict();
+
+        if(dict->lookup("FontFile3", &obj)->isStream()) // tried in order: FontFile3, FontFile2, FontFile; obj ends up holding the stream
+        {
+            if(obj.streamGetDict()->lookup("Subtype", &obj1)->isName()) // for FontFile3 the stream's Subtype selects the suffix
+            {
+                subtype = obj1.getName();
+                if(subtype == "Type1C")
+                {
+                    suffix = ".cff";
+                }
+                else if (subtype == "CIDFontType0C")
+                {
+                    suffix = ".cid";
+                }
+                else if (subtype == "OpenType")
+                {
+                    suffix = ".otf";
+                }
+                else
+                {
+                    cerr << "Unknown subtype: " << subtype << endl;
+                    throw 0;
+                }
+            }
+            else
+            {
+                cerr << "Invalid subtype in font descriptor" << endl;
+                throw 0;
+            }
+        }
+        else if (dict->lookup("FontFile2", &obj)->isStream())
+        { 
+            suffix = ".ttf";
+        }
+        else if (dict->lookup("FontFile", &obj)->isStream())
+        {
+            suffix = ".pfa";
+        }
+        else
+        {
+            cerr << "Cannot find FontFile for dump" << endl;
+            throw 0;
+        }
+
+        if(suffix == "") // defensive: every branch above either sets suffix or throws
+        {
+            cerr << "Font type unrecognized" << endl;
+            throw 0;
+        }
+
+        obj.streamReset();
+
+        filepath = (char*)str_fmt("%s/f%llx%s", param.tmp_dir.c_str(), fn_id, suffix.c_str()); // <tmp_dir>/f<hex font id><suffix>
+        tmp_files.add(filepath);
+
+        ofstream outf(filepath, ofstream::binary);
+        if(!outf)
+            throw string("Cannot open file ") + filepath + " for writing"; // NOTE(review): a string is not caught by catch(int) below, so this propagates to the caller and skips the Object::free() calls
+
+        char buf[1024];
+        int len;
+        while((len = obj.streamGetChars(1024, (Guchar*)buf)) > 0) // copy the stream in chunks of up to 1024 bytes
+        {
+            outf.write(buf, len); // NOTE(review): write errors are not checked
+        }
+        obj.streamClose();
+    }
+    catch(int) 
+    {
+        cerr << "Something wrong when trying to dump font " << hex << fn_id << dec << endl; // filepath is still empty here: every `throw 0` happens before it is assigned
+    }
+
+    obj2.free();
+    obj1.free();
+    obj.free();
+
+    fontdesc_obj.free();
+    font_obj2.free();
+    font_obj.free();
+
+    return filepath;
+}
+
+// Rebuilds a Type 3 font as a .ttf file in tmp_dir: every used char code is rendered to an SVG glyph with cairo
+// and imported through ffw. Returns the path of the saved font, or "" when built without ENABLE_SVG.
+string HTMLRenderer::dump_type3_font (GfxFont * font, FontInfo & info)
+{
+    assert(info.is_type3);
+
+#if ENABLE_SVG
+    long long fn_id = info.id;
+
+    FT_Library ft_lib;
+    FT_Init_FreeType(&ft_lib); // NOTE(review): no matching FT_Done_FreeType() is visible in this function
+    CairoFontEngine font_engine(ft_lib); 
+    auto * cur_font = font_engine.getFont(font, cur_doc, true, xref);
+    auto used_map = preprocessor.get_code_map(hash_ref(font->getID())); // indexed by char code; non-zero for codes to dump
+
+    //calculate transformed metrics
+    double * font_bbox = font->getFontBBox();
+    double * font_matrix = font->getFontMatrix();
+    double transformed_bbox[4];
+    memcpy(transformed_bbox, font_bbox, 4 * sizeof(double));
+    /*
+    // add the origin to the bbox
+    if(transformed_bbox[0] > 0) transformed_bbox[0] = 0;
+    if(transformed_bbox[1] > 0) transformed_bbox[1] = 0;
+    if(transformed_bbox[2] < 0) transformed_bbox[2] = 0;
+    if(transformed_bbox[3] < 0) transformed_bbox[3] = 0;
+    */
+    tm_transform_bbox(font_matrix, transformed_bbox);
+    double transformed_bbox_width = transformed_bbox[2] - transformed_bbox[0];
+    double transformed_bbox_height = transformed_bbox[3] - transformed_bbox[1];
+    info.font_size_scale = std::max(transformed_bbox_width, transformed_bbox_height); // longer edge of the transformed font bbox
+
+    // we want the glyphs to be rendered in a box of size around GLYPH_DUMP_EM_SIZE x GLYPH_DUMP_EM_SIZE
+    // for rectangles, the longer edge should be GLYPH_DUMP_EM_SIZE
+    const double GLYPH_DUMP_EM_SIZE = 100.0;
+    double scale = GLYPH_DUMP_EM_SIZE / info.font_size_scale;
+
+    // we choose ttf as it does not use char names
+    // or actually we don't use char names for ttf (see embed_font)
+    ffw_new_font();
+    // dump each glyph into svg and combine them
+    for(int code = 0; code < 256; ++code)
+    {
+        if(!used_map[code]) continue;
+
+        cairo_surface_t * surface = nullptr;
+
+        string glyph_filename = (char*)str_fmt("%s/f%llx-%x.svg", param.tmp_dir.c_str(), fn_id, code);
+        tmp_files.add(glyph_filename);
+
+        surface = cairo_svg_surface_create(glyph_filename.c_str(), transformed_bbox_width * scale, transformed_bbox_height * scale);
+
+        cairo_svg_surface_restrict_to_version(surface, CAIRO_SVG_VERSION_1_2);
+        cairo_surface_set_fallback_resolution(surface, param.h_dpi, param.v_dpi);
+        cairo_t * cr = cairo_create(surface);
+
+        // track the position of the origin
+        double ox, oy;
+        ox = oy = 0.0;
+
+        auto glyph_width = ((Gfx8BitFont*)font)->getWidth(code);
+
+#if 1 // the #else branch is an alternative implementation that is not compiled
+        {
+            // paint the glyph
+            cairo_set_font_face(cr, cur_font->getFontFace());
+
+            cairo_matrix_t m1, m2, m3;
+            // set up m1
+            // m1 shift the bottom-left corner of the glyph bbox to the origin
+            // also set font size to scale
+            cairo_matrix_init_translate(&m1, -transformed_bbox[0], transformed_bbox[1]);
+            cairo_matrix_init_scale(&m2, scale, scale);
+            cairo_matrix_multiply(&m1, &m1, &m2);
+            cairo_set_font_matrix(cr, &m1);
+
+            cairo_glyph_t glyph;
+            glyph.index = cur_font->getGlyph(code, nullptr, 0);
+            glyph.x = 0;
+            glyph.y = GLYPH_DUMP_EM_SIZE;
+            cairo_show_glyphs(cr, &glyph, 1);
+
+
+            // apply the type 3 font's font matrix before m1
+            // such that we got the mapping from type 3 font space to user space, then we will be able to calculate mapped position for ox,oy and glyph_width
+            cairo_matrix_init(&m2, font_matrix[0], font_matrix[1], font_matrix[2], font_matrix[3], font_matrix[4], font_matrix[5]);
+            cairo_matrix_init_scale(&m3, 1, -1);
+            cairo_matrix_multiply(&m2, &m2, &m3);
+            cairo_matrix_multiply(&m2, &m2, &m1);
+
+            cairo_matrix_transform_point(&m2, &ox, &oy);
+            double dummy = 0;
+            cairo_matrix_transform_distance(&m2, &glyph_width, &dummy);
+        }
+#else
+        {
+            // manually draw the char to get the metrics
+            // adapted from _render_type3_glyph of poppler
+            cairo_matrix_t ctm, m, m1;
+            cairo_matrix_init_identity(&ctm);
+
+            // apply font-matrix
+            cairo_matrix_init(&m, font_matrix[0], font_matrix[1], font_matrix[2], font_matrix[3], font_matrix[4], font_matrix[5]);
+            cairo_matrix_multiply(&ctm, &ctm, &m);
+
+            // shift origin
+            cairo_matrix_init_translate(&m1, -transformed_bbox[0], -transformed_bbox[1]);
+            cairo_matrix_multiply(&ctm, &ctm, &m1);
+
+            // make it upside down since the difference between the glyph coordination and cairo coordination
+            cairo_matrix_init_scale(&m1, 1, -1);
+            cairo_matrix_multiply(&ctm, &ctm, &m1);
+            // save m*m1 to m1 for later use
+            cairo_matrix_multiply(&m1, &m, &m1);
+
+            // shift up to the bounding box
+            cairo_matrix_init_translate(&m, 0.0, transformed_bbox_height);
+            cairo_matrix_multiply(&ctm, &ctm, &m);
+
+            // scale up 
+            cairo_matrix_init_scale(&m, scale, scale);
+            cairo_matrix_multiply(&ctm, &ctm, &m);
+
+            // set ctm
+            cairo_set_matrix(cr, &ctm);
+
+            // calculate the position of origin
+            cairo_matrix_transform_point(&ctm, &ox, &oy);
+            oy -= transformed_bbox_height * scale;
+            // calculate glyph width
+            double dummy = 0;
+            cairo_matrix_transform_distance(&ctm, &glyph_width, &dummy);
+
+            // draw the glyph
+            auto output_dev = new CairoOutputDev();
+            output_dev->setCairo(cr);
+            output_dev->setPrinting(true);
+
+            PDFRectangle box;
+            box.x1 = font_bbox[0];
+            box.y1 = font_bbox[1];
+            box.x2 = font_bbox[2];
+            box.y2 = font_bbox[3];
+            auto gfx = new Gfx(cur_doc, output_dev, 
+                    ((Gfx8BitFont*)font)->getResources(),
+                    &box, nullptr);
+            output_dev->startDoc(cur_doc, &font_engine);
+            output_dev->startPage(1, gfx->getState(), gfx->getXRef());
+            output_dev->setInType3Char(gTrue);
+            auto char_procs = ((Gfx8BitFont*)font)->getCharProcs();
+            Object char_proc_obj;
+            auto glyph_index = cur_font->getGlyph(code, nullptr, 0);
+            gfx->display(char_procs->getVal(glyph_index, &char_proc_obj));
+
+            char_proc_obj.free();
+            delete gfx;
+            delete output_dev;
+        }
+#endif
+
+        // Query both statuses and release both cairo objects before reporting anything:
+        // throwing right after cairo_destroy(cr) would leave the surface unreleased.
+        // NOTE(review): the font opened by ffw_new_font() is still not closed when we throw here.
+        auto cr_status = cairo_status(cr);
+        cairo_destroy(cr);
+        cairo_surface_finish(surface);
+        auto surface_status = cairo_surface_status(surface);
+        cairo_surface_destroy(surface);
+        if(cr_status)
+            throw string("Cairo error: ") + cairo_status_to_string(cr_status);
+        if(surface_status)
+            throw string("Error in cairo: ") + cairo_status_to_string(surface_status);
+
+        ffw_import_svg_glyph(code, glyph_filename.c_str(), ox / GLYPH_DUMP_EM_SIZE, -oy / GLYPH_DUMP_EM_SIZE, glyph_width / GLYPH_DUMP_EM_SIZE); // origin and width are passed relative to the dump em size
+    }
+
+    string font_filename = (char*)str_fmt("%s/f%llx.ttf", param.tmp_dir.c_str(), fn_id);
+    tmp_files.add(font_filename);
+    ffw_save(font_filename.c_str());
+    ffw_close();
+
+    return font_filename;
+#else
+    return "";
+#endif
+}
+
+void HTMLRenderer::embed_font(const string & filepath, GfxFont * font, FontInfo & info, bool get_metric_only)
+{
+    if(param.debug)
+    {
+        cerr << "Embed font: " << filepath << " " << info.id << endl;
+    }
+
+    ffw_load_font(filepath.c_str());
+    ffw_prepare_font();
+
+    if(param.debug)
+    {
+        auto fn = str_fmt("%s/__raw_font_%llx%s", param.tmp_dir.c_str(), info.id, get_suffix(filepath).c_str());
+        tmp_files.add((char*)fn);
+        ofstream((char*)fn, ofstream::binary) << ifstream(filepath).rdbuf();
+    }
+
+    int * code2GID = nullptr;
+    int code2GID_len = 0;
+    int maxcode = 0;
+
+    Gfx8BitFont * font_8bit = nullptr;
+    GfxCIDFont * font_cid = nullptr;
+
+    string suffix = get_suffix(filepath);
+    for(auto & c : suffix)
+        c = tolower(c);
+
+    /*
+     * if parm->tounicode is 0, try the provided tounicode map first
+     */
+    info.use_tounicode = (param.tounicode >= 0);
+    bool has_space = false;
+
+    const char * used_map = nullptr;
+
+    info.em_size = ffw_get_em_size();
+
+    if(param.debug)
+    {
+        cerr << "em size: " << info.em_size << endl;
+    }
+
+    info.space_width = 0;
+
+    if(!font->isCIDFont())
+    {
+        font_8bit = dynamic_cast<Gfx8BitFont*>(font);
+    }
+    else
+    {
+        font_cid = dynamic_cast<GfxCIDFont*>(font);
+    }
+
+    if(get_metric_only)
+    {
+        ffw_fix_metric();
+        ffw_get_metric(&info.ascent, &info.descent);
+        ffw_close();
+        return;
+    }
+
+    used_map = preprocessor.get_code_map(hash_ref(font->getID()));
+
+    /*
+     * Step 1
+     * dump the font file directly from the font descriptor and put the glyphs into the correct slots *
+     *
+     * for 8bit + nonTrueType
+     * re-encoding the font by glyph names
+     *
+     * for 8bit + TrueType
+     * sort the glpyhs as the original order, and load the code2GID table
+     * later we will map GID (instead of char code) to Unicode
+     *
+     * for CID + nonTrueType
+     * Flatten the font 
+     *
+     * for CID Truetype
+     * same as 8bitTrueType, except for that we have to check 65536 charcodes
+     * use the embedded code2GID table if there is, otherwise use the one in the font
+     */
+    if(font_8bit)
+    {
+        maxcode = 0xff;
+        if(is_truetype_suffix(suffix))
+        {
+            if(info.is_type3)
+            {
+                /*
+                 * Type 3 fonts are saved and converted into ttf fonts
+                 * encoded based on code points instead of GID
+                 *
+                 * I thought code2GID would work but it never works, and I don't know why
+                 * Anyway we can disable code2GID such that the following procedure will be working based on code points instead of GID
+                 */
+            }
+            else
+            {
+                ffw_reencode_glyph_order();
+                if(FoFiTrueType * fftt = FoFiTrueType::load((char*)filepath.c_str()))
+                {
+                    code2GID = font_8bit->getCodeToGIDMap(fftt);
+                    code2GID_len = 256;
+                    delete fftt;
+                }
+            }
+        }
+        else
+        {
+            // move the slot such that it's consistent with the encoding seen in PDF
+            unordered_set<string> nameset;
+            bool name_conflict_warned = false;
+
+            std::fill(cur_mapping2.begin(), cur_mapping2.end(), (char*)nullptr);
+
+            for(int i = 0; i < 256; ++i)
+            {
+                if(!used_map[i]) continue;
+
+                auto cn = font_8bit->getCharName(i);
+                if(cn == nullptr)
+                {
+                    continue;
+                }
+                else
+                {
+                    if(nameset.insert(string(cn)).second)
+                    {
+                        cur_mapping2[i] = cn;    
+                    }
+                    else
+                    {
+                        if(!name_conflict_warned)
+                        {
+                            name_conflict_warned = true;
+                            //TODO: may be resolved using advanced font properties?
+                            cerr << "Warning: encoding conflict detected in font: " << hex << info.id << dec << endl;
+                        }
+                    }
+                }
+            }
+
+            ffw_reencode_raw2(cur_mapping2.data(), 256, 0);
+        }
+    }
+    else
+    {
+        maxcode = 0xffff;
+
+        if(is_truetype_suffix(suffix))
+        {
+            ffw_reencode_glyph_order();
+
+            GfxCIDFont * _font = dynamic_cast<GfxCIDFont*>(font);
+
+            // To locate CID2GID for the font
+            // as in CairoFontEngine.cc
+            if((code2GID = _font->getCIDToGID()))
+            {
+                // use the mapping stored in _font
+                code2GID_len = _font->getCIDToGIDLen();
+            }
+            else
+            {
+                // use the mapping stored in the file
+                if(FoFiTrueType * fftt = FoFiTrueType::load((char*)filepath.c_str()))
+                {
+                    code2GID = _font->getCodeToGIDMap(fftt, &code2GID_len);
+                    delete fftt;
+                }
+            }
+        }
+        else
+        {
+            // TODO: add an option to load the table?
+            ffw_cidflatten();
+        }
+    }
+
+    /*
+     * Step 2
+     * - map charcode (or GID for CID truetype)
+     *
+     * -> Always map to Unicode for 8bit TrueType fonts and CID fonts
+     *
+     * -> For 8bit nonTruetype fonts:
+     *   Try to calculate the correct Unicode value from the glyph names, when collision is detected in ToUnicode Map
+     * 
+     * - Fill in the width_list, and set widths accordingly
+     */
+
+
+    {
+        string map_filename;
+        ofstream map_outf;
+        if(param.debug)
+        {
+            map_filename = (char*)str_fmt("%s/f%llx.map", param.tmp_dir.c_str(), info.id);
+            tmp_files.add(map_filename);
+            map_outf.open(map_filename);
+        }
+
+        unordered_set<int> codeset;
+        bool name_conflict_warned = false;
+
+        auto ctu = font->getToUnicode();
+        std::fill(cur_mapping.begin(), cur_mapping.end(), -1);
+        std::fill(width_list.begin(), width_list.end(), -1);
+
+        if(code2GID)
+            maxcode = min<int>(maxcode, code2GID_len - 1);
+
+        bool is_truetype = is_truetype_suffix(suffix);
+        int max_key = maxcode;
+        /*
+         * Traverse all possible codes
+         */
+        bool retried = false; // avoid infinite loop
+        for(int cur_code = 0; cur_code <= maxcode; ++cur_code)
+        {
+            if(!used_map[cur_code])
+                continue;
+
+            /*
+             * Skip glyphs without names (only for non-ttf fonts)
+             */
+            if(!is_truetype && (font_8bit != nullptr) 
+                    && (font_8bit->getCharName(cur_code) == nullptr))
+            {
+                continue;
+            }
+
+            int mapped_code = cur_code;
+            if(code2GID)
+            {
+                // for fonts with GID (e.g. TTF) we need to map GIDs instead of codes
+                if((mapped_code = code2GID[cur_code]) == 0) continue;
+            }
+
+            if(mapped_code > max_key)
+                max_key = mapped_code;
+
+            Unicode u, *pu=&u;
+            if(info.use_tounicode)
+            {
+                int n = ctu ? (ctu->mapToUnicode(cur_code, &pu)) : 0;
+                u = check_unicode(pu, n, cur_code, font);
+            }
+            else
+            {
+                u = unicode_from_font(cur_code, font);
+            }
+
+            if(codeset.insert(u).second)
+            {
+                cur_mapping[mapped_code] = u;
+            }
+            else
+            {
+                // collision detected
+                if(param.tounicode == 0)
+                {
+                    // in auto mode, just drop the tounicode map
+                    if(!retried)
+                    {
+                        cerr << "ToUnicode CMap is not valid and got dropped for font: " << hex << info.id << dec << endl;
+                        retried = true;
+                        codeset.clear();
+                        info.use_tounicode = false;
+                        std::fill(cur_mapping.begin(), cur_mapping.end(), -1);
+                        std::fill(width_list.begin(), width_list.end(), -1);
+                        cur_code = -1;
+                        if(param.debug)
+                        {
+                            map_outf.close();
+                            map_outf.open(map_filename);
+                        }
+                        continue;
+                    }
+                }
+                if(!name_conflict_warned)
+                {
+                    name_conflict_warned = true;
+                    //TODO: may be resolved using advanced font properties?
+                    cerr << "Warning: encoding confliction detected in font: " << hex << info.id << dec << endl;
+                }
+            }
+
+            {
+                double cur_width = 0;
+                if(font_8bit)
+                {
+                    cur_width = font_8bit->getWidth(cur_code);
+                }
+                else
+                {
+                    char buf[2];  
+                    buf[0] = (cur_code >> 8) & 0xff;
+                    buf[1] = (cur_code & 0xff);
+                    cur_width = font_cid->getWidth(buf, 2) ;
+                }
+
+                cur_width /= info.font_size_scale;
+
+                if(u == ' ')
+                {
+                    /*
+                     * Internet Explorer will ignore `word-spacing` if
+                     * the width of the 'space' glyph is 0
+                     *
+                     * space_width==0 often means no spaces are used in the PDF
+                     * so setting it to be 0.001 should be safe
+                     */
+                    if(equal(cur_width, 0))
+                        cur_width = 0.001;
+
+                    info.space_width = cur_width;
+                    has_space = true;
+                }
+                
+                width_list[mapped_code] = (int)floor(cur_width * info.em_size + 0.5);
+            }
+
+            if(param.debug)
+            {
+                map_outf << hex << cur_code << ' ' << mapped_code << ' ' << u << endl;
+            }
+        }
+
+        ffw_set_widths(width_list.data(), max_key + 1, param.stretch_narrow_glyph, param.squeeze_wide_glyph);
+        
+        ffw_reencode_raw(cur_mapping.data(), max_key + 1, 1);
+
+        // In some space offsets in HTML, we insert a ' ' there in order to improve text copy&paste
+        // We need to make sure that ' ' is in the font, otherwise it would be very ugly if you select the text
+        // Might be a problem if ' ' is in the font, but not empty
+        if(!has_space)
+        {
+            if(font_8bit)
+            {
+                info.space_width = font_8bit->getWidth(' ');
+            }
+            else
+            {
+                char buf[2] = {0, ' '};
+                info.space_width = font_cid->getWidth(buf, 2);
+            }
+            info.space_width /= info.font_size_scale;
+
+            /* See comments above */
+            if(equal(info.space_width,0))
+                info.space_width = 0.001;
+
+            ffw_add_empty_char((int32_t)' ', (int)floor(info.space_width * info.em_size + 0.5));
+            if(param.debug)
+            {
+                cerr << "Missing space width in font " << hex << info.id << ": set to " << dec << info.space_width << endl;
+            }
+        }
+
+        if(param.debug)
+        {
+            cerr << "space width: " << info.space_width << endl;
+        }
+
+        if(ctu)
+            ctu->decRefCnt();
+    }
+
+    /*
+     * Step 3
+     * Generate the font as desired
+     */
+
+    // Reencode to Unicode Full such that FontForge won't ditch unicode values larger than 0xFFFF
+    ffw_reencode_unicode_full();
+
+    // Due to a bug of Fontforge about pfa -> woff conversion
+    // we always generate TTF first, instead of the format specified by user
+    string cur_tmp_fn = (char*)str_fmt("%s/__tmp_font1.%s", param.tmp_dir.c_str(), "ttf");
+    tmp_files.add(cur_tmp_fn);
+    string other_tmp_fn = (char*)str_fmt("%s/__tmp_font2.%s", param.tmp_dir.c_str(), "ttf");
+    tmp_files.add(other_tmp_fn);
+
+    ffw_save(cur_tmp_fn.c_str());
+
+    ffw_close();
+
+    /*
+     * Step 4
+     * Font Hinting
+     */
+    bool hinted = false;
+
+    // Call external hinting program if specified 
+    if(param.external_hint_tool != "")
+    {
+        hinted = (system((char*)str_fmt("%s \"%s\" \"%s\"", param.external_hint_tool.c_str(), cur_tmp_fn.c_str(), other_tmp_fn.c_str())) == 0);
+    }
+
+    // Call internal hinting procedure if specified 
+    if((!hinted) && (param.auto_hint))
+    {
+        ffw_load_font(cur_tmp_fn.c_str());
+        ffw_auto_hint();
+        ffw_save(other_tmp_fn.c_str());
+        ffw_close();
+        hinted = true;
+    }
+
+    if(hinted)
+    {
+        swap(cur_tmp_fn, other_tmp_fn);
+    }
+
+    /* 
+     * Step 5 
+     * Generate the font, load the metrics and set the embedding bits (fstype)
+     *
+     * Ascent/Descent are not used in PDF, and the values in PDF may be wrong or inconsistent (there are 3 sets of them)
+     * We need to reload in order to retrieve/fix accurate ascent/descent, some info won't be written to the font by fontforge until saved.
+     */
+    string fn = (char*)str_fmt("%s/f%llx.%s", 
+        (param.embed_font ? param.tmp_dir : param.dest_dir).c_str(),
+        info.id, param.font_format.c_str());
+
+    if(param.embed_font)
+        tmp_files.add(fn);
+
+    ffw_load_font(cur_tmp_fn.c_str());
+    ffw_fix_metric();
+    ffw_get_metric(&info.ascent, &info.descent);
+    if(param.override_fstype)
+        ffw_override_fstype();
+    ffw_save(fn.c_str());
+
+    ffw_close();
+}
+
+
+/*
+ * Return the FontInfo registered for `font`, installing the font first
+ * if it has not been seen before.
+ *
+ * A null `font` is registered under the key 0 and gets the default
+ * (hidden) CSS class. Type 3 fonts, fonts with a non-zero writing mode and
+ * fonts that cannot be located also end up with the default CSS class,
+ * except that Type 3 fonts are embedded when ENABLE_SVG is set and
+ * param.process_type3 is on.
+ *
+ * The returned pointer refers to the entry stored in font_info_map.
+ */
+const FontInfo * HTMLRenderer::install_font(GfxFont * font)
+{
+    assert(sizeof(long long) == 2*sizeof(int));
+
+    // a null font is registered under the key 0
+    long long map_key = 0;
+    if(font != nullptr)
+        map_key = hash_ref(font->getID());
+
+    // already installed: hand back the existing entry
+    const auto known = font_info_map.find(map_key);
+    if(known != font_info_map.end())
+        return &(known->second);
+
+    // the new id is the number of entries present *before* this insertion
+    const long long assigned_id = font_info_map.size();
+
+    FontInfo & entry = font_info_map.insert(make_pair(map_key, FontInfo())).first->second;
+    entry.id = assigned_id;
+    entry.use_tounicode = true;
+    entry.font_size_scale = 1.0;
+
+    if(font == nullptr)
+    {
+        // no font at all: zeroed metrics and the default CSS class
+        entry.em_size = 0;
+        entry.space_width = 0;
+        entry.ascent = 0;
+        entry.descent = 0;
+        entry.is_type3 = false;
+
+        export_remote_default_font(assigned_id);
+
+        return &entry;
+    }
+
+    entry.ascent = font->getAscent();
+    entry.descent = font->getDescent();
+    entry.is_type3 = (font->getType() == fontType3);
+
+    if(param.debug)
+    {
+        // the font name is optional
+        const char * shown_name = font->getName() ? font->getName()->getCString() : "";
+        cerr << "Install font " << hex << assigned_id << dec
+            << ": (" << (font->getID()->num) << ' ' << (font->getID()->gen) << ") "
+            << shown_name
+            << endl;
+    }
+
+    if(entry.is_type3)
+    {
+#if ENABLE_SVG
+        if(!param.process_type3)
+            export_remote_default_font(assigned_id);
+        else
+            install_embedded_font(font, entry);
+#else
+        cerr << "Type 3 fonts are unsupported and will be rendered as Image" << endl;
+        export_remote_default_font(assigned_id);
+#endif
+        return &entry;
+    }
+
+    if(font->getWMode())
+    {
+        cerr << "Writing mode is unsupported and will be rendered as Image" << endl;
+        export_remote_default_font(assigned_id);
+        return &entry;
+    }
+
+    /*
+     * The 2nd argument of locateFont is only meaningful for PostScript
+     * output, which does not make much sense in our case, so nullptr is passed.
+     * According to the original author's note, font_loc->locType then
+     * cannot be gfxFontLocResident.
+     */
+    auto * located = font->locateFont(xref, nullptr);
+    if(located == nullptr)
+    {
+        export_remote_default_font(assigned_id);
+        return &entry;
+    }
+
+    switch(located->locType)
+    {
+        case gfxFontLocEmbedded:
+            install_embedded_font(font, entry);
+            break;
+        case gfxFontLocResident:
+            std::cerr << "Warning: Base 14 fonts should not be specially handled now. Please report a bug!" << std::endl;
+            /* fall through */
+        case gfxFontLocExternal:
+            install_external_font(font, entry);
+            break;
+        default:
+            cerr << "TODO: other font loc" << endl;
+            export_remote_default_font(assigned_id);
+            break;
+    }
+    delete located;
+
+    return &entry;
+}
+
+/*
+ * Install a font whose program is embedded in the PDF.
+ *
+ * The font is first dumped to a file; when that yields a path the file is
+ * processed by embed_font() and exported as a remote font in
+ * param.font_format. An empty path makes the font fall back to the
+ * default CSS class.
+ */
+void HTMLRenderer::install_embedded_font(GfxFont * font, FontInfo & info)
+{
+    auto dumped_path = dump_embedded_font(font, info);
+
+    // nothing was dumped: use the default font class instead
+    if(dumped_path == "")
+    {
+        export_remote_default_font(info.id);
+        return;
+    }
+
+    embed_font(dumped_path, font, info);
+    export_remote_font(info, param.font_format, font);
+}
+
+/*
+ * Install a font that is not embedded in the PDF.
+ *
+ * font - the font to install; its name may be missing
+ * info - the FontInfo entry being filled in for this font
+ *
+ * When param.embed_external_font is set and the font can be located, the
+ * located file is embedded and exported as a remote font. Otherwise the
+ * font is exported by name as a local font; in that case the located file
+ * (if any) is still passed to embed_font() to obtain ascent/descent, and
+ * the values reported by `font` are used when no file was located.
+ */
+void HTMLRenderer::install_external_font(GfxFont * font, FontInfo & info)
+{
+    // getName() may return null (install_font guards it the same way);
+    // fall back to an empty name instead of dereferencing a null pointer
+    string fontname(font->getName() ? font->getName()->getCString() : "");
+
+    // resolve bad encodings in GB
+    auto iter = GB_ENCODED_FONT_NAME_MAP.find(fontname); 
+    if(iter != GB_ENCODED_FONT_NAME_MAP.end())
+    {
+        fontname = iter->second;
+        cerr << "Warning: workaround for font names in bad encodings." << endl;
+    }
+
+    // owned by this function: deleted on every path below
+    GfxFontLoc * localfontloc = font->locateFont(xref, nullptr);
+
+    if(param.embed_external_font)
+    {
+        if(localfontloc != nullptr)
+        {
+            embed_font(string(localfontloc->path->getCString()), font, info);
+            export_remote_font(info, param.font_format, font);
+            delete localfontloc;
+            return;
+        }
+        else
+        {
+            cerr << "Cannot embed external font: f" << hex << info.id << dec << ' ' << fontname << endl;
+            // fallback to exporting by name
+        }
+    }
+
+    // still try to get an idea of the real ascent/descent
+    if(localfontloc != nullptr)
+    {
+        // fill in ascent/descent only, do not embed
+        embed_font(string(localfontloc->path->getCString()), font, info, true);
+        delete localfontloc;
+    }
+    else
+    {
+        info.ascent = font->getAscent();
+        info.descent = font->getDescent();
+    }
+
+    export_local_font(info, font, fontname, "");
+}
+
+/*
+ * Write the CSS for a generated font file: an @font-face rule followed by
+ * the class rule that selects it.
+ *
+ * info   - supplies the id (used in the family/class name and in the file
+ *          name) and ascent/descent (used for line-height)
+ * format - font file extension; one of ttf, otf, woff, eot, svg
+ * font   - not used by this function
+ *
+ * Throws a std::string when the format is unknown, or when the font is to
+ * be embedded but its file cannot be opened in param.tmp_dir.
+ */
+void HTMLRenderer::export_remote_font(const FontInfo & info, const string & format, GfxFont * font)
+{
+    // file extension -> name used inside the CSS format("...") hint
+    static const char * const css_format_names[][2] = {
+        {"ttf",  "truetype"},
+        {"otf",  "opentype"},
+        {"woff", "woff"},
+        {"eot",  "embedded-opentype"},
+        {"svg",  "svg"},
+    };
+
+    const char * css_font_format = nullptr;
+    for(const auto & candidate : css_format_names)
+    {
+        if(format == candidate[0])
+        {
+            css_font_format = candidate[1];
+            break;
+        }
+    }
+    if(css_font_format == nullptr)
+        throw string("Warning: unknown font format: ") + format;
+
+    // the MIME type is only needed for data: URLs, but an unknown format is always an error
+    const auto mime_entry = FORMAT_MIME_TYPE_MAP.find(format);
+    if(mime_entry == FORMAT_MIME_TYPE_MAP.end())
+        throw string("Warning: unknown font format: ") + format;
+    string mime_type = mime_entry->second;
+
+    f_css.fs << "@font-face{"
+             << "font-family:" << CSS::FONT_FAMILY_CN << info.id << ";"
+             << "src:url(";
+
+    {
+        auto fn = str_fmt("f%llx.%s", info.id, format.c_str());
+        if(!param.embed_font)
+        {
+            // reference the font file by name
+            f_css.fs << (char*)fn;
+        }
+        else
+        {
+            // inline the font file as a base64 data: URL
+            auto path = param.tmp_dir + "/" + (char*)fn;
+            ifstream fin(path, ifstream::binary);
+            if(!fin)
+                throw "Cannot locate font file: " + path;
+            f_css.fs << "'data:" + mime_type + ";base64," << Base64Stream(fin) << "'";
+        }
+    }
+
+    f_css.fs << ")"
+             << "format(\"" << css_font_format << "\");"
+             << "}"; // end of @font-face
+
+    // class rule selecting the font
+    f_css.fs << "." << CSS::FONT_FAMILY_CN << info.id << "{"
+             << "font-family:" << CSS::FONT_FAMILY_CN << info.id << ";"
+             << "line-height:" << round(info.ascent - info.descent) << ";"
+             << "font-style:normal;"
+             << "font-weight:normal;"
+             << "visibility:visible;"
+             << "}" 
+             << endl;
+}
+
+/*
+ * Pick the generic CSS font family for `font`:
+ * "monospace" for fixed-width fonts, otherwise "serif" for serif fonts,
+ * otherwise "sans-serif".
+ */
+static string general_font_family(GfxFont * font)
+{
+    const char * family = "sans-serif";
+
+    if(font->isFixedWidth())
+        family = "monospace";
+    else if(font->isSerif())
+        family = "serif";
+
+    return family;
+}
+
+// Write the CSS class used for a font that could not be processed:
+// a generic sans-serif family with the text hidden.
+// TODO: the name of the font that failed could be used as a hint here
+void HTMLRenderer::export_remote_default_font(long long fn_id) 
+{
+    auto & css = f_css.fs;
+
+    css << "." << CSS::FONT_FAMILY_CN << fn_id;
+    css << "{font-family:sans-serif;visibility:hidden;}" << endl;
+}
+
+/*
+ * Write the CSS class for a font that is referenced by name instead of
+ * being shipped with the output.
+ *
+ * info               - supplies the id (class name) and ascent/descent (line-height)
+ * font               - queried for bold/italic/fixed-width/serif flags
+ * original_font_name - name of the font; also searched (case-insensitively)
+ *                      for "bold", "oblique" and "italic"
+ * cssfont            - when not empty, used verbatim as the font-family value
+ *
+ * When cssfont is empty the family is "<original_font_name>,<generic family>",
+ * or just the generic family if the name is empty as well.
+ */
+void HTMLRenderer::export_local_font(const FontInfo & info, GfxFont * font, const string & original_font_name, const string & cssfont) 
+{
+    f_css.fs << "." << CSS::FONT_FAMILY_CN << info.id << "{";
+
+    f_css.fs << "font-family:";
+    if(cssfont != "")
+        f_css.fs << cssfont;
+    else if(original_font_name.empty())
+        f_css.fs << general_font_family(font); // avoid emitting a malformed ",serif" list
+    else
+        f_css.fs << original_font_name << "," << general_font_family(font);
+    f_css.fs << ";";
+
+    // lower-case copy of the name for the keyword searches below;
+    // tolower() must not be given a negative char, and font names may contain high bytes
+    string fn = original_font_name;
+    for(auto & c : fn)
+        c = static_cast<char>(tolower(static_cast<unsigned char>(c)));
+
+    if(font->isBold() || (fn.find("bold") != string::npos))
+        f_css.fs << "font-weight:bold;";
+    else
+        f_css.fs << "font-weight:normal;";
+
+    // "oblique" in the name wins over the italic flag
+    if(fn.find("oblique") != string::npos)
+        f_css.fs << "font-style:oblique;";
+    else if(font->isItalic() || (fn.find("italic") != string::npos))
+        f_css.fs << "font-style:italic;";
+    else
+        f_css.fs << "font-style:normal;";
+
+    f_css.fs << "line-height:" << round(info.ascent - info.descent) << ";";
+
+    f_css.fs << "visibility:visible;";
+
+    f_css.fs << "}" << endl;
+}
+
+} //namespace pdf2htmlEX
diff --git a/src/HTMLRenderer/form.cc b/src/HTMLRenderer/form.cc
new file mode 100644
index 0000000..6b51622
--- /dev/null
+++ b/src/HTMLRenderer/form.cc
@@ -0,0 +1,76 @@
+/*
+ * form.cc
+ *
+ * Handling Forms
+ *
+ * by Simon Chenard
+ * 2014.07.25
+ */
+
+#include <iostream>
+#include <sstream>
+#include <string>
+
+#include "HTMLRenderer.h"
+#include "util/namespace.h"
+#include "util/misc.h"
+
+namespace pdf2htmlEX {
+   
+using std::ofstream;
+using std::cerr;
+
+/*
+ * Write HTML placeholders for the form widgets of the current page.
+ *
+ * out - stream of the page being written
+ *
+ * Text fields become <input type="text"> elements and buttons become <div>
+ * elements, both absolutely positioned using the widget rectangle scaled by
+ * param.zoom. Any other widget type is reported on stderr and skipped.
+ */
+void HTMLRenderer::process_form(ofstream & out)
+{
+    // poppler allocates a new FormPageWidgets for each call and the caller owns it
+    FormPageWidgets * widgets = cur_catalog->getPage(pageNum)->getFormWidgets();
+    if(widgets == nullptr)
+        return;
+
+    int num = widgets->getNumWidgets();
+
+    for(int i = 0; i < num; i++)
+    {
+        FormWidget * w = widgets->getWidget(i);
+        double x1, y1, x2, y2;
+
+        w->getRect(&x1, &y1, &x2, &y2);
+        x1 = x1 * param.zoom;
+        x2 = x2 * param.zoom;
+        y1 = y1 * param.zoom;
+        y2 = y2 * param.zoom;
+
+        double width = x2 - x1;
+        double height = y2 - y1;
+        
+        if(w->getType() == formText)
+        {
+            // the font size is derived from the field height
+            double font_size = height / 2;
+
+            out << "<input id=\"text-" << pageNum << "-" << i 
+                << "\" class=\"" << CSS::INPUT_TEXT_CN 
+                << "\" type=\"text\" value=\"\""
+                << " style=\"position: absolute; left: " << x1 
+                << "px; bottom: " << y1 << "px;" 
+                << " width: " << width << "px; height: " << std::to_string(height) 
+                << "px; line-height: " << std::to_string(height) << "px; font-size: " 
+                << font_size << "px;\" />" << endl;
+        } 
+        else if(w->getType() == formButton)
+        {
+            //Ideally would check w->getButtonType()
+            //for more specific rendering
+            width += 3;
+            height += 3;
+
+            out << "<div id=\"cb-" << pageNum << "-" << i 
+                << "\" class=\"" << CSS::INPUT_RADIO_CN 
+                << "\" style=\"position: absolute; left: " << x1 
+                << "px; bottom: " << y1 << "px;" 
+                << " width: " << width << "px; height: " 
+                << std::to_string(height) << "px; background-size: cover;\" ></div>" << endl;
+        }
+        else 
+        {
+            cerr << "Unsupported form field detected" << endl;
+        }
+    }
+
+    // release the widget list; it used to be leaked once per page
+    delete widgets;
+}
+
+}
diff --git a/src/HTMLRenderer/general.cc b/src/HTMLRenderer/general.cc
new file mode 100644
index 0000000..6a54194
--- /dev/null
+++ b/src/HTMLRenderer/general.cc
@@ -0,0 +1,592 @@
+/*
+ * general.cc
+ *
+ * Handling general stuff
+ *
+ * Copyright (C) 2012,2013,2014 Lu Wang <coolwanglu@gmail.com>
+ */
+
+#include <cstdio>
+#include <ostream>
+#include <cmath>
+#include <algorithm>
+#include <vector>
+#include <functional>
+
+#include <GlobalParams.h>
+
+#include "pdf2htmlEX-config.h"
+#include "HTMLRenderer.h"
+#include "HTMLTextLine.h"
+#include "Base64Stream.h"
+
+#include "BackgroundRenderer/BackgroundRenderer.h"
+
+#include "util/namespace.h"
+#include "util/ffw.h"
+#include "util/math.h"
+#include "util/path.h"
+#include "util/css_const.h"
+#include "util/encoding.h"
+
+namespace pdf2htmlEX {
+
+using std::fixed;
+using std::flush;
+using std::ostream;
+using std::max;
+using std::min_element;
+using std::vector;
+using std::abs;
+using std::cerr;
+using std::endl;
+
+/*
+ * Set up a renderer for the given parameters: silence poppler unless
+ * debugging, initialise the ffw layer, size the per-font mapping tables,
+ * configure the tolerance of every state manager and connect the tracer
+ * callbacks to the covered-text detector.
+ */
+HTMLRenderer::HTMLRenderer(const Param & param)
+    :OutputDev()
+    ,param(param)
+    ,html_text_page(param, all_manager)
+    ,preprocessor(param)
+    ,tmp_files(param)
+    ,tracer(param)
+{
+    // poppler's own error messages are only wanted in debug mode
+    if(!param.debug)
+        globalParams->setErrQuiet(gTrue);
+
+    ffw_init(param.debug);
+
+    // one slot per possible 16-bit code, and 256 slots for the 8-bit table
+    cur_mapping.resize(0x10000);
+    width_list.resize(0x10000);
+    cur_mapping2.resize(0x100);
+
+    auto & managers = all_manager;
+
+    /*
+     * Errors of these states usually do not accumulate, or are handled
+     * well elsewhere (whitespace_manager), so the user-supplied, possibly
+     * large, tolerances are used.
+     */
+    managers.vertical_align.set_eps(param.v_eps);
+    managers.whitespace.set_eps(param.h_eps);
+    managers.left.set_eps(param.h_eps);
+
+    /*
+     * All remaining states need accurate values;
+     * optimization is done separately.
+     */
+    managers.font_size.set_eps(EPS);
+    managers.letter_space.set_eps(EPS);
+    managers.word_space.set_eps(EPS);
+    managers.height.set_eps(EPS);
+    managers.width.set_eps(EPS);
+    managers.bottom.set_eps(EPS);
+
+    // forward every bounding box reported by the tracer to the covered-text detector
+    tracer.on_char_drawn = [this](double * bbox) {
+        covered_text_detector.add_char_bbox(bbox);
+    };
+    tracer.on_char_clipped = [this](double * bbox, bool partial) {
+        covered_text_detector.add_char_bbox_clipped(bbox, partial);
+    };
+    tracer.on_non_char_drawn = [this](double * bbox) {
+        covered_text_detector.add_non_char_bbox(bbox);
+    };
+}
+
+HTMLRenderer::~HTMLRenderer()
+{
+    ffw_finalize(); // presumably the counterpart of the ffw_init() call made by the constructor
+}
+
+/*
+ * Convert `doc`: run pre_process(), render pages param.first_page through
+ * param.last_page, optionally process the outline, then run post_process().
+ *
+ * Progress is reported on stderr. With param.split_pages every page is
+ * written to its own file in param.dest_dir, named from the
+ * param.page_filename template.
+ *
+ * Throws a C string when the background renderer cannot be created, and a
+ * std::string when a page file cannot be opened.
+ */
+void HTMLRenderer::process(PDFDoc *doc)
+{
+    cur_doc = doc;
+    cur_catalog = doc->getCatalog();
+    xref = doc->getXRef();
+
+    pre_process(doc);
+
+    ///////////////////
+    // Process pages
+
+    if(param.process_nontext)
+    {
+        bg_renderer = BackgroundRenderer::getBackgroundRenderer(param.bg_format, this, param);
+        if(!bg_renderer)
+            throw "Cannot initialize background renderer, unsupported format";
+        bg_renderer->init(doc);
+
+        // the fallback renderer is optional
+        fallback_bg_renderer = BackgroundRenderer::getFallbackBackgroundRenderer(this, param);
+        if (fallback_bg_renderer)
+            fallback_bg_renderer->init(doc);
+    }
+
+    int page_count = (param.last_page - param.first_page + 1);
+    for(int i = param.first_page; i <= param.last_page ; ++i)
+    {
+        // stop early once the temporary files exceed the configured limit (-1 disables the check)
+        if (param.tmp_file_size_limit != -1 && tmp_files.get_total_size() > param.tmp_file_size_limit * 1024) {
+            cerr << "Stop processing, reach max size\n";
+            break;
+        }
+
+        cerr << "Working: " << (i-param.first_page) << "/" << page_count << '\r' << flush;
+
+        if(param.split_pages)
+        {
+            // copy the string out, since we will reuse the buffer soon
+            string filled_template_filename = (char*)str_fmt(param.page_filename.c_str(), i);
+            auto page_fn = str_fmt("%s/%s", param.dest_dir.c_str(), filled_template_filename.c_str());
+            f_curpage = new ofstream((char*)page_fn, ofstream::binary);
+            if(!(*f_curpage))
+            {
+                // release the stream before bailing out, so it is neither
+                // leaked nor left behind in f_curpage
+                delete f_curpage;
+                f_curpage = nullptr;
+                throw string("Cannot open ") + (char*)page_fn + " for writing";
+            }
+            set_stream_flags((*f_curpage));
+
+            cur_page_filename = filled_template_filename;
+        }
+
+        doc->displayPage(this, i,
+                text_zoom_factor() * DEFAULT_DPI, text_zoom_factor() * DEFAULT_DPI,
+                0,
+                (!(param.use_cropbox)),
+                true,  // crop
+                false, // printing
+                nullptr, nullptr, nullptr, nullptr);
+
+        if(param.split_pages)
+        {
+            // the per-page file is complete
+            delete f_curpage;
+            f_curpage = nullptr;
+        }
+    }
+    if(page_count >= 0)
+        cerr << "Working: " << page_count << "/" << page_count;
+    cerr << endl;
+
+    ////////////////////////
+    // Process Outline
+    if(param.process_outline)
+        process_outline();
+
+    post_process();
+
+    bg_renderer = nullptr;
+    fallback_bg_renderer = nullptr;
+
+    cerr << endl;
+}
+
+// Remember the default CTM; as many entries as default_ctm holds are copied from `ctm`.
+void HTMLRenderer::setDefaultCTM(double *ctm)
+{
+    const size_t entry_count = sizeof(default_ctm) / sizeof(default_ctm[0]);
+    std::copy(ctm, ctm + entry_count, default_ctm);
+}
+
+/*
+ * Prepare for rendering page `pageNum`: reset the covered-text detector and
+ * the tracer, record the page number and page size, and reset the
+ * rendering state. The `xref` argument is not used here.
+ */
+void HTMLRenderer::startPage(int pageNum, GfxState *state, XRef * xref)
+{
+    covered_text_detector.reset();
+    tracer.reset(state);
+
+    this->pageNum = pageNum;
+
+    const auto page_width = state->getPageWidth();
+    const auto page_height = state->getPageHeight();
+    html_text_page.set_page_size(page_width, page_height);
+
+    reset_state();
+}
+
+/*
+ * Write out the page that has just been rendered: the page frame and
+ * content box, the background image (if non-text content is processed),
+ * the collected text and its CSS, form placeholders, links, and a data
+ * element holding the default CTM for the JavaScript side.
+ */
+void HTMLRenderer::endPage() {
+    const long long width_id = all_manager.width.install(html_text_page.get_width());
+    const long long height_id = all_manager.height.install(html_text_page.get_height());
+
+    auto & page_out = *f_curpage;
+
+    // page frame
+    page_out
+        << "<div id=\"" << CSS::PAGE_FRAME_CN << pageNum
+        << "\" class=\"" << CSS::PAGE_FRAME_CN
+        << " " << CSS::WIDTH_CN << width_id
+        << " " << CSS::HEIGHT_CN << height_id
+        << "\" data-page-no=\"" << pageNum << "\">";
+
+    // content box
+    page_out
+        << "<div class=\"" << CSS::PAGE_CONTENT_BOX_CN
+        << " " << CSS::PAGE_CONTENT_BOX_CN << pageNum
+        << " " << CSS::WIDTH_CN << width_id
+        << " " << CSS::HEIGHT_CN << height_id
+        << "\">";
+
+    /*
+     * With split_pages, f_curpage is the file of the current page,
+     * and an empty frame pointing to that file goes into f_pages.fs
+     */
+    if(param.split_pages)
+    {
+        f_pages.fs
+            << "<div id=\"" << CSS::PAGE_FRAME_CN << pageNum
+            << "\" class=\"" << CSS::PAGE_FRAME_CN
+            << " " << CSS::WIDTH_CN << width_id
+            << " " << CSS::HEIGHT_CN << height_id
+            << "\" data-page-no=\"" << pageNum
+            << "\" data-page-url=\"";
+
+        writeAttribute(f_pages.fs, cur_page_filename);
+        f_pages.fs << "\">";
+    }
+
+    // background: the fallback renderer is tried only if the main one fails
+    if(param.process_nontext)
+    {
+        if(bg_renderer->render_page(cur_doc, pageNum))
+            bg_renderer->embed_image(pageNum);
+        else if(fallback_bg_renderer && fallback_bg_renderer->render_page(cur_doc, pageNum))
+            fallback_bg_renderer->embed_image(pageNum);
+    }
+
+    // dump all text
+    html_text_page.dump_text(page_out);
+    html_text_page.dump_css(f_css.fs);
+    html_text_page.clear();
+
+    // process form
+    if(param.process_form)
+        process_form(page_out);
+
+    // process links before the page is closed
+    cur_doc->processLinks(this, pageNum);
+
+    // close box
+    page_out << "</div>";
+
+    // dump info for js
+    // TODO: create a function for this
+    // BE CAREFUL WITH ESCAPES
+    {
+        page_out << "<div class=\"" << CSS::PAGE_DATA_CN << "\" data-data='{";
+
+        // default CTM as a comma-separated list of 6 values
+        page_out << "\"ctm\":[";
+        const char * separator = "";
+        for(int i = 0; i < 6; ++i)
+        {
+            page_out << separator << round(default_ctm[i]);
+            separator = ",";
+        }
+        page_out << "]";
+
+        page_out << "}'></div>";
+    }
+
+    // close page
+    page_out << "</div>" << endl;
+
+    if(param.split_pages)
+        f_pages.fs << "</div>" << endl;
+}
+
+/*
+ * Prepare for rendering `doc`: run the preprocessor, derive the text scale
+ * factors from the zoom/fit parameters, and open the css, outline and
+ * pages output files.
+ *
+ * Throws a std::string naming the file when one of them cannot be opened
+ * for writing.
+ */
+void HTMLRenderer::pre_process(PDFDoc * doc)
+{
+    preprocessor.process(doc);
+
+    /*
+     * determine scale factors
+     */
+    {
+        // every positive zoom/fit parameter contributes a candidate; the smallest wins
+        vector<double> zoom_factors;
+
+        if(is_positive(param.zoom))
+        {
+            zoom_factors.push_back(param.zoom);
+        }
+
+        if(is_positive(param.fit_width))
+        {
+            zoom_factors.push_back((param.fit_width) / preprocessor.get_max_width());
+        }
+
+        if(is_positive(param.fit_height))
+        {
+            zoom_factors.push_back((param.fit_height) / preprocessor.get_max_height());
+        }
+
+        double zoom = (zoom_factors.empty() ? 1.0 : (*min_element(zoom_factors.begin(), zoom_factors.end())));
+
+        // factor1 * factor2 == zoom, with factor1 never below font_size_multiplier
+        text_scale_factor1 = max<double>(zoom, param.font_size_multiplier);
+        text_scale_factor2 = zoom / text_scale_factor1;
+    }
+
+    // we may output utf8 characters, so always use binary
+    {
+        /*
+         * If embed-css
+         * we have to keep the generated css file in a temporary place
+         * and embed it into the main html later
+         *
+         * otherwise
+         * leave it in param.dest_dir
+         */
+
+        auto fn = (param.embed_css)
+            ? str_fmt("%s/__css", param.tmp_dir.c_str())
+            : str_fmt("%s/%s", param.dest_dir.c_str(), param.css_filename.c_str());
+
+        if(param.embed_css)
+            tmp_files.add((char*)fn);
+
+        f_css.path = (char*)fn;
+        f_css.fs.open(f_css.path, ofstream::binary);
+        if(!f_css.fs)
+            throw string("Cannot open ") + (char*)fn + " for writing";
+        set_stream_flags(f_css.fs);
+    }
+
+    if (param.process_outline)
+    {
+        /*
+         * The logic for outline is similar to css
+         */
+
+        auto fn = (param.embed_outline)
+            ? str_fmt("%s/__outline", param.tmp_dir.c_str())
+            : str_fmt("%s/%s", param.dest_dir.c_str(), param.outline_filename.c_str());
+
+        if(param.embed_outline)
+            tmp_files.add((char*)fn);
+
+        f_outline.path = (char*)fn;
+        f_outline.fs.open(f_outline.path, ofstream::binary);
+        if(!f_outline.fs)
+            throw string("Cannot open ") + (char*)fn + " for writing"; // trailing space was missing here
+
+        // might not be necessary
+        set_stream_flags(f_outline.fs);
+    }
+
+    {
+        /*
+         * we have to keep the html file for pages in a temporary place
+         * because we'll have to embed css before it
+         *
+         * Otherwise just generate it
+         */
+        auto fn = str_fmt("%s/__pages", param.tmp_dir.c_str());
+        tmp_files.add((char*)fn);
+
+        f_pages.path = (char*)fn;
+        f_pages.fs.open(f_pages.path, ofstream::binary);
+        if(!f_pages.fs)
+            throw string("Cannot open ") + (char*)fn + " for writing";
+        set_stream_flags(f_pages.fs);
+    }
+
+    // with split_pages each page gets its own stream, created in process();
+    // otherwise all pages go to the pages file
+    if(param.split_pages)
+    {
+        f_curpage = nullptr;
+    }
+    else
+    {
+        f_curpage = &f_pages.fs;
+    }
+}
+
+void HTMLRenderer::post_process(void) // assembles the main HTML file from the manifest and the intermediate files
+{
+    dump_css();
+    
+    // close the intermediate files; they are read back through their paths below
+    if (param.process_outline)
+    {
+        f_outline.fs.close();
+    }
+    f_pages.fs.close();
+    f_css.fs.close();
+
+    // open the main HTML file: <dest_dir>/<output_filename>
+    ofstream output;
+    {
+        auto fn = str_fmt("%s/%s", param.dest_dir.c_str(), param.output_filename.c_str());
+        output.open((char*)fn, ofstream::binary);
+        if(!output)
+            throw string("Cannot open ") + (char*)fn + " for writing";
+        set_stream_flags(output);
+    }
+
+    // apply manifest: it is read line by line and decides, in order, what goes into the output
+    ifstream manifest_fin((char*)str_fmt("%s/%s", param.data_dir.c_str(), MANIFEST_FILENAME.c_str()), ifstream::binary);
+    if(!manifest_fin)
+        throw "Cannot open the manifest file";
+
+    bool embed_string = false; // true while between two """ marker lines
+    string line;
+    long line_no = 0; // 1-based number of the current manifest line, used in warnings
+    while(getline(manifest_fin, line))
+    {
+        // trim whitespace at both ends
+        {
+            static const char * whitespaces = " \t\n\v\f\r";
+            auto idx1 = line.find_first_not_of(whitespaces);
+            if(idx1 == string::npos)
+            {
+                line.clear(); // the line holds nothing but whitespace
+            }
+            else
+            {
+                auto idx2 = line.find_last_not_of(whitespaces);
+                assert(idx2 >= idx1);
+                line = line.substr(idx1, idx2 - idx1 + 1);
+            }
+        }
+
+        ++line_no;
+
+        if(line == "\"\"\"") // a line consisting of """ toggles verbatim mode
+        {
+            embed_string = !embed_string;
+            continue;
+        }
+
+        if(embed_string) // verbatim mode: copy the (already trimmed) line to the output
+        {
+            output << line << endl;
+            continue;
+        }
+
+        if(line.empty() || line[0] == '#') // blank lines and lines starting with '#' are skipped
+            continue;
+
+
+        if(line[0] == '@') // '@<name>': pass <data_dir>/<name> to embed_file()
+        {
+            embed_file(output, param.data_dir + "/" + line.substr(1), "", true);
+            continue;
+        }
+
+        if(line[0] == '$') // '$css', '$outline', '$pages': insert generated content
+        {
+            if(line == "$css")
+            {
+                embed_file(output, f_css.path, ".css", false);
+            }
+            else if (line == "$outline")
+            {
+                if (param.process_outline && param.embed_outline) // otherwise nothing is inserted
+                {
+                    ifstream fin(f_outline.path, ifstream::binary);
+                    if(!fin)
+                        throw "Cannot open outline for reading";
+                    output << fin.rdbuf();
+                    output.clear(); // output will set fail bit if fin is empty
+                }
+            }
+            else if (line == "$pages")
+            {
+                ifstream fin(f_pages.path, ifstream::binary);
+                if(!fin)
+                    throw "Cannot open pages for reading";
+                output << fin.rdbuf();
+                output.clear(); // output will set fail bit if fin is empty
+            }
+            else
+            {
+                cerr << "Warning: manifest line " << line_no << ": Unknown content \"" << line << "\"" << endl;
+            }
+            continue;
+        }
+
+        cerr << "Warning: unknown line in manifest: " << line << endl;
+    }
+}
+
+/*
+ * Prepare an output stream for the number formatting used in the generated files
+ */
+void HTMLRenderer::set_stream_flags(std::ostream & out)
+{
+    // we output all ID's in hex
+    hex(out);
+    // browsers are not happy with scientific notations
+    fixed(out);
+}
+
+/*
+ * Let every state manager write its CSS rules into the css file
+ * When param.printing is set, the print variants (scaled by print_scale())
+ * are written as well, wrapped in a CSS::PRINT_ONLY block
+ */
+void HTMLRenderer::dump_css (void)
+{
+    auto & css = f_css.fs;
+    auto & mgr = all_manager;
+
+    mgr.transform_matrix.dump_css(css);
+    mgr.vertical_align  .dump_css(css);
+    mgr.letter_space    .dump_css(css);
+    mgr.stroke_color    .dump_css(css);
+    mgr.word_space      .dump_css(css);
+    mgr.whitespace      .dump_css(css);
+    mgr.fill_color      .dump_css(css);
+    mgr.font_size       .dump_css(css);
+    mgr.bottom          .dump_css(css);
+    mgr.height          .dump_css(css);
+    mgr.width           .dump_css(css);
+    mgr.left            .dump_css(css);
+    mgr.bgimage_size    .dump_css(css);
+
+    // nothing more to do unless print css is requested
+    if(!param.printing)
+        return;
+
+    double ps = print_scale();
+    css << CSS::PRINT_ONLY << "{" << endl;
+    mgr.transform_matrix.dump_print_css(css, ps);
+    mgr.vertical_align  .dump_print_css(css, ps);
+    mgr.letter_space    .dump_print_css(css, ps);
+    mgr.stroke_color    .dump_print_css(css, ps);
+    mgr.word_space      .dump_print_css(css, ps);
+    mgr.whitespace      .dump_print_css(css, ps);
+    mgr.fill_color      .dump_print_css(css, ps);
+    mgr.font_size       .dump_print_css(css, ps);
+    mgr.bottom          .dump_print_css(css, ps);
+    mgr.height          .dump_print_css(css, ps);
+    mgr.width           .dump_print_css(css, ps);
+    mgr.left            .dump_print_css(css, ps);
+    mgr.bgimage_size    .dump_print_css(css, ps);
+    css << "}" << endl;
+}
+
+/*
+ * Write the content of a file, or a reference to it, into the output
+ *
+ * out:  stream receiving the generated markup
+ * path: location of the file
+ * type: key used to look up EMBED_STRING_MAP; if empty, the suffix of the file name is used
+ * copy: when the file is only referenced, also copy it into param.dest_dir
+ *
+ * Whether the file is embedded or referenced is decided by the param flag
+ * stored in the matching EMBED_STRING_MAP entry.
+ * A suffix missing from EMBED_STRING_MAP only produces a warning.
+ * Throws a string when a file cannot be opened.
+ */
+void HTMLRenderer::embed_file(ostream & out, const string & path, const string & type, bool copy)
+{
+    string fn = get_filename(path);
+    string suffix = type.empty() ? get_suffix(fn) : type;
+
+    auto iter = EMBED_STRING_MAP.find(suffix);
+    if(iter == EMBED_STRING_MAP.end())
+    {
+        cerr << "Warning: unknown suffix: " << suffix << endl;
+        return;
+    }
+
+    const auto & entry = iter->second;
+
+    if(param.*(entry.embed_flag))
+    {
+        ifstream fin(path, ifstream::binary);
+        if(!fin)
+            throw string("Cannot open file ") + path + " for embedding";
+        out << entry.prefix_embed;
+
+        if(entry.base64_encode)
+        {
+            out << Base64Stream(fin);
+        }
+        else
+        {
+            out << endl << fin.rdbuf();
+        }
+        out.clear(); // out will set fail bit if fin is empty
+        out << entry.suffix_embed << endl;
+    }
+    else
+    {
+        out << entry.prefix_external;
+        writeAttribute(out, fn);
+        out << entry.suffix_external << endl;
+
+        if(copy)
+        {
+            ifstream fin(path, ifstream::binary);
+            if(!fin)
+                throw string("Cannot copy file: ") + path;
+            auto out_path = param.dest_dir + "/" + fn;
+            // named distinctly, such that the parameter `out` is not shadowed
+            ofstream copy_out(out_path, ofstream::binary);
+            if(!copy_out)
+                throw string("Cannot open file ") + out_path + " for writing";
+            copy_out << fin.rdbuf();
+            copy_out.clear(); // copy_out will set fail bit if fin is empty
+        }
+    }
+}
+
+// name of the manifest file, it is opened from param.data_dir
+const std::string HTMLRenderer::MANIFEST_FILENAME = "manifest";
+
+}// namespace pdf2htmlEX
diff --git a/src/HTMLRenderer/image.cc b/src/HTMLRenderer/image.cc
new file mode 100644
index 0000000..91ca767
--- /dev/null
+++ b/src/HTMLRenderer/image.cc
@@ -0,0 +1,83 @@
+/*
+ * image.cc
+ *
+ * Handling images
+ *
+ * by WangLu
+ * 2012.08.14
+ */
+
+#include "HTMLRenderer.h"
+#include "util/namespace.h"
+
+namespace pdf2htmlEX {
+
+/*
+ * Report the image to the tracer, then hand all arguments over
+ * to OutputDev::drawImage unchanged
+ */
+void HTMLRenderer::drawImage(GfxState * state, Object * ref, Stream * str, int width, int height, GfxImageColorMap * colorMap, GBool interpolate, int *maskColors, GBool inlineImg)
+{
+    tracer.draw_image(state);
+
+    OutputDev::drawImage(state,ref,str,width,height,colorMap,interpolate,maskColors,inlineImg);
+}
+
+/*
+ * Report the soft masked image to the tracer,
+ * then forward all arguments to OutputDev::drawSoftMaskedImage
+ */
+void HTMLRenderer::drawSoftMaskedImage(GfxState *state, Object *ref, Stream *str,
+                   int width, int height,
+                   GfxImageColorMap *colorMap,
+                   GBool interpolate,
+                   Stream *maskStr,
+                   int maskWidth, int maskHeight,
+                   GfxImageColorMap *maskColorMap,
+                   GBool maskInterpolate)
+{
+    tracer.draw_image(state);
+
+    // TODO really required?
+    OutputDev::drawSoftMaskedImage(
+            state, ref, str, width, height, colorMap, interpolate,
+            maskStr, maskWidth, maskHeight, maskColorMap, maskInterpolate);
+}
+
+} // namespace pdf2htmlEX
diff --git a/src/HTMLRenderer/link.cc b/src/HTMLRenderer/link.cc
new file mode 100644
index 0000000..3c90ab5
--- /dev/null
+++ b/src/HTMLRenderer/link.cc
@@ -0,0 +1,309 @@
+/*
+ * link.cc
+ *
+ * Handling links
+ *
+ * by WangLu
+ * 2012.09.25
+ */
+
+#include <iostream>
+#include <sstream>
+#include <algorithm>
+
+#include <Link.h>
+
+#include "HTMLRenderer.h"
+#include "util/namespace.h"
+#include "util/math.h"
+#include "util/misc.h"
+#include "util/encoding.h"
+#include "util/css_const.h"
+
+namespace pdf2htmlEX {
+   
+using std::ostringstream;
+using std::min;
+using std::max;
+using std::cerr;
+using std::endl;
+
+/*
+ * The detailed rectangle area of the link destination
+ * Will be parsed and performed by Javascript
+ * The string will be put into a HTML attribute, surrounded by single quotes
+ * So pay attention to the characters used here
+ *
+ * dest:    the destination to describe, nullptr is accepted
+ * catalog: used to resolve a page reference into a page number
+ * pageno:  receives the page number of the destination, it is <= 0 if the page is unknown
+ *
+ * Returns "[pageno,kind,arguments...]" with arguments depending on the kind of the destination,
+ * or an empty string if the page is unknown
+ */
+static string get_linkdest_detail_str(LinkDest * dest, Catalog * catalog, int & pageno)
+{
+    pageno = 0;
+
+    // must be checked before the first use of dest
+    if(!dest)
+        return "";
+
+    if(dest->isPageRef())
+    {
+        auto pageref = dest->getPageRef();
+        pageno = catalog->findPage(pageref.num, pageref.gen);
+    }
+    else
+    {
+        pageno = dest->getPageNum();
+    }
+
+    if(pageno <= 0)
+    {
+        return "";
+    }
+
+    ostringstream sout;
+    // dec
+    sout << "[" << pageno;
+
+    // write a coordinate if the destination changes it, "null" otherwise
+    auto put_optional = [&sout](bool changed, double value) {
+        if(changed)
+            sout << value;
+        else
+            sout << "null";
+    };
+
+    switch(dest->getKind())
+    {
+        case destXYZ:
+            sout << ",\"XYZ\",";
+            put_optional(dest->getChangeLeft(), dest->getLeft());
+            sout << ",";
+            put_optional(dest->getChangeTop(), dest->getTop());
+            sout << ",";
+            put_optional(dest->getChangeZoom(), dest->getZoom());
+            break;
+        case destFit:
+            sout << ",\"Fit\"";
+            break;
+        case destFitH:
+            sout << ",\"FitH\",";
+            put_optional(dest->getChangeTop(), dest->getTop());
+            break;
+        case destFitV:
+            sout << ",\"FitV\",";
+            put_optional(dest->getChangeLeft(), dest->getLeft());
+            break;
+        case destFitR:
+            sout << ",\"FitR\","
+                << (dest->getLeft()) << ","
+                << (dest->getBottom()) << ","
+                << (dest->getRight()) << ","
+                << (dest->getTop());
+            break;
+        case destFitB:
+            sout << ",\"FitB\"";
+            break;
+        case destFitBH:
+            sout << ",\"FitBH\",";
+            put_optional(dest->getChangeTop(), dest->getTop());
+            break;
+        case destFitBV:
+            sout << ",\"FitBV\",";
+            put_optional(dest->getChangeLeft(), dest->getLeft());
+            break;
+        default:
+            // unknown kind: only the page number is written
+            break;
+    }
+    sout << "]";
+
+    return sout.str();
+}
+
+/*
+ * Convert a link action into the value of a href attribute
+ *
+ * action: the action to convert, nullptr is accepted
+ * detail: receives the detail string of the destination of a GoTo action,
+ *         it is empty for all other actions
+ *
+ * Returns '#' followed by CSS::PAGE_FRAME_CN and the page number in hex for a GoTo action
+ * whose page is known, the URI for an URI action, and an empty string otherwise
+ */
+string HTMLRenderer::get_linkaction_str(LinkAction * action, string & detail)
+{
+    string dest_str;
+    detail = "";
+    if(action)
+    {
+        auto kind = action->getKind();
+        switch(kind)
+        {
+            case actionGoTo:
+                {
+                    auto * goto_action = dynamic_cast<LinkGoTo*>(action);
+                    if(!goto_action)
+                        break;
+
+                    // in both cases we own the LinkDest and have to delete it
+                    LinkDest * dest = nullptr;
+                    if(auto direct_dest = goto_action->getDest())
+                        dest = direct_dest->copy();
+                    else if (auto named_dest = goto_action->getNamedDest())
+                        dest = cur_catalog->findDest(named_dest);
+                    if(dest)
+                    {
+                        int pageno = 0;
+                        detail = get_linkdest_detail_str(dest, cur_catalog, pageno);
+                        if(pageno > 0)
+                        {
+                            dest_str = (char*)str_fmt("#%s%x", CSS::PAGE_FRAME_CN, pageno);
+                        }
+                        delete dest;
+                    }
+                }
+                break;
+            case actionGoToR:
+                {
+                    cerr << "TODO: actionGoToR is not implemented." << endl;
+                }
+                break;
+            case actionURI:
+                {
+                    auto * uri_action = dynamic_cast<LinkURI*>(action);
+                    // NOTE(review): poppler appears to leave the URI null for a malformed
+                    // URI action (cf. LinkURI::isOk()), so it must not be dereferenced blindly
+                    auto * uri = uri_action ? uri_action->getURI() : nullptr;
+                    if(uri)
+                        dest_str = uri->getCString();
+                }
+                break;
+            case actionLaunch:
+                {
+                    cerr << "TODO: actionLaunch is not implemented." << endl;
+                }
+                break;
+            default:
+                cerr << "Warning: unknown annotation type: " << kind << endl;
+                break;
+        }
+    }
+
+    return dest_str;
+}
+    
+/*
+ * Based on pdftohtml from poppler
+ * TODO: share rectangle draw with css-draw
+ *
+ * Writes a <div> covering the rectangle of the annotation to the current page,
+ * it is wrapped in an <a> element if the action of the link yields a target
+ */
+void HTMLRenderer::processLink(AnnotLink * al)
+{
+    auto & out = *f_curpage;
+
+    string detail;
+    string target = get_linkaction_str(al->getAction(), detail);
+    const bool has_target = !target.empty();
+
+    if(has_target)
+    {
+        out << "<a class=\"" << CSS::LINK_CN << "\" href=\"";
+        writeAttribute(out, target);
+        out << "\"";
+
+        if(!detail.empty())
+            out << " data-dest-detail='" << detail << "'";
+
+        out << ">";
+    }
+
+    out << "<div class=\"" << CSS::CSS_DRAW_CN << ' ' << CSS::TRANSFORM_MATRIX_CN
+        << all_manager.transform_matrix.install(default_ctm)
+        << "\" style=\"";
+
+    // normalize the rectangle into origin and size
+    double x1, y1, x2, y2;
+    al->getRect(&x1, &y1, &x2, &y2);
+    double x = min<double>(x1, x2);
+    double y = min<double>(y1, y2);
+    double w = max<double>(x1, x2) - x;
+    double h = max<double>(y1, y2) - y;
+
+    // a missing border is handled like a border without width
+    auto * border = al->getBorder();
+    double border_width = border ? border->getWidth() : 0;
+
+    if(border_width > 0)
+    {
+        double tb_width = 0;
+        double lr_width = 0;
+        css_fix_rectangle_border_width(x1, y1, x2, y2, border_width,
+                x, y, w, h,
+                tb_width, lr_width);
+
+        // a single value is enough when both widths agree
+        out << "border-width:" << round(tb_width) << "px";
+        if(!(std::abs(tb_width - lr_width) < EPS))
+            out << " " << round(lr_width) << "px";
+        out << ";";
+
+        // solid is also the fallback for unknown styles
+        const char * css_border_style = "border-style:solid;";
+        auto style = border->getStyle();
+        switch(style)
+        {
+            case AnnotBorder::borderSolid:
+                break;
+            case AnnotBorder::borderDashed:
+                css_border_style = "border-style:dashed;";
+                break;
+            case AnnotBorder::borderBeveled:
+                css_border_style = "border-style:outset;";
+                break;
+            case AnnotBorder::borderInset:
+                css_border_style = "border-style:inset;";
+                break;
+            case AnnotBorder::borderUnderlined:
+                css_border_style = "border-style:none;border-bottom-style:solid;";
+                break;
+            default:
+                cerr << "Warning:Unknown annotation border style: " << style << endl;
+                break;
+        }
+        out << css_border_style;
+
+        // black unless an RGB color is given
+        double r = 0;
+        double g = 0;
+        double b = 0;
+        auto color = al->getColor();
+        if(color && (color->getSpace() == AnnotColor::colorRGB))
+        {
+            const double * v = color->getValues();
+            r = v[0];
+            g = v[1];
+            b = v[2];
+        }
+
+        // the components are written in decimal, the stream goes back to hex afterwards
+        out << "border-color:rgb("
+            << dec << (int)dblToByte(r) << "," << (int)dblToByte(g) << "," << (int)dblToByte(b) << hex
+            << ");";
+    }
+    else
+    {
+        out << "border-style:none;";
+    }
+
+    tm_transform(default_ctm, x, y);
+
+    out << "position:absolute;"
+        << "left:" << round(x) << "px;"
+        << "bottom:" << round(y) << "px;"
+        << "width:" << round(w) << "px;"
+        << "height:" << round(h) << "px;";
+
+    // fix for IE
+    out << "background-color:rgba(255,255,255,0.000001);";
+
+    out << "\"></div>";
+
+    if(has_target)
+        out << "</a>";
+}
+
+}// namespace pdf2htmlEX
diff --git a/src/HTMLRenderer/outline.cc b/src/HTMLRenderer/outline.cc
new file mode 100644
index 0000000..12c3896
--- /dev/null
+++ b/src/HTMLRenderer/outline.cc
@@ -0,0 +1,74 @@
+/*
+ * outline.cc
+ *
+ * Handling Outline items
+ *
+ * by WangLu
+ * 2013.01.28
+ */
+
+#include <iostream>
+
+#include <Outline.h>
+#include <goo/GooList.h>
+
+#include "HTMLRenderer.h"
+#include "util/namespace.h"
+#include "util/encoding.h"
+#include "util/css_const.h"
+
+namespace pdf2htmlEX {
+
+using std::ostream;
+
+/*
+ * Write the outline items, and recursively their kids, as nested <ul> lists
+ * Nothing is written for a missing or empty list
+ */
+void HTMLRenderer::process_outline_items(GooList * items)
+{
+    if(!items)
+        return;
+    if(items->getLength() == 0)
+        return;
+
+    auto & out = f_outline.fs;
+
+    out << "<ul>";
+
+    for(int idx = 0; idx < items->getLength(); ++idx)
+    {
+        auto * item = static_cast<OutlineItem*>(items->get(idx));
+
+        string detail;
+        string href = get_linkaction_str(item->getAction(), detail);
+
+        // the link is written no matter if href is empty or not
+        out << "<li>" << "<a class=\"" << CSS::LINK_CN << "\" href=\"";
+        writeAttribute(out, href);
+        out << "\"";
+
+        if(!detail.empty())
+            out << " data-dest-detail='" << detail << "'";
+
+        out << ">";
+
+        writeUnicodes(out, item->getTitle(), item->getTitleLength());
+
+        out << "</a>";
+
+        // the kids are only available between open() and close()
+        item->open();
+        if(item->hasKids())
+            process_outline_items(item->getKids());
+        item->close();
+
+        out << "</li>";
+    }
+
+    out << "</ul>";
+}
+   
+/*
+ * Write the outline of the current document, if there is any
+ */
+void HTMLRenderer::process_outline()
+{
+    if(Outline * outline = cur_doc->getOutline())
+        process_outline_items(outline->getItems());
+}
+
+}// namespace pdf2htmlEX
diff --git a/src/HTMLRenderer/state.cc b/src/HTMLRenderer/state.cc
new file mode 100644
index 0000000..f26b17f
--- /dev/null
+++ b/src/HTMLRenderer/state.cc
@@ -0,0 +1,541 @@
+/*
+ * state.cc
+ *
+ * track PDF states
+ *
+ * Copyright (C) 2012,2013 Lu Wang <coolwanglu@gmail.com>
+ */
+
+#include <cmath>
+#include <algorithm>
+
+#include "HTMLRenderer.h"
+
+#include "util/namespace.h"
+#include "util/math.h"
+
+namespace pdf2htmlEX {
+
+using std::max;
+using std::abs;
+
+// Flag the whole state as changed and re-read the text position,
+// check_state_change treats all_changed like every single flag being set
+void HTMLRenderer::updateAll(GfxState * state) 
+{ 
+    all_changed = true; 
+    updateTextPos(state);
+}
+// Flag the text rise as changed
+void HTMLRenderer::updateRise(GfxState * state)
+{
+    rise_changed = true;
+}
+// Flag the text position as changed and take the line position of the state as (cur_tx, cur_ty)
+void HTMLRenderer::updateTextPos(GfxState * state) 
+{
+    text_pos_changed = true;
+    cur_tx = state->getLineX(); 
+    cur_ty = state->getLineY(); 
+}
+// Flag the text position as changed and move cur_tx back by shift,
+// scaled by 0.001, the font size and the horizontal scaling of the state
+void HTMLRenderer::updateTextShift(GfxState * state, double shift) 
+{
+    text_pos_changed = true;
+    cur_tx -= shift * 0.001 * state->getFontSize() * state->getHorizScaling(); 
+}
+// The functions below only record which part of the state has changed,
+// the new values are read from the GfxState later in check_state_change
+
+// Flag the font as changed
+void HTMLRenderer::updateFont(GfxState * state) 
+{
+    font_changed = true; 
+}
+// Flag the CTM as changed and pass the matrix elements on to the tracer
+void HTMLRenderer::updateCTM(GfxState * state, double m11, double m12, double m21, double m22, double m31, double m32) 
+{
+    ctm_changed = true; 
+    tracer.update_ctm(state, m11, m12, m21, m22, m31, m32);
+}
+// Flag the text matrix as changed
+void HTMLRenderer::updateTextMat(GfxState * state) 
+{
+    text_mat_changed = true; 
+}
+// Flag the horizontal scaling as changed
+void HTMLRenderer::updateHorizScaling(GfxState * state)
+{
+    hori_scale_changed = true;
+}
+// Flag the character spacing (letter space) as changed
+void HTMLRenderer::updateCharSpace(GfxState * state)
+{
+    letter_space_changed = true;
+}
+// Flag the word spacing as changed
+void HTMLRenderer::updateWordSpace(GfxState * state)
+{
+    word_space_changed = true;
+}
+// Flag both colors as changed, since check_state_change derives
+// the transparency of fill and stroke color from the render mode
+void HTMLRenderer::updateRender(GfxState * state) 
+{
+    // currently Render is traced for color only
+    // might need something like render_changed later
+    fill_color_changed = true; 
+    stroke_color_changed = true; 
+}
+// Flag the fill color as changed
+void HTMLRenderer::updateFillColorSpace(GfxState * state) 
+{
+    fill_color_changed = true; 
+}
+// Flag the stroke color as changed
+void HTMLRenderer::updateStrokeColorSpace(GfxState * state) 
+{
+    stroke_color_changed = true; 
+}
+// Flag the fill color as changed
+void HTMLRenderer::updateFillColor(GfxState * state) 
+{
+    fill_color_changed = true; 
+}
+// Flag the stroke color as changed
+void HTMLRenderer::updateStrokeColor(GfxState * state) 
+{
+    stroke_color_changed = true; 
+}
+// Flag the clip box as changed and pass the clip on to the tracer
+void HTMLRenderer::clip(GfxState * state)
+{
+    clip_changed = true;
+    tracer.clip(state);
+}
+// Same as clip(), but the tracer is called with its second argument set to true
+// NOTE(review): presumably this selects the even-odd rule - confirm against the tracer
+void HTMLRenderer::eoClip(GfxState * state)
+{
+    clip_changed = true;
+    tracer.clip(state, true);
+}
+// Flag the clip box as changed and let the tracer clip to the stroke path
+void HTMLRenderer::clipToStrokePath(GfxState * state)
+{
+    clip_changed = true;
+    tracer.clip_to_stroke_path(state);
+}
+/*
+ * Put all tracked states back to their initial values
+ * all_changed is set afterwards, such that the next check_state_change
+ * examines every part of the state
+ */
+void HTMLRenderer::reset_state()
+{
+    draw_text_scale = 1.0;
+    cur_font_size = 0.0;
+    memcpy(cur_text_tm, ID_MATRIX, sizeof(cur_text_tm));
+
+    // text state: no font, zero size and spacing, transparent colors
+    cur_text_state.font_info = install_font(nullptr);
+    cur_text_state.font_size = 0;
+    cur_text_state.fill_color.transparent = true;
+    cur_text_state.stroke_color.transparent = true;
+    cur_text_state.letter_space = 0;
+    cur_text_state.word_space = 0;
+    cur_text_state.vertical_align = 0;
+
+    // line state: origin with identity matrix
+    cur_line_state.x = cur_line_state.y = 0;
+    memcpy(cur_line_state.transform_matrix, ID_MATRIX, sizeof(cur_line_state.transform_matrix));
+    cur_line_state.is_char_covered = [this](int index) { return this->is_char_covered(index); };
+
+    // clip state: empty box
+    cur_clip_state.xmin = cur_clip_state.xmax = 0;
+    cur_clip_state.ymin = cur_clip_state.ymax = 0;
+
+    // text positions
+    cur_tx = 0;
+    cur_ty = 0;
+    draw_tx = 0;
+    draw_ty = 0;
+
+    reset_state_change();
+    all_changed = true;
+}
+/*
+ * Clear all flags recording which part of the state has changed
+ */
+void HTMLRenderer::reset_state_change()
+{
+    all_changed
+        = rise_changed = text_pos_changed
+        = font_changed = ctm_changed = text_mat_changed = hori_scale_changed
+        = letter_space_changed = word_space_changed
+        = fill_color_changed = stroke_color_changed
+        = clip_changed
+        = false;
+}
+
+/*
+ * Raise cur_ls to new_ls
+ * cur_ls is left alone unless new_ls is greater
+ */
+template<class NewLineState>
+void set_line_state(NewLineState & cur_ls, NewLineState new_ls)
+{
+    if(!(new_ls > cur_ls))
+        return;
+
+    cur_ls = new_ls;
+}
+
+void HTMLRenderer::check_state_change(GfxState * state)
+{
+    // DEPENDENCY WARNING
+    // don't adjust the order of state checking 
+    
+    new_line_state = NLS_NONE;
+
+    if(all_changed || clip_changed)
+    {
+        HTMLClipState new_clip_state;
+        state->getClipBBox(&new_clip_state.xmin, &new_clip_state.ymin, &new_clip_state.xmax, &new_clip_state.ymax);
+        if(!(equal(cur_clip_state.xmin, new_clip_state.xmin)
+                    && equal(cur_clip_state.xmax, new_clip_state.xmax)
+                    && equal(cur_clip_state.ymin, new_clip_state.ymin)
+                    && equal(cur_clip_state.ymax, new_clip_state.ymax)))
+        {
+            cur_clip_state = new_clip_state;
+            set_line_state(new_line_state, NLS_NEWCLIP);
+        }
+    }
+
+    bool need_recheck_position = false;
+    bool need_rescale_font = false;
+    bool draw_text_scale_changed = false;
+
+    // save current info for later use
+    auto old_text_state = cur_text_state;
+    auto old_line_state = cur_line_state;
+    double old_tm[6];
+    memcpy(old_tm, cur_text_tm, sizeof(old_tm));
+    double old_draw_text_scale = draw_text_scale;
+
+    // text position
+    // we've been tracking the text position positively in the update*** functions
+    if(all_changed || text_pos_changed)
+    {
+        need_recheck_position = true;
+    }
+
+    // font name & size
+    if(all_changed || font_changed)
+    {
+        const FontInfo * new_font_info = install_font(state->getFont());
+
+        if(!(new_font_info->id == cur_text_state.font_info->id))
+        {
+            // The width of the type 3 font text, if shown, is likely to be wrong
+            // So we will create separate (absolute positioned) blocks for them, such that it won't affect other text
+            if((new_font_info->is_type3 || cur_text_state.font_info->is_type3) && (!param.process_type3))
+            {
+                set_line_state(new_line_state, NLS_NEWLINE);
+            }
+            else
+            {
+                set_line_state(new_line_state, NLS_NEWSTATE);
+            }
+            cur_text_state.font_info = new_font_info;
+        }
+
+        /*
+         * For Type 3 fonts, we need to take type3_font_size_scale into consideration
+         */
+        if((new_font_info->is_type3 || cur_text_state.font_info->is_type3) && param.process_type3)
+            need_rescale_font = true;
+
+        double new_font_size = state->getFontSize();
+        if(!equal(cur_font_size, new_font_size))
+        {
+            need_rescale_font = true;
+            cur_font_size = new_font_size;
+        }
+    }  
+
+    // ctm & text ctm & hori scale & rise
+    if(all_changed || ctm_changed || text_mat_changed || hori_scale_changed || rise_changed)
+    {
+        double new_text_tm[6];
+
+        double m1[6];
+        double m2[6];
+
+        //the matrix with horizontal_scale and rise
+        m1[0] = state->getHorizScaling();
+        m1[3] = 1;
+        m1[5] = state->getRise();
+        m1[1] = m1[2] = m1[4] = 0;
+
+        tm_multiply(m2, state->getCTM(), state->getTextMat()); 
+        tm_multiply(new_text_tm, m2, m1);
+
+        if(!tm_equal(new_text_tm, cur_text_tm))
+        {
+            need_recheck_position = true;
+            need_rescale_font = true;
+            memcpy(cur_text_tm, new_text_tm, sizeof(cur_text_tm));
+        }
+    }
+
+    // draw_text_tm, draw_text_scale
+    // depends: font size & ctm & text_ctm & hori scale & rise
+    if(need_rescale_font)
+    {
+        /*
+         * Rescale the font
+         * If the font-size is 1, and the matrix is [10,0,0,10,0,0], we would like to change it to
+         * font-size == 10 and matrix == [1,0,0,1,0,0], 
+         * such that it will be easy and natural for web browsers
+         */
+        double new_draw_text_tm[6];
+        memcpy(new_draw_text_tm, cur_text_tm, sizeof(new_draw_text_tm));
+
+        // see how the tm (together with text_scale_factor2) would change the vector (0,1)
+        double new_draw_text_scale = 1.0/text_scale_factor2 * hypot(new_draw_text_tm[2], new_draw_text_tm[3]);
+
+        double new_draw_font_size = cur_font_size;
+
+        if(is_positive(new_draw_text_scale))
+        {
+            // scale both font size and matrix 
+            new_draw_font_size *= new_draw_text_scale;
+            for(int i = 0; i < 4; ++i)
+                new_draw_text_tm[i] /= new_draw_text_scale;
+        }
+        else
+        {
+            new_draw_text_scale = 1.0;
+        }
+
+        if(is_positive(-new_draw_font_size))
+        {
+            // CSS cannot handle flipped pages
+            new_draw_font_size *= -1;
+
+            for(int i = 0; i < 4; ++i)
+                new_draw_text_tm[i] *= -1;
+        }
+
+        if(!(equal(new_draw_text_scale, draw_text_scale)))
+        {
+            draw_text_scale_changed = true;
+            draw_text_scale = new_draw_text_scale;
+        }
+
+        if(!equal(new_draw_font_size, cur_text_state.font_size))
+        {
+            set_line_state(new_line_state, NLS_NEWSTATE);
+            cur_text_state.font_size = new_draw_font_size;
+        }
+
+        if(!tm_equal(new_draw_text_tm, cur_line_state.transform_matrix, 4))
+        {
+            set_line_state(new_line_state, NLS_NEWLINE);
+            memcpy(cur_line_state.transform_matrix, new_draw_text_tm, sizeof(cur_line_state.transform_matrix));
+        }
+    }
+
+    // see if the new line is compatible with the current line with proper position shift
+    // don't bother doing the heavy job when (new_line_state == NLS_NEWLINE)
+    // depends: text position & transformation
+    if(need_recheck_position && (new_line_state < NLS_NEWLINE))
+    {
+        // TM[4] and/or TM[5] have been changed
+        // To find an offset (dx,dy), which would cancel the effect
+        /*
+         * CurTM * (cur_tx, cur_ty, 1)^T = OldTM * (draw_tx + dx, draw_ty + dy, 1)^T
+         *
+         * the first 4 elements of CurTM and OldTM should be the proportional
+         * otherwise the following text cannot be parallel
+         *
+         * NOTE:
+         * dx,dy are handled by the old state. so they should be multiplied by old_draw_text_scale
+         */
+
+        bool merged = false;
+        double dx = 0;
+        double dy = 0;
+        if(tm_equal(old_line_state.transform_matrix, cur_line_state.transform_matrix, 4))
+        {
+            double det = old_tm[0] * old_tm[3] - old_tm[1] * old_tm[2];
+            if(!equal(det, 0))
+            {
+                double lhs1 = cur_text_tm[0] * cur_tx + cur_text_tm[2] * cur_ty + cur_text_tm[4] - old_tm[0] * draw_tx - old_tm[2] * draw_ty - old_tm[4];
+                double lhs2 = cur_text_tm[1] * cur_tx + cur_text_tm[3] * cur_ty + cur_text_tm[5] - old_tm[1] * draw_tx - old_tm[3] * draw_ty - old_tm[5];
+                /*
+                 * Now the equation system becomes
+                 *
+                 * lhs1 = OldTM[0] * dx + OldTM[2] * dy
+                 * lhs2 = OldTM[1] * dx + OldTM[3] * dy
+                 */
+
+                double inverted[4];
+                inverted[0] =  old_tm[3] / det;
+                inverted[1] = -old_tm[1] / det;
+                inverted[2] = -old_tm[2] / det;
+                inverted[3] =  old_tm[0] / det;
+                dx = inverted[0] * lhs1 + inverted[2] * lhs2;
+                dy = inverted[1] * lhs1 + inverted[3] * lhs2;
+                if(equal(dy, 0))
+                {
+                    // text on a same horizontal line, we can insert positive or negative x-offsets
+                    merged = true;
+                }
+                else if(param.optimize_text)
+                {
+                    // otherwise we merge the lines only when
+                    // - text are not shifted to the left too much
+                    // - text are not moved too high or too low
+                    if((dx * old_draw_text_scale) >= -param.space_threshold * old_text_state.em_size() - EPS)
+                    {
+                        double oldymin = old_text_state.font_info->descent * old_text_state.font_size;
+                        double oldymax = old_text_state.font_info->ascent * old_text_state.font_size;
+                        double ymin = dy * old_draw_text_scale + cur_text_state.font_info->descent * cur_text_state.font_size;
+                        double ymax = dy * old_draw_text_scale + cur_text_state.font_info->ascent * cur_text_state.font_size;
+                        if((ymin <= oldymax + EPS) && (ymax >= oldymin - EPS))
+                        {
+                            merged = true;
+                        }
+                    }
+                }
+            }
+            //else no solution
+        }
+        // else: different rotation: force new line
+
+        if(merged && !equal(state->getHorizScaling(), 0))
+        {
+            html_text_page.get_cur_line()->append_offset(dx * old_draw_text_scale / state->getHorizScaling());
+            if(equal(dy, 0))
+            {
+                cur_text_state.vertical_align = 0;
+            }
+            else
+            {
+                cur_text_state.vertical_align = (dy * old_draw_text_scale);
+                set_line_state(new_line_state, NLS_NEWSTATE);
+            }
+            draw_tx = cur_tx;
+            draw_ty = cur_ty;
+        }
+        else
+        {
+            set_line_state(new_line_state, NLS_NEWLINE);
+        }
+    }
+    else
+    {
+        // no vertical shift if no need to check position
+        cur_text_state.vertical_align = 0;
+    }
+
+    // letter space
+    // depends: draw_text_scale
+    if(all_changed || letter_space_changed || draw_text_scale_changed)
+    {
+        double new_letter_space = state->getCharSpace() * draw_text_scale;
+        if(!equal(new_letter_space, cur_text_state.letter_space))
+        {
+            cur_text_state.letter_space = new_letter_space;
+            set_line_state(new_line_state, NLS_NEWSTATE);
+        }
+    }
+
+    // word space
+    // depends draw_text_scale
+    if(all_changed || word_space_changed || draw_text_scale_changed)
+    {
+        double new_word_space = state->getWordSpace() * draw_text_scale;
+        if(!equal(new_word_space, cur_text_state.word_space))
+        {
+            cur_text_state.word_space = new_word_space;
+            set_line_state(new_line_state, NLS_NEWSTATE);
+        }
+    }
+
+    // fill color
+    if((!(param.fallback)) && (all_changed || fill_color_changed))
+    {
+        // * PDF Spec. Table 106 –Text rendering modes
+        static const char FILL[8] = { true, false, true, false, true, false, true, false };
+        
+        int idx = state->getRender();
+        assert((idx >= 0) && (idx < 8));
+        Color new_fill_color;
+        if(FILL[idx])
+        {
+            new_fill_color.transparent = false;
+            state->getFillRGB(&new_fill_color.rgb);
+        }
+        else
+        {
+            new_fill_color.transparent = true;
+        }
+        if(!(new_fill_color == cur_text_state.fill_color))
+        {
+            cur_text_state.fill_color = new_fill_color;
+            set_line_state(new_line_state, NLS_NEWSTATE);
+        }
+    }
+
+    // stroke color
+    if((!(param.fallback)) && (all_changed || stroke_color_changed))
+    {
+        // * PDF Spec. Table 106 –  Text rendering modes
+        static const char STROKE[8] = { false, true, true, false, false, true, true, false };
+        
+        int idx = state->getRender();
+        assert((idx >= 0) && (idx < 8));
+        Color new_stroke_color;
+        // stroke
+        if(STROKE[idx])
+        {
+            new_stroke_color.transparent = false;
+            state->getStrokeRGB(&new_stroke_color.rgb);
+        }
+        else
+        {
+            new_stroke_color.transparent = true;
+        }
+        if(!(new_stroke_color == cur_text_state.stroke_color))
+        {
+            cur_text_state.stroke_color = new_stroke_color;
+            set_line_state(new_line_state, NLS_NEWSTATE);
+        }
+    }
+
+    reset_state_change();
+}
+
+/*
+ * Get the current text line ready for the next string.
+ *
+ * Driven by new_line_state: re-applies the clip state when requested, then
+ * either opens a new line at the current position (shifted by the text rise)
+ * or pads the current line with the horizontal distance between cur_tx and
+ * draw_tx. Any pending state change is finally recorded on the line.
+ */
+void HTMLRenderer::prepare_text_line(GfxState * state)
+{
+    // nothing to continue: request a clip
+    // (NLS_NEWCLIP is expected to rank at least as high as NLS_NEWLINE, so a line is opened below)
+    if(!(html_text_page.get_cur_line()))
+        new_line_state = NLS_NEWCLIP;
+
+    if(new_line_state >= NLS_NEWCLIP)
+        html_text_page.clip(cur_clip_state);
+
+    if(new_line_state < NLS_NEWLINE)
+    {
+        // stay on the current line:
+        // turn the distance between cur_tx and draw_tx into an offset
+        const double gap = (cur_tx - draw_tx) * draw_text_scale;
+        if(!equal(gap, 0))
+        {
+            html_text_page.get_cur_line()->append_offset(gap);
+            draw_tx += gap / draw_text_scale;
+        }
+    }
+    else
+    {
+        // record the position of the new line, text rise included
+        double rise_dx, rise_dy;
+        state->textTransformDelta(0, state->getRise(), &rise_dx, &rise_dy);
+        state->transform(state->getCurX() + rise_dx, state->getCurY() + rise_dy, &cur_line_state.x, &cur_line_state.y);
+
+        if (param.correct_text_visibility)
+            cur_line_state.first_char_index = get_char_count();
+
+        html_text_page.open_new_line(cur_line_state);
+
+        // a fresh line starts without any vertical shift
+        cur_text_state.vertical_align = 0;
+
+        // resync the drawing position with the text position
+        draw_tx = cur_tx;
+        draw_ty = cur_ty;
+    }
+
+    if(new_line_state != NLS_NONE)
+        html_text_page.get_cur_line()->append_state(cur_text_state);
+}
+
+} //namespace pdf2htmlEX
diff --git a/src/HTMLRenderer/text.cc b/src/HTMLRenderer/text.cc
new file mode 100644
index 0000000..e58a17a
--- /dev/null
+++ b/src/HTMLRenderer/text.cc
@@ -0,0 +1,166 @@
+/*
+ * text.cc
+ *
+ * Handling text & fonts, and related stuff
+ *
+ * Copyright (C) 2012 Lu Wang <coolwanglu@gmail.com>
+ */
+
+
+#include <algorithm>
+
+#include "HTMLRenderer.h"
+
+#include "util/namespace.h"
+#include "util/unicode.h"
+
+//#define HR_DEBUG(x)  (x)
+#define HR_DEBUG(x)
+
+namespace pdf2htmlEX {
+
+using std::none_of;
+using std::cerr;
+using std::endl;
+
+/*
+ * Output one PDF string to the current HTML text line.
+ *
+ * Strings drawn with no font, with a writing-mode font, or (unless
+ * param.process_type3 is set) with a Type 3 font are skipped here.
+ * Otherwise every char is appended to the current line together with its
+ * advance, and cur_tx/cur_ty as well as draw_tx/draw_ty are moved by the
+ * total displacement of the string.
+ */
+void HTMLRenderer::drawString(GfxState * state, GooString * s)
+{
+    if(s->getLength() == 0)
+        return;
+
+    auto font = state->getFont();
+    const double cur_letter_space  = state->getCharSpace();
+    const double cur_word_space    = state->getWordSpace();
+    const double cur_horiz_scaling = state->getHorizScaling();
+
+    // Writing mode fonts and Type 3 fonts are rendered as images
+    // I don't find a way to display writing mode fonts in HTML except for one div for each character, which is too costly
+    // For type 3 fonts, due to the font matrix, still it's hard to show it on HTML
+    if(font == nullptr)
+        return;
+    if(font->getWMode())
+        return;
+    if((font->getType() == fontType3) && (!param.process_type3))
+        return;
+
+    // see if the line has to be closed due to state change
+    check_state_change(state);
+    prepare_text_line(state);
+
+    // Now ready to output
+    char *cursor = s->getCString();
+    int remaining = s->getLength();
+
+    // accumulated displacement of chars in this string, in text object space
+    double total_dx = 0;
+    double total_dy = 0;
+
+    // filled in by getNextChar for every char:
+    // advance (ax, ay) and origin (ox, oy) in glyph space, char code and its unicodes
+    double ax, ay;
+    double ox, oy;
+    int uLen;
+    CharCode code;
+    Unicode *u = nullptr;
+
+    HR_DEBUG(printf("HTMLRenderer::drawString:len=%d\n", remaining));
+
+    while (remaining > 0)
+    {
+        const auto consumed = font->getNextChar(cursor, remaining, &code, &u, &uLen, &ax, &ay, &ox, &oy);
+        HR_DEBUG(printf("HTMLRenderer::drawString:unicode=%lc(%d)\n", (wchar_t)u[0], u[0]));
+
+        if(!(equal(ox, 0) && equal(oy, 0)))
+            cerr << "TODO: non-zero origins" << endl;
+
+        // displacement of current char, in text object space, including letter space but not word space
+        const double ddx = ax * cur_font_size + cur_letter_space;
+        const double ddy = ay * cur_font_size;
+        tracer.draw_char(state, total_dx, total_dy, ax, ay);
+
+        /*
+         * A single-byte ' ' is a space by standard
+         * however some PDF will use ' ' as a normal encoding slot
+         * such that it will be mapped to other unicodes
+         * In that case, when space_as_offset is on, we will simply ignore that character...
+         *
+         * Checking mapped unicode may or may not work
+         * There are always ugly PDF files with no useful info at all.
+         */
+        const bool is_space = (consumed == 1) && (*cursor == ' ');
+
+        if(is_space && (param.space_as_offset))
+        {
+            html_text_page.get_cur_line()->append_padding_char();
+            // ignore horiz_scaling, as it has been merged into CTM
+            html_text_page.get_cur_line()->append_offset((ax * cur_font_size + cur_letter_space + cur_word_space) * draw_text_scale);
+        }
+        else if((param.decompose_ligature) && (uLen > 1) && none_of(u, u+uLen, is_illegal_unicode))
+        {
+            // keep all unicodes of the ligature
+            html_text_page.get_cur_line()->append_unicodes(u, uLen, ddx);
+        }
+        else
+        {
+            // reduce the char to a single unicode
+            Unicode uu;
+            if(cur_text_state.font_info->use_tounicode)
+                uu = check_unicode(u, uLen, code, font);
+            else
+                uu = unicode_from_font(code, font);
+            html_text_page.get_cur_line()->append_unicodes(&uu, 1, ddx);
+
+            /*
+             * In PDF, word_space is appended if (n == 1 and *p = ' ')
+             * but in HTML, word_space is appended if (uu == ' ')
+             * compensate whenever the two rules disagree
+             */
+            int space_count = (is_space ? 1 : 0) - ((uu == ' ') ? 1 : 0);
+            if(space_count != 0)
+                html_text_page.get_cur_line()->append_offset(cur_word_space * draw_text_scale * space_count);
+        }
+
+        total_dx += ddx * cur_horiz_scaling;
+        total_dy += ddy;
+        if (is_space)
+            total_dx += cur_word_space * cur_horiz_scaling;
+
+        cursor += consumed;
+        remaining -= consumed;
+    }
+
+    // move both the text position and the drawing position past the string
+    cur_tx += total_dx;
+    cur_ty += total_dy;
+
+    draw_tx += total_dx;
+    draw_ty += total_dy;
+}
+
+/*
+ * Tell whether the char with the given index is marked as covered
+ * by covered_text_detector.
+ *
+ * An index outside the table is reported on stderr and treated as
+ * "not covered".
+ */
+bool HTMLRenderer::is_char_covered(int index)
+{
+    // Bind by reference: a by-value `auto` copies the whole table on every
+    // call when the getter returns a reference. A const reference is also
+    // safe if the getter returns by value (the temporary's lifetime is extended).
+    const auto & covered = covered_text_detector.get_chars_covered();
+    if (index < 0 || index >= (int)covered.size())
+    {
+        std::cerr << "Warning: HTMLRenderer::is_char_covered: index out of bound: "
+                << index << ", size: " << covered.size() <<endl;
+        return false;
+    }
+    return covered[index];
+}
+
+} // namespace pdf2htmlEX
diff --git a/src/HTMLState.h b/src/HTMLState.h
new file mode 100644
index 0000000..ef7e29f
--- /dev/null
+++ b/src/HTMLState.h
@@ -0,0 +1,82 @@
+/*
+ * Header file for HTMLState
+ * Copyright (C) 2013 Lu Wang <coolwanglu@gmail.com>
+ */
+#ifndef HTMLSTATE_H__
+#define HTMLSTATE_H__
+
+#include <functional>
+
+#include "Color.h"
+
+namespace pdf2htmlEX {
+
+// Per-font data referenced by the text states that use the font.
+struct FontInfo
+{
+    long long id; // stored as the FONT_ID of a state when a line is prepared
+    bool use_tounicode; // drawString: true -> check_unicode(), false -> unicode_from_font()
+    int em_size; // when 0, single_space_offset() leaves out the width of the space glyph
+    double space_width; // multiplied by font_size to get the width of a ' '
+    double ascent, descent; // multiplied by font_size to get the vertical extents of the text
+    bool is_type3; // presumably set for Type 3 fonts (see below); not read in this part of the code
+    /*
+     * As Type 3 fonts have a font matrix
+     * a glyph of 1pt can be very large or very small
+     * however it might not be true for other font formats such as ttf
+     *
+     * Therefore when we save a Type 3 font into ttf,
+     * we have to scale the font to about 1,
+     * then apply the scaling when using the font
+     *
+     * The scaling factor is stored as font_size_scale
+     *
+     * The value is 1 for other fonts
+     */
+    double font_size_scale;
+};
+
+// Text style in effect for a run of characters on a line.
+struct HTMLTextState
+{
+    const FontInfo * font_info;
+    double font_size;
+    Color fill_color;
+    Color stroke_color;
+    double letter_space;
+    double word_space;
+
+    // relative to the previous state
+    double vertical_align;
+
+    // Horizontal advance caused by a single ' ' char:
+    // word space plus letter space, plus the scaled width of the space glyph
+    // unless the font reports an em_size of 0.
+    double single_space_offset(void) const {
+        const double spacing = word_space + letter_space;
+        if(font_info->em_size == 0)
+            return spacing;
+        return spacing + font_info->space_width * font_size;
+    }
+    // em size of this state: font_size times the (ascent - descent) span of the font
+    double em_size(void) const {
+        const double span = font_info->ascent - font_info->descent;
+        return font_size * span;
+    }
+};
+
+// Position and bookkeeping data of one text line.
+struct HTMLLineState
+{
+    // line position; HTMLTextLine::dump_text emits it relative to the clip origin
+    double x,y;
+    // installed as the transform-matrix CSS class of the line
+    double transform_matrix[4];
+    // The page-wide char index (in drawing order) of the first char in this line.
+    // Stays at -1 (see the constructor) unless it is assigned when a line is opened;
+    // a negative value makes HTMLTextLine::dump_chars skip the covered-char check.
+    int first_char_index;
+    // A function to determine whether a char is covered at a given index.
+    std::function<bool(int)> is_char_covered;
+
+    // Only first_char_index is initialized here; x, y and transform_matrix are not.
+    HTMLLineState(): first_char_index(-1) { }
+};
+
+// Bounds of a clip rectangle.
+// HTMLTextLine::clip() keeps xmin/ymin as the origin that line positions are made relative to.
+struct HTMLClipState
+{
+    double xmin, xmax, ymin, ymax;
+};
+
+} // namespace pdf2htmlEX 
+
+#endif //HTMLSTATE_H__
diff --git a/src/HTMLTextLine.cc b/src/HTMLTextLine.cc
new file mode 100644
index 0000000..a0be286
--- /dev/null
+++ b/src/HTMLTextLine.cc
@@ -0,0 +1,734 @@
+/*
+ * HTMLTextLine.cc
+ *
+ * Generate and optimize HTML for one line
+ *
+ * Copyright (C) 2012,2013 Lu Wang <coolwanglu@gmail.com>
+ */
+
+#include <cmath>
+#include <algorithm>
+
+#include "HTMLTextLine.h"
+
+#include "util/encoding.h"
+#include "util/css_const.h"
+
+namespace pdf2htmlEX {
+
+using std::min;
+using std::max;
+using std::vector;
+using std::ostream;
+using std::cerr;
+using std::endl;
+using std::find;
+using std::abs;
+
+// Create an empty line for the given line state, sharing `param` and `all_manager`.
+// The clip origin and the accumulated width start at 0.
+// ascent/descent are not set here; prepare() assigns them.
+HTMLTextLine::HTMLTextLine (const HTMLLineState & line_state, const Param & param, AllStateManager & all_manager) 
+    :param(param)
+    ,all_manager(all_manager) 
+    ,line_state(line_state)
+    ,clip_x1(0)
+    ,clip_y1(0)
+    ,width(0)
+{ }
+
+/*
+ * Append one char, given as `l` unicodes starting at `u`, and add `width`
+ * to the accumulated width of the line.
+ *
+ * - l == 1: the code point itself is stored in `text` (clamped to INT_MAX)
+ * - l  > 1: the sequence is stored in `decomposed_text`, and `text` gets the
+ *           negative handle -(index + 1), which dump_char() resolves
+ * - l <= 0: no char is stored, only the width is accumulated
+ */
+void HTMLTextLine::append_unicodes(const Unicode * u, int l, double width)
+{
+    if (l == 1) 
+        text.push_back(min(u[0], (unsigned)INT_MAX));
+    else if (l > 1)
+    {
+        // build the handle in signed arithmetic: negating the unsigned size()
+        // would rely on an implementation-defined conversion to get a negative value
+        text.push_back(-static_cast<int>(decomposed_text.size()) - 1);
+        decomposed_text.emplace_back();
+        decomposed_text.back().assign(u, u + l);
+    }
+    this->width += width;
+}
+
+/*
+ * Append a horizontal offset of `width` after the last real char of the line
+ * and add it to the accumulated width.
+ */
+void HTMLTextLine::append_offset(double width)
+{
+    /*
+     * If the last offset is very thin, we can ignore it and directly use it
+     * But this should not happen often, and we will also filter near-zero offsets when outputting them
+     * So don't check it.
+     *
+     * Offset must be appended immediately after the last real (non-padding) char, or the text optimizing
+     * algorithm may be confused: it may wrongly convert offsets at the beginning of a line to word-space.
+     */
+
+    this->width += width;
+
+    // step back over trailing padding entries (stored as 0)
+    auto anchor = text.size();
+    while (anchor > 0 && text[anchor - 1] == 0)
+        --anchor;
+
+    // merge with the previous offset when it sits at the same position
+    const bool same_position = (!offsets.empty()) && (offsets.back().start_idx == anchor);
+    if(same_position)
+    {
+        offsets.back().width += width;
+        return;
+    }
+    offsets.emplace_back(anchor, width);
+}
+
+/*
+ * Record `text_state` as the state starting at the current end of the text.
+ * If the last recorded state already starts there, it is overwritten
+ * instead of adding a new one.
+ */
+void HTMLTextLine::append_state(const HTMLTextState & text_state)
+{
+    const auto text_pos = text.size();
+    const bool reuse_last = (!states.empty()) && (states.back().start_idx == text_pos);
+    if(!reuse_last)
+    {
+        states.emplace_back();
+        auto & fresh = states.back();
+        fresh.start_idx = text_pos;
+        fresh.hash_umask = 0;
+    }
+
+    // assign through the HTMLTextState part only, start_idx/hash_umask are kept
+    HTMLTextState & target = states.back();
+    target = text_state;
+    // apply font scale
+    target.font_size *= target.font_info->font_size_scale;
+}
+
+/*
+ * Write the char stored at text[pos] to `out`.
+ *
+ * - positive entry: a single code point
+ * - negative entry: handle -(index + 1) of a sequence in `decomposed_text`
+ * - zero: nothing is written
+ */
+void HTMLTextLine::dump_char(std::ostream & out, int pos)
+{
+    int c = text[pos];
+    if (c > 0)
+    {
+        Unicode u = c;
+        writeUnicodes(out, &u, 1);
+    }
+    else if (c < 0)
+    {
+        // refer to the stored sequence in place instead of copying it for every char
+        auto & dt = decomposed_text[- c - 1];
+        writeUnicodes(out, &dt.front(), dt.size());
+    }
+}
+
+/*
+ * Write `len` chars starting at text[begin] to `out`.
+ *
+ * When the line carries a non-negative first_char_index, runs of covered
+ * chars are wrapped in a <span> with transparent fill and stroke colors.
+ */
+void HTMLTextLine::dump_chars(ostream & out, int begin, int len)
+{
+    static const Color transparent(0, 0, 0, true);
+
+    const int stop = begin + len;
+
+    // no char index recorded for this line: plain dump
+    if (line_state.first_char_index < 0)
+    {
+        for (int pos = begin; pos < stop; ++pos)
+            dump_char(out, pos);
+        return;
+    }
+
+    bool invisible_group_open = false;
+    for (int pos = begin; pos < stop; ++pos)
+    {
+        const bool covered = line_state.is_char_covered(line_state.first_char_index + pos);
+        if (covered && !invisible_group_open)
+        {
+            // entering a run of covered chars
+            out << "<span class=\"" << all_manager.fill_color.get_css_class_name()
+                << all_manager.fill_color.install(transparent) << " " << all_manager.stroke_color.get_css_class_name()
+                << all_manager.stroke_color.install(transparent) << "\">";
+            invisible_group_open = true;
+        }
+        else if (!covered && invisible_group_open)
+        {
+            // back to visible chars
+            out << "</span>";
+            invisible_group_open = false;
+        }
+        dump_char(out, pos);
+    }
+
+    // do not leave a group open at the end of the range
+    if (invisible_group_open)
+        out << "</span>";
+}
+
+/*
+ * Write the HTML of this line to `out`.
+ *
+ * Nothing is written for an empty line, or (after a warning) when no state
+ * starts at text index 0. Otherwise a <div> is opened for the line, and the
+ * states are walked in order: each state is opened as a child of the cheapest
+ * matching state still open on `stack`, then the text and offsets it covers
+ * are dumped. All open states and the <div> are closed at the end.
+ */
+void HTMLTextLine::dump_text(ostream & out)
+{
+    /*
+     * Each Line is an independent absolute positioned block
+     * so even we have a few states or offsets, we may omit them
+     */
+    if(text.empty())
+        return;
+
+    if(states.empty() || (states[0].start_idx != 0))
+    {
+        cerr << "Warning: text without a style! Must be a bug in pdf2htmlEX" << endl;
+        return;
+    }
+
+    // Start Output
+    {
+        // open <div> for the current text line
+        // the position is emitted relative to the clip origin (clip_x1, clip_y1)
+        out << "<div class=\"" << CSS::LINE_CN
+            << " " << CSS::TRANSFORM_MATRIX_CN << all_manager.transform_matrix.install(line_state.transform_matrix)
+            << " " << CSS::LEFT_CN             << all_manager.left.install(line_state.x - clip_x1)
+            << " " << CSS::HEIGHT_CN           << all_manager.height.install(ascent)
+            << " " << CSS::BOTTOM_CN           << all_manager.bottom.install(line_state.y - clip_y1)
+            ;
+        // it will be closed by the first state
+    }
+
+    // states currently open, innermost last
+    std::vector<State*> stack;
+    // a sentinel at the bottom, so that loops over the stack can stop on nullptr
+    stack.push_back(nullptr);
+
+    // accumulated horizontal offset that has not been output yet
+    double dx = 0;
+
+    // whenever a negative offset appears, we should not pop out that <span>
+    // otherwise the effect of negative margin-left would disappear
+    size_t last_text_pos_with_negative_offset = 0;
+    size_t cur_text_idx = 0;
+
+    auto cur_offset_iter = offsets.begin();
+    // state_iter1 is the current state, state_iter2 the one after it
+    // NOTE(review): on the last pass state_iter2 is already states.end() and is incremented once more
+    for(auto state_iter2 = states.begin(), state_iter1 = state_iter2++; 
+            state_iter1 != states.end(); 
+            ++state_iter1, ++state_iter2)
+    {
+        // export current state, find a closest parent
+        { 
+            // greedy
+            double vertical_align = state_iter1->vertical_align;
+            int best_cost = State::HASH_ID_COUNT + 1;
+            // we have a nullptr at the beginning, so no need to check for rend
+            for(auto iter = stack.rbegin(); *iter; ++iter)
+            {
+                int cost = state_iter1->diff(**iter);
+                if(!equal(vertical_align,0))
+                    ++cost;
+
+                if(cost < best_cost)
+                {
+                    // close everything above the better parent
+                    // NOTE(review): pop_back() runs while `iter` still walks `stack`;
+                    // this relies on the iterator staying usable afterwards -- worth confirming
+                    while(stack.back() != *iter)
+                    {
+                        stack.back()->end(out);
+                        stack.pop_back();
+                    }
+                    best_cost = cost;
+                    state_iter1->vertical_align = vertical_align;
+
+                    if(best_cost == 0)
+                        break;
+                }
+
+                // cannot go further
+                if((*iter)->start_idx <= last_text_pos_with_negative_offset)
+                    break;
+
+                // vertical_align is relative to the parent, accumulate while going down the stack
+                vertical_align += (*iter)->vertical_align;
+            }
+            // 
+            state_iter1->ids[State::VERTICAL_ALIGN_ID] = all_manager.vertical_align.install(state_iter1->vertical_align);
+            // export the diff between *state_iter1 and stack.back()
+            state_iter1->begin(out, stack.back());
+            stack.push_back(&*state_iter1);
+        }
+
+        // [state_iter1->start_idx, text_idx2) are covered by the current state
+        size_t text_idx2 = (state_iter2 == states.end()) ? text.size() : state_iter2->start_idx;
+
+        // dump all text and offsets before next state
+        while(true)
+        {
+            if((cur_offset_iter != offsets.end()) 
+                    && (cur_offset_iter->start_idx <= cur_text_idx))
+            {
+                if(cur_offset_iter->start_idx > text_idx2)
+                    break;
+                // next is offset
+                // add what was left over from the previous offsets
+                double target = cur_offset_iter->width + dx;
+                double actual_offset = 0;
+
+                //ignore near-zero offsets
+                if(std::abs(target) <= param.h_eps)
+                {
+                    actual_offset = 0;
+                }
+                else
+                {
+                    bool done = false;
+                    // check if the offset is equivalent to a single ' '
+                    if(!(state_iter1->hash_umask & State::umask_by_id(State::WORD_SPACE_ID)))
+                    {
+                        double space_off = state_iter1->single_space_offset();
+                        if(std::abs(target - space_off) <= param.h_eps)
+                        {
+                            Unicode u = ' ';
+                            writeUnicodes(out, &u, 1);
+                            actual_offset = space_off;
+                            done = true;
+                        }
+                    }
+
+                    // finally, just dump it
+                    if(!done)
+                    {
+                        long long wid = all_manager.whitespace.install(target, &actual_offset);
+
+                        if(!equal(actual_offset, 0))
+                        {
+                            if(is_positive(-actual_offset))
+                                last_text_pos_with_negative_offset = cur_text_idx;
+
+                            double threshold = state_iter1->em_size() * (param.space_threshold);
+
+                            // the span contains a ' ' only when the offset reaches the threshold
+                            out << "<span class=\"" << CSS::WHITESPACE_CN
+                                << ' ' << CSS::WHITESPACE_CN << wid << "\">" << (target > (threshold - EPS) ? " " : "") << "</span>";
+                        }
+                    }
+                }
+                // carry the part that was not output over to the next offset
+                dx = target - actual_offset;
+                ++ cur_offset_iter;
+            }
+            else
+            {
+                if(cur_text_idx >= text_idx2)
+                    break;
+                // next is text
+                // dump up to the next offset or the end of the state, whichever comes first
+                size_t next_text_idx = text_idx2;
+                if((cur_offset_iter != offsets.end()) && (cur_offset_iter->start_idx) < next_text_idx)
+                    next_text_idx = cur_offset_iter->start_idx;
+                dump_chars(out, cur_text_idx, next_text_idx - cur_text_idx);
+                cur_text_idx = next_text_idx;
+            }
+        }
+    }
+
+    // we have a nullptr in the bottom
+    while(stack.back())
+    {
+        stack.back()->end(out);
+        stack.pop_back();
+    }
+
+    out << "</div>";
+}
+
+// Drop everything that has been appended to this line.
+void HTMLTextLine::clear(void)
+{
+    states.clear();
+    offsets.clear();
+    text.clear();
+    // decomposed sequences are only reachable through the negative entries of `text`,
+    // so they are dead once `text` is empty
+    decomposed_text.clear();
+    // the width is accumulated by append_unicodes()/append_offset(), start over
+    width = 0;
+}
+
+// Remember the lower bounds of the clip box;
+// dump_text() subtracts them from the position of the line.
+void HTMLTextLine::clip(const HTMLClipState & clip_state)
+{
+    clip_y1 = clip_state.ymin;
+    clip_x1 = clip_state.xmin;
+}
+
+/*
+ * Install the values of every state into the state managers, hash the
+ * states, and compute the ascent/descent of the whole line.
+ */
+void HTMLTextLine::prepare(void)
+{
+    // the resulting ascent determines the height of the div
+    ascent = 0;
+    descent = 0;
+
+    // vertical shift of the current state, accumulated over all previous states
+    double shift = 0;
+
+    // note that vertical_align cannot be calculated here
+    for(auto & st : states)
+    {
+        auto font_info = st.font_info;
+        st.ids[State::FONT_ID] = font_info->id;
+        st.ids[State::FONT_SIZE_ID]    = all_manager.font_size.install(st.font_size);
+        st.ids[State::FILL_COLOR_ID]   = all_manager.fill_color.install(st.fill_color);
+        st.ids[State::STROKE_COLOR_ID] = all_manager.stroke_color.install(st.stroke_color);
+        st.ids[State::LETTER_SPACE_ID] = all_manager.letter_space.install(st.letter_space);
+        st.ids[State::WORD_SPACE_ID]   = all_manager.word_space.install(st.word_space);
+        st.hash();
+
+        shift += st.vertical_align;
+
+        // keep the highest top ...
+        const double top = shift + font_info->ascent * st.font_size;
+        if(top > ascent)
+            ascent = top;
+
+        // ... and the lowest bottom
+        const double bottom = shift + font_info->descent * st.font_size;
+        if(bottom < descent)
+            descent = bottom;
+    }
+}
+
+
+// Run the optimizer selected by param.optimize_text:
+// 3 picks the aggressive one, any other value the normal one.
+// `lines` is handed to the optimizer to collect the resulting lines.
+void HTMLTextLine::optimize(std::vector<HTMLTextLine*> & lines)
+{
+    const bool aggressive = (param.optimize_text == 3);
+    if(!aggressive)
+    {
+        optimize_normal(lines);
+        return;
+    }
+    optimize_aggressive(lines);
+}
+/*
+ * Adjust letter space and word space in order to reduce the number of HTML elements
+ * May also unmask word space
+ */
+// For every state of the line:
+//  1. if one offset width is used by more than half of the chars covered by the state,
+//     fold it into letter_space and rewrite the offsets accordingly
+//  2. if the covered text contains no ' ', pick a word_space such that the most
+//     frequent wide offset equals a single ' ' (dump_text then prints it as a space)
+// Finally the rewritten offsets replace the old ones and this line is appended to `lines`.
+void HTMLTextLine::optimize_normal(std::vector<HTMLTextLine*> & lines)
+{
+    // remove useless states in the end
+    while((!states.empty()) && (states.back().start_idx >= text.size()))
+        states.pop_back();
+
+    assert(!states.empty());
+
+    const long long word_space_umask = State::umask_by_id(State::WORD_SPACE_ID);
+
+    // for optimization, we need accurate values
+    auto & ls_manager = all_manager.letter_space;
+    auto & ws_manager = all_manager.word_space;
+    
+    // statistics of widths
+    std::map<double, size_t> width_map;
+    // store optimized offsets
+    std::vector<Offset> new_offsets;
+    new_offsets.reserve(offsets.size());
+
+    auto offset_iter1 = offsets.begin();
+    // state_iter1 is the current state, state_iter2 the one after it
+    // NOTE(review): on the last pass state_iter2 is already states.end() and is incremented once more
+    for(auto state_iter2 = states.begin(), state_iter1 = state_iter2++; 
+            state_iter1 != states.end(); 
+            ++state_iter1, ++state_iter2)
+    {
+        // [text_idx1, text_idx2) is the text covered by the current state
+        const size_t text_idx1 = state_iter1->start_idx;
+        const size_t text_idx2 = (state_iter2 == states.end()) ? text.size() : state_iter2->start_idx;
+        size_t text_count = text_idx2 - text_idx1;
+
+        // there might be some offsets before the first state
+        // offsets located at or before the start of the state are copied unchanged
+        while((offset_iter1 != offsets.end()) 
+                && (offset_iter1->start_idx <= text_idx1))
+        {
+            new_offsets.push_back(*(offset_iter1++));
+        }
+
+        // find the last offset covered by the current state
+        auto offset_iter2 = offset_iter1;
+        for(; (offset_iter2 != offsets.end()) && (offset_iter2->start_idx <= text_idx2); ++offset_iter2) { }
+
+        // There are `offset_count` <span>'s, the target is to reduce this number
+        size_t offset_count = offset_iter2 - offset_iter1;
+        assert(text_count >= offset_count);
+        
+        // Optimize letter space
+        // how much letter_space is changed
+        // will be later used for optimizing word space
+        double letter_space_diff = 0; 
+        width_map.clear();
+
+        // In some PDF files all letter spaces are implemented as position shifts between each letter
+        // try to simplify it with a proper letter space
+        if(offset_count > 0)
+        {
+            // mark the current letter_space
+            // chars without an offset count as width 0
+            if(text_count > offset_count)
+                width_map.insert(std::make_pair(0, text_count - offset_count));
+
+            // count the offsets per width, widths within EPS share an entry
+            for(auto off_iter = offset_iter1; off_iter != offset_iter2; ++off_iter)
+            {
+                const double target = off_iter->width;
+                auto iter = width_map.lower_bound(target-EPS);
+                if((iter != width_map.end()) && (std::abs(iter->first - target) <= EPS))
+                {
+                    ++ iter->second;
+                }
+                else
+                {
+                    width_map.insert(iter, std::make_pair(target, 1));
+                }
+            }
+            
+            // TODO snapping the widths may result a better result
+            // e.g. for (-0.7 0.6 -0.2 0.3 10 10), 0 is better than 10
+            double most_used_width = 0;
+            size_t max_count = 0;
+            for(auto iter = width_map.begin(); iter != width_map.end(); ++iter)
+            {
+                if(iter->second > max_count)
+                {
+                    most_used_width = iter->first;
+                    max_count = iter->second;
+                }
+            }
+
+            // negative letter space may cause problems
+            // also keep the old value unless the winner covers more than half of the chars
+            if((max_count <= text_count / 2) || (!is_positive(state_iter1->letter_space + most_used_width)))
+            { 
+                // the old value is the best
+                // just copy old offsets
+                new_offsets.insert(new_offsets.end(), offset_iter1, offset_iter2);
+            }
+            else
+            {
+                // now we would like to adjust letter space to most_used width
+                
+                // install new letter space
+                // the manager writes the value it actually uses back into letter_space
+                const double old_ls = state_iter1->letter_space;
+                state_iter1->ids[State::LETTER_SPACE_ID] = ls_manager.install(old_ls + most_used_width, &(state_iter1->letter_space));
+                letter_space_diff = old_ls - state_iter1->letter_space;
+                // update offsets
+                auto off_iter = offset_iter1; 
+                // re-count number of offsets
+                offset_count = 0;
+                // every char now needs a correction of letter_space_diff after it,
+                // on top of the offset (if any) that was already there
+                for(size_t cur_text_idx = text_idx1; cur_text_idx < text_idx2; ++cur_text_idx)
+                {
+                    double cur_width = 0;
+                    if((off_iter != offset_iter2) && (off_iter->start_idx == cur_text_idx + 1))
+                    {
+                        cur_width = off_iter->width + letter_space_diff;
+                        ++off_iter;
+                    }
+                    else
+                    {
+                        cur_width = letter_space_diff ;
+                    }
+                    if(!equal(cur_width, 0))
+                    {
+                        new_offsets.emplace_back(cur_text_idx+1, cur_width);
+                        ++ offset_count;
+                    }
+                }
+            }
+        }
+
+        // Optimize word space
+        
+        // In some PDF files all spaces are converted into positioning shift
+        // We may try to change (some of) them to ' ' by adjusting word_space
+        // for now, we consider only the no-space scenario
+        // which also includes the case when param.space_as_offset is set
+
+        // get the text segment covered by current state (*state_iter1)
+        const auto text_iter1 = text.begin() + text_idx1;
+        const auto text_iter2 = text.begin() + text_idx2;
+        if(find(text_iter1, text_iter2, ' ') == text_iter2)
+        {
+            // if there is not any space, we may change the value of word_space arbitrarily
+            // note that we may only change word space, no offset will be affected
+            // The actual effect will emerge during flushing, where it could be detected that an offset can be optimized as a single space character
+            
+            if(offset_count > 0)
+            {
+                double threshold = (state_iter1->em_size()) * (param.space_threshold);
+                // set word_space for the most frequently used offset
+                double most_used_width = 0;
+                size_t max_count = 0;
+
+                // if offset_count > 0, we must have updated width_map in the previous step
+                // find the most frequent width, with new letter space applied
+                for(auto iter = width_map.begin(); iter != width_map.end(); ++iter)
+                {
+                    double fixed_width = iter->first + letter_space_diff; // this is the actual offset in HTML
+                    // we don't want to add spaces for tiny gaps, or even negative shifts
+                    if((fixed_width >= threshold - EPS) && (iter->second > max_count))
+                    {
+                        max_count = iter->second;
+                        most_used_width = fixed_width;
+                    }
+                }
+
+                // NOTE(review): when no width reaches the threshold, most_used_width is still 0 here
+                // and the word_space installed below cancels single_space_offset() -- confirm this is intended
+                state_iter1->word_space = 0; // clear word_space for single_space_offset
+                double new_word_space = most_used_width - state_iter1->single_space_offset();
+                state_iter1->ids[State::WORD_SPACE_ID] = ws_manager.install(new_word_space, &(state_iter1->word_space)); // install new word_space
+                state_iter1->hash_umask &= (~word_space_umask); // mark that the word_space is not free
+            }
+            else // there is no offset at all
+            {
+                state_iter1->hash_umask |= word_space_umask; // we just free word_space
+            }
+        }
+        offset_iter1 = offset_iter2;
+    } 
+    
+    // apply optimization
+    // NOTE(review): offsets left after the last state are not copied into new_offsets
+    std::swap(offsets, new_offsets);
+
+    lines.push_back(this);
+}
+
+// for optimize-text == 3
+//
+// The aggressive optimization is not implemented yet (only the sketch below exists).
+// Until it is, fall back to the normal optimization: an empty body would never
+// append this line to `lines`, whereas optimize_normal() always does.
+void HTMLTextLine::optimize_aggressive(std::vector<HTMLTextLine*> & lines)
+{
+    /*
+    HTMLLineState original_line_state = line_state;
+    // break the line if there are a large (positive or negative) shift
+    // letter space / word space are not taken into consideration (yet)
+    while(true) 
+    {
+    }
+
+    // aggressive optimization
+    if(target > state_iter1->em_size() * (param.space_threshold) - EPS)
+        out << ' ';
+    dx = 0;
+    lines.push_back(this);
+    */
+    optimize_normal(lines);
+}
+
+// this state will be converted to a child node of the node of prev_state
+// dump the difference between previous state
+// also clone corresponding states
+//
+// Parameters:
+//   out        - stream receiving the markup (the notes below expect it to be
+//                in hex mode, since ids are written with operator<<)
+//   prev_state - state of the enclosing node, or nullptr when this is the
+//                first state of the line
+//
+// Side effects: ids[], hash_umask and font_size / letter_space / word_space may
+// be overwritten with values inherited from prev_state. need_close is always
+// set; end() consults it to decide whether "</span>" has to be written.
+void HTMLTextLine::State::begin (ostream & out, const State * prev_state)
+{
+    if(prev_state)
+    {
+        // cur_mask selects byte i of hash_umask; it is shifted along with i
+        long long cur_mask = 0xff;
+        // true until the opening `<span class="` has been written
+        bool first = true;
+        for(int i = 0; i < HASH_ID_COUNT; ++i, cur_mask<<=8)
+        {
+            if(hash_umask & cur_mask) // we don't care about this ID
+            {
+                if (prev_state->hash_umask & cur_mask) // if prev_state do not care about it either
+                    continue;
+
+                // otherwise
+                // we have to inherit it
+                ids[i] = prev_state->ids[i]; 
+                hash_umask &= (~cur_mask);
+                //copy the corresponding value
+                //TODO: this is so ugly
+                switch(i)
+                {
+                    case FONT_SIZE_ID:
+                        font_size = prev_state->font_size;
+                        break;
+                    case LETTER_SPACE_ID:
+                        letter_space = prev_state->letter_space;
+                        break;
+                    case WORD_SPACE_ID:
+                        word_space = prev_state->word_space;
+                        break;
+                    default:
+                        // any other id: only the id was inherited above, no value is copied
+                        cerr << "unexpected state mask" << endl;
+                        break;
+                }
+            }
+
+            // now we care about the ID
+            
+            // if the value from prev_state is the same, we don't need to dump it
+            if((!(prev_state->hash_umask & cur_mask)) && (prev_state->ids[i] == ids[i]))
+                continue;
+
+            // so we have to dump it
+            if(first)
+            { 
+                out << "<span class=\"";
+                first = false;
+            }
+            else
+            {
+                out << ' ';
+            }
+
+            // out should have hex set
+            out << css_class_names[i];
+            if (ids[i] == -1)
+                out << CSS::INVALID_ID;
+            else
+                out << ids[i];
+        }
+        // vertical align
+        // (not covered by the loop above: VERTICAL_ALIGN_ID == HASH_ID_COUNT;
+        //  it is dumped whenever the value is non-zero, without comparing to prev_state)
+        if(!equal(vertical_align, 0))
+        {
+            // so we have to dump it
+            if(first)
+            { 
+                out << "<span class=\"";
+                first = false;
+            }
+            else
+            {
+                out << ' ';
+            }
+
+            // out should have hex set
+            out << CSS::VERTICAL_ALIGN_CN;
+            auto id = ids[VERTICAL_ALIGN_ID];
+            if (id == -1)
+                out << CSS::INVALID_ID;
+            else
+                out << id;
+        }
+
+        if(first) // we actually just inherit the whole prev_state
+        {
+            // nothing was written, so there is no <span> for end() to close
+            need_close = false;
+        }
+        else
+        {
+            out << "\">";
+            need_close = true;
+        }
+    }
+    else
+    {
+        // prev_state == nullptr
+        // which means this is the first state of the line
+        // there should be a open pending <div> left there
+        // it is not necessary to output vertical align
+        long long cur_mask = 0xff;
+        for(int i = 0; i < HASH_ID_COUNT; ++i, cur_mask<<=8)
+        {
+            if(hash_umask & cur_mask) // we don't care about this ID
+                continue;
+
+            // now we care about the ID
+            // (the class name is appended to the class list of the pending <div>)
+            out << ' '; 
+            // out should have hex set
+            out << css_class_names[i];
+            if (ids[i] == -1)
+                out << CSS::INVALID_ID;
+            else
+                out << ids[i];
+        }
+
+        // terminate the attribute and the tag; no <span> was opened here
+        out << "\">";
+        need_close = false;
+    }
+}
+
+// Write the closing tag for this state, but only when begin() recorded
+// (through need_close) that it opened a <span>.
+void HTMLTextLine::State::end(ostream & out) const
+{
+    if(!need_close)
+        return;
+
+    out << "</span>";
+}
+
+// Pack the low 8 bits of every id into hash_value, one byte per id.
+//
+// Byte i of hash_value holds ids[i]. This is the layout used by hash_umask,
+// umask_by_id(i) and the cur_mask walk in begin()/diff(), so that diff() can
+// mask hash_value with ~(hash_umask | s.hash_umask) and compare only the ids
+// both states care about.
+// (The previous shift-and-or loop stored ids[0] in the highest used byte, so
+// the masks applied in diff() hid the wrong ids.)
+void HTMLTextLine::State::hash(void)
+{
+    hash_value = 0;
+    for(int i = 0; i < ID_COUNT; ++i)
+    {
+        // ids[i] & 0xff is in [0, 255] and ID_COUNT is 7, so the shift cannot overflow
+        hash_value |= ((ids[i] & 0xff) << (8 * i));
+    }
+}
+
+// Count the ids on which this state and s differ, looking only at ids that
+// neither state has marked as "don't care" in its hash_umask.
+int HTMLTextLine::State::diff(const State & s) const
+{
+    // bytes (ids) that both states actually use
+    const long long common_mask = ~(hash_umask | s.hash_umask);
+
+    /*
+     * Fast path based on hash_value.
+     * Only the low 8 bits of each id are hashed, so this can report "equal"
+     * wrongly once there are more than 256 classes; the output may then be
+     * sub-optimal, but it is still 'correct' in terms of HTML.
+     */
+    if(((hash_value ^ s.hash_value) & common_mask) == 0)
+        return 0;
+
+    // slow path: compare the ids one by one
+    int differing = 0;
+    for(int i = 0; i < ID_COUNT; ++i)
+    {
+        const long long id_mask = ((long long)0xff) << (8*i);
+        if((common_mask & id_mask) && (ids[i] != s.ids[i]))
+            ++differing;
+    }
+    return differing;
+}
+
+// Return the mask that selects byte `id` (bits 8*id .. 8*id+7) of a
+// hash_umask / hash_value word.
+long long HTMLTextLine::State::umask_by_id(int id)
+{
+    const long long byte_mask = 0xff;
+    return byte_mask << (id * 8);
+}
+
+// CSS class-name prefix for each id, indexed by the State id enum
+// (FONT_ID ... WORD_SPACE_ID, then VERTICAL_ALIGN_ID).
+// the order should be the same as in the enum
+// begin() writes css_class_names[i] immediately followed by ids[i].
+const char * const HTMLTextLine::State::css_class_names [] = {
+    CSS::FONT_FAMILY_CN,
+    CSS::FONT_SIZE_CN,
+    CSS::FILL_COLOR_CN,
+    CSS::STROKE_COLOR_CN,
+    CSS::LETTER_SPACE_CN,
+    CSS::WORD_SPACE_CN,
+    CSS::VERTICAL_ALIGN_CN,
+};
+
+} //namespace pdf2htmlEX
diff --git a/src/HTMLTextLine.h b/src/HTMLTextLine.h
new file mode 100644
index 0000000..fcce811
--- /dev/null
+++ b/src/HTMLTextLine.h
@@ -0,0 +1,134 @@
+/*
+ * Header file for HTMLTextLine
+ * Copyright (C) 2013 Lu Wang <coolwanglu@gmail.com>
+ */
+#ifndef HTMLTEXTLINE_H__
+#define HTMLTEXTLINE_H__
+
+#include <ostream>
+#include <vector>
+
+#include <CharTypes.h>
+
+#include "Param.h"
+#include "StateManager.h"
+#include "HTMLState.h"
+
+namespace pdf2htmlEX {
+
+/*
+ * Store and optimize a line of text in HTML
+ *
+ * contains a series of 
+ *  - Text
+ *  - Shift
+ *  - State change
+ *
+ * The object keeps references to the Param and AllStateManager passed to the
+ * constructor, so both must outlive the line.
+ */
+class HTMLTextLine
+{
+public:
+    HTMLTextLine (const HTMLLineState & line_state, const Param & param, AllStateManager & all_manager);
+
+    // A text state plus the bookkeeping needed to emit it as CSS classes
+    struct State : public HTMLTextState {
+        // before output
+        // (prev_state == nullptr means this is the first state of the line)
+        void begin(std::ostream & out, const State * prev_state);
+        // after output
+        void end(std::ostream & out) const;
+        // calculate the hash code
+        void hash(void);
+        // calculate the difference between another State
+        // (number of differing ids among those used by both states)
+        int diff(const State & s) const;
+
+        // Indices into ids[] and css_class_names[].
+        // The first HASH_ID_COUNT entries are the ones begin() walks together
+        // with hash_umask; VERTICAL_ALIGN_ID is handled separately there.
+        enum {
+            FONT_ID,
+            FONT_SIZE_ID,
+            FILL_COLOR_ID,
+            STROKE_COLOR_ID,
+            LETTER_SPACE_ID,
+            WORD_SPACE_ID,
+            HASH_ID_COUNT,
+
+            VERTICAL_ALIGN_ID = HASH_ID_COUNT,
+            ID_COUNT
+        };
+
+        // mask selecting the byte that belongs to `id` in hash_umask
+        static long long umask_by_id(int id);
+
+        // one id per entry of the enum above; -1 is written out as CSS::INVALID_ID
+        long long ids[ID_COUNT];
+
+        size_t start_idx; // index of the first Text using this state
+        // for optimization
+        long long hash_value;
+        long long hash_umask; // some states may not be actually used
+        bool need_close; // set by begin(): whether end() must write "</span>"
+
+        static const char * const css_class_names []; // class names for each id
+    };
+
+    // A horizontal shift placed between two chars of the line
+    struct Offset {
+        Offset(size_t size_idx, double width)
+            :start_idx(size_idx),width(width)
+        { }
+        size_t start_idx; // should put this Offset right before text[start_idx];
+        double width;
+    };
+
+    /**
+     * Append a drawn char (glyph)'s unicode. l > 1 mean this glyph correspond to
+     * multiple code points.
+     */
+    void append_unicodes(const Unicode * u, int l, double width);
+    /**
+     * Append a special padding char with 0 width, in order to keep char index consistent.
+     * The padding char is ignored during output.
+     */
+    void append_padding_char() { text.push_back(0); }
+    void append_offset(double width);
+    void append_state(const HTMLTextState & text_state);
+    void dump_text(std::ostream & out);
+
+    bool text_empty(void) const { return text.empty(); }
+    void clear(void);
+
+    void clip(const HTMLClipState &);
+
+    /*
+     * Optimize and calculate necessary values
+     */
+    void prepare(void);
+    // Results are appended to the vector passed in
+    // (HTMLTextPage::dump_text() collects the optimized lines this way).
+    void optimize(std::vector<HTMLTextLine*> &);
+private:
+    void optimize_normal(std::vector<HTMLTextLine*> &);
+    void optimize_aggressive(std::vector<HTMLTextLine*> &);
+
+    /**
+     * Dump chars' unicode to output stream.
+     * begin/pos is the index in 'text'.
+     */
+    void dump_chars(std::ostream & out, int begin, int len);
+    void dump_char(std::ostream & out, int pos);
+
+    // stored by reference, not owned
+    const Param & param;
+    AllStateManager & all_manager;
+
+    HTMLLineState line_state;
+    double ascent, descent;
+    double clip_x1, clip_y1;
+    double width;
+
+    std::vector<State> states;
+    std::vector<Offset> offsets;
+
+    /**
+     * Drawn chars (glyph) in this line are stored in 'text'. For each element c in 'text':
+     * - If c > 0, it is the unicode code point corresponds to the glyph;
+     * - If c == 0, it is a padding char, and ignored during output (TODO some bad PDFs utilize 0?);
+     * - If c < 0, this glyph corresponds to more than one unicode code points,
+     *   which are stored in 'decomposed_text', and (-c-1) is the index in 'decomposed_text'
+     *   (so c == -1 refers to decomposed_text[0]).
+     */
+    std::vector<int> text;
+    std::vector<std::vector<Unicode> > decomposed_text;
+};
+
+} // namespace pdf2htmlEX
+#endif //HTMLTEXTLINE_H__
diff --git a/src/HTMLTextPage.cc b/src/HTMLTextPage.cc
new file mode 100644
index 0000000..a8e2ab8
--- /dev/null
+++ b/src/HTMLTextPage.cc
@@ -0,0 +1,147 @@
+/*
+ * HTMLTextPage.cc
+ *
+ * Generate and optimize HTML for one page
+ *
+ * Copyright (C) 2013 Lu Wang <coolwanglu@gmail.com>
+ */
+
+#include "HTMLTextPage.h"
+#include "util/css_const.h"
+
+namespace pdf2htmlEX {
+
+using std::ostream;
+
+// Build an empty page: no current line and a page size of 0 x 0.
+// param and all_manager are stored by reference.
+HTMLTextPage::HTMLTextPage(const Param & param, AllStateManager & all_manager)
+    : param(param),
+      all_manager(all_manager),
+      cur_line(nullptr),
+      page_width(0),
+      page_height(0)
+{
+}
+
+// Destroy every line still referenced from text_lines.
+HTMLTextPage::~HTMLTextPage()
+{
+    for(auto iter = text_lines.begin(); iter != text_lines.end(); ++iter)
+        delete *iter;
+}
+
+// Write all text lines of the page to `out`.
+//
+// Steps:
+//  1. when param.optimize_text is non-zero, let every line optimize itself
+//     and replace text_lines with the lines collected that way
+//  2. prepare() every line, then run the page-level optimize() if enabled
+//  3. walk the recorded clips; the lines between two consecutive clip entries
+//     are dumped together, wrapped in a clipping <div> unless the clip in
+//     effect is the whole page
+//
+// Note that a sentinel clip entry is appended to `clips` on every call.
+void HTMLTextPage::dump_text(ostream & out)
+{
+    if(param.optimize_text)
+    {
+        // text lines may be split during optimization, collect them
+        std::vector<HTMLTextLine*> optimized_lines;
+        for(auto iter = text_lines.begin(); iter != text_lines.end(); ++iter)
+            (*iter)->optimize(optimized_lines);
+        text_lines.swap(optimized_lines);
+    }
+    for(auto iter = text_lines.begin(); iter != text_lines.end(); ++iter)
+        (*iter)->prepare();
+    if(param.optimize_text)
+        optimize();
+
+    // the clip box that covers the whole page
+    HTMLClipState whole_page;
+    whole_page.ymin = 0;
+    whole_page.xmin = 0;
+    whole_page.xmax = page_width;
+    whole_page.ymax = page_height;
+
+    // sentinel entry, so that the lines after the last real clip are flushed too
+    clips.emplace_back(whole_page, text_lines.size());
+
+    Clip active_clip(whole_page, 0);
+    bool clip_active = false;
+
+    auto line_iter = text_lines.begin();
+    for(const Clip & next_clip : clips)
+    {
+        // lines in [line_iter, segment_end) are governed by active_clip
+        const auto segment_end = text_lines.begin() + next_clip.start_idx;
+        if(line_iter != segment_end)
+        {
+            const auto & cs = active_clip.clip_state;
+            if(clip_active)
+            {
+                out << "<div class=\"" << CSS::CLIP_CN
+                    << " " << CSS::LEFT_CN   << all_manager.left.install(cs.xmin)
+                    << " " << CSS::BOTTOM_CN << all_manager.bottom.install(cs.ymin)
+                    << " " << CSS::WIDTH_CN  << all_manager.width.install(cs.xmax - cs.xmin)
+                    << " " << CSS::HEIGHT_CN << all_manager.height.install(cs.ymax - cs.ymin)
+                    << "\">";
+            }
+
+            for(; line_iter != segment_end; ++line_iter)
+            {
+                if(clip_active)
+                    (*line_iter)->clip(cs);
+                (*line_iter)->dump_text(out);
+            }
+
+            if(clip_active)
+                out << "</div>";
+        }
+
+        // switch to the next clip; a clip equal to the page box needs no <div>
+        active_clip = next_clip;
+        const auto & next_cs = active_clip.clip_state;
+        clip_active = (!equal(0, next_cs.xmin)) || (!equal(0, next_cs.ymin))
+                || (!equal(page_width, next_cs.xmax)) || (!equal(page_height, next_cs.ymax));
+    }
+}
+
+// Not implemented yet (see TODO): nothing is written to `out`.
+void HTMLTextPage::dump_css(ostream & out)
+{
+    //TODO
+}
+
+// Reset the page so that it can be reused: destroy all lines, forget all clips
+// and the current line.
+//
+// text_lines holds owning raw pointers (created in open_new_line(), deleted in
+// the destructor), so the lines have to be deleted here as well; merely
+// clearing the vector would leak every line of the page.
+void HTMLTextPage::clear(void)
+{
+    for(auto p : text_lines)
+        delete p;
+    text_lines.clear();
+    clips.clear();
+    // cur_line pointed into text_lines and is dangling now
+    cur_line = nullptr;
+}
+
+// Append a new, empty line to the page and make it the current line.
+void HTMLTextPage::open_new_line(const HTMLLineState & line_state)
+{
+    // A new line is always created, even if the last one is still empty:
+    // the recorded clips may already refer to the index of the next line.
+    HTMLTextLine * fresh_line = new HTMLTextLine(line_state, param, all_manager);
+    text_lines.emplace_back(fresh_line);
+    cur_line = fresh_line;
+}
+
+// Record the page dimensions; dump_text() uses them as the "no clip" box.
+void HTMLTextPage::set_page_size(double width, double height)
+{
+    this->page_height = height;
+    this->page_width = width;
+}
+
+// Record that clip_state applies to all lines opened from now on.
+void HTMLTextPage::clip(const HTMLClipState & clip_state)
+{
+    const size_t next_line_idx = text_lines.size();
+
+    if(clips.empty() || (clips.back().start_idx != next_line_idx))
+    {
+        clips.emplace_back(clip_state, next_line_idx);
+        return;
+    }
+
+    /*
+     * No line has been opened since the previous clip was recorded,
+     * so that clip is unused: overwrite it instead of adding another entry
+     */
+    clips.back().clip_state = clip_state;
+}
+
+// Page-level optimization hook: dump_text() calls it after every line has been
+// prepared, when param.optimize_text is non-zero. Not implemented yet.
+void HTMLTextPage::optimize(void)
+{
+    //TODO
+    //group lines with same x-axis
+    //collect common states 
+}
+
+} // namespace pdf2htmlEX
diff --git a/src/HTMLTextPage.h b/src/HTMLTextPage.h
new file mode 100644
index 0000000..ccaa564
--- /dev/null
+++ b/src/HTMLTextPage.h
@@ -0,0 +1,66 @@
+/*
+ * Header file for HTMLTextPage
+ * Copyright (C) 2013 Lu Wang <coolwanglu@gmail.com>
+ */
+
+#ifndef HTMLTEXTPAGE_H__
+#define HTMLTEXTPAGE_H__
+
+#include <vector>
+#include <ostream>
+
+#include "Param.h"
+#include "StateManager.h"
+#include "HTMLTextLine.h"
+#include "HTMLState.h"
+
+namespace pdf2htmlEX {
+
+/*
+ * Store and optimize a page of text in HTML
+ *
+ * contains a series of HTMLTextLine
+ *
+ * The page owns its lines: they are allocated in open_new_line() and deleted
+ * in the destructor. param and all_manager are kept by reference and must
+ * outlive the page.
+ */
+class HTMLTextPage
+{
+public:
+    HTMLTextPage (const Param & param, AllStateManager & all_manager);
+    ~HTMLTextPage();
+
+    // The page holds owning raw pointers, so a copy would delete every line
+    // twice. Copying is therefore disabled.
+    HTMLTextPage (const HTMLTextPage &) = delete;
+    HTMLTextPage & operator = (const HTMLTextPage &) = delete;
+
+    // the line opened most recently, or nullptr if there is none
+    HTMLTextLine * get_cur_line(void) const { return cur_line; }
+
+    void dump_text(std::ostream & out);
+    void dump_css(std::ostream & out);
+    void clear(void);
+
+    void open_new_line(const HTMLLineState & line_state);
+    
+    /* for clipping */
+    void set_page_size(double width, double height);
+    // clip_state applies to the lines opened after this call
+    void clip(const HTMLClipState & clip_state);
+
+    double get_width() const { return page_width; }
+    double get_height() const { return page_height; }
+
+private:
+    void optimize(void);
+
+    const Param & param;
+    AllStateManager & all_manager;
+    HTMLTextLine * cur_line; // points into text_lines, not owned separately
+    double page_width, page_height;
+
+    std::vector<HTMLTextLine*> text_lines; // owning pointers
+
+    // A clip box together with the index of the first line it applies to
+    struct Clip {
+        HTMLClipState clip_state;
+        size_t start_idx;
+        Clip(const HTMLClipState & clip_state, size_t start_idx)
+            :clip_state(clip_state),start_idx(start_idx)
+        { }
+    };
+    std::vector<Clip> clips;
+};
+
+} //namespace pdf2htmlEX 
+#endif //HTMLTEXTPAGE_H__
diff --git a/src/Param.h b/src/Param.h
new file mode 100644
index 0000000..84fa426
--- /dev/null
+++ b/src/Param.h
@@ -0,0 +1,87 @@
+/*
+ * Parameters
+ *
+ * Wang Lu
+ * 2012.08.03
+ */
+
+
+#ifndef PARAM_H__
+#define PARAM_H__
+
+#include <string>
+
+namespace pdf2htmlEX {
+
+// Plain aggregate holding all run-time parameters of the converter.
+// NOTE(review): most int members look like on/off switches; the valid range
+// of each one is not visible here -- confirm against the argument parser.
+struct Param
+{
+    // pages
+    // (Preprocessor::process() iterates first_page..last_page inclusively)
+    int first_page, last_page;
+
+    // dimensions
+    double zoom;
+    double fit_width, fit_height;
+    int use_cropbox; // negated and passed on to PDFDoc::displayPage() by the Preprocessor
+    double h_dpi, v_dpi;
+
+    // output
+    int embed_css;
+    int embed_font;
+    int embed_image;
+    int embed_javascript;
+    int embed_outline;
+    int split_pages;
+    std::string dest_dir;
+    std::string css_filename;
+    std::string page_filename;
+    std::string outline_filename;
+    int process_nontext;
+    int process_outline;
+    int process_annotation;
+    int process_form;
+    int correct_text_visibility;
+    int printing;
+    int fallback;
+    int tmp_file_size_limit;
+
+    // fonts
+    int embed_external_font;
+    std::string font_format;
+    int decompose_ligature;
+    int auto_hint;
+    std::string external_hint_tool;
+    int stretch_narrow_glyph;
+    int squeeze_wide_glyph;
+    int override_fstype;
+    int process_type3;
+
+    // text
+    double h_eps, v_eps;
+    double space_threshold; // multiplied by the em size to get the minimum gap treated as a space
+    double font_size_multiplier;
+    int space_as_offset;
+    int tounicode;
+    int optimize_text; // 0 disables text optimization in HTMLTextPage::dump_text()
+
+    // background image
+    std::string bg_format;
+    int svg_node_count_limit;
+    int svg_embed_bitmap;
+
+    // encryption
+    std::string owner_password, user_password;
+    int no_drm;
+
+    // misc.
+    int clean_tmp;
+    std::string data_dir;
+    std::string tmp_dir;
+    int debug;
+    int proof;
+
+    std::string input_filename, output_filename;
+};
+
+} // namespace pdf2htmlEX
+
+#endif //PARAM_H__
diff --git a/src/Preprocessor.cc b/src/Preprocessor.cc
new file mode 100644
index 0000000..a8859ad
--- /dev/null
+++ b/src/Preprocessor.cc
@@ -0,0 +1,107 @@
+/*
+ * Preprocessor.cc
+ *
+ * Check used codes for each font
+ *
+ * by WangLu
+ * 2012.09.07
+ */
+
+#include <cstring>
+#include <iostream>
+#include <algorithm>
+
+#include <GfxState.h>
+#include <GfxFont.h>
+
+#include "Preprocessor.h"
+#include "util/misc.h"
+#include "util/const.h"
+
+namespace pdf2htmlEX {
+
+using std::cerr;
+using std::endl;
+using std::flush;
+using std::max;
+
+// Start with no page seen (maximum size 0 x 0) and no font selected.
+// param is stored by reference.
+Preprocessor::Preprocessor(const Param & param)
+    : OutputDev(),
+      param(param),
+      max_width(0),
+      max_height(0),
+      cur_font_id(0),
+      cur_code_map(nullptr)
+{
+}
+
+// Free the per-font code tables allocated in drawChar().
+Preprocessor::~Preprocessor(void)
+{
+    for(auto iter = code_maps.begin(); iter != code_maps.end(); ++iter)
+        delete [] iter->second;
+}
+
+// Render pages param.first_page..param.last_page (inclusive) of doc through
+// this device, so that drawChar()/startPage() can collect the used char codes
+// and the largest page size. Progress is reported on stderr.
+void Preprocessor::process(PDFDoc * doc)
+{
+    const int first = param.first_page;
+    const int last = param.last_page;
+    const int page_count = (last - first + 1);
+
+    for(int page_no = first; page_no <= last; ++page_no)
+    {
+        // '\r' keeps the progress on a single terminal line
+        cerr << "Preprocessing: " << (page_no - first) << "/" << page_count << '\r' << flush;
+
+        doc->displayPage(this, page_no, DEFAULT_DPI, DEFAULT_DPI,
+                0, 
+                (!(param.use_cropbox)),
+                true,  // crop
+                false, // printing
+                nullptr, nullptr, nullptr, nullptr);
+    }
+
+    // final "n/n" line, skipped for a negative page count
+    if(page_count >= 0)
+        cerr << "Preprocessing: " << page_count << "/" << page_count;
+    cerr << endl;
+}
+
+// OutputDev callback for every drawn char: mark `code` as used in the table
+// of the font currently selected in `state`.
+//
+// One table is kept per font id: 0x10000 entries for CID fonts, 0x100 entries
+// otherwise. Only state and code are used, the other arguments are ignored.
+// Codes that do not fit into the table are skipped instead of being written
+// past the end of the allocation.
+void Preprocessor::drawChar(GfxState *state, double x, double y,
+      double dx, double dy,
+      double originX, double originY,
+      CharCode code, int nBytes, Unicode *u, int uLen)
+{
+    GfxFont * font = state->getFont();
+    if(!font) return;
+
+    long long fn_id = hash_ref(font->getID());
+    // number of entries in the table of this font
+    const CharCode len = font->isCIDFont() ? 0x10000 : 0x100;
+
+    // also (re)select when no table has been selected yet: a font whose id
+    // equals the initial cur_font_id would otherwise leave cur_code_map null
+    if((fn_id != cur_font_id) || (cur_code_map == nullptr))
+    {
+        cur_font_id = fn_id;
+        auto p = code_maps.insert(std::make_pair(cur_font_id, (char*)nullptr));
+        if(p.second)
+        {
+            // this is a new font
+            p.first->second = new char [len];
+            memset(p.first->second, 0, len * sizeof(char));
+        }
+
+        cur_code_map = p.first->second;
+    }
+
+    // CharCode may exceed the table size, never index out of bounds
+    if(code < len)
+        cur_code_map[code] = 1;
+}
+
+// Two-argument overload: forwards to the three-argument overload with a null XRef.
+void Preprocessor::startPage(int pageNum, GfxState *state)
+{
+    XRef * const no_xref = nullptr;
+    startPage(pageNum, state, no_xref);
+}
+
+// Keep track of the largest page width and height seen so far.
+// pageNum and xref are not used.
+void Preprocessor::startPage(int pageNum, GfxState *state, XRef * xref)
+{
+    const double page_w = state->getPageWidth();
+    const double page_h = state->getPageHeight();
+
+    if(max_width < page_w)
+        max_width = page_w;
+    if(max_height < page_h)
+        max_height = page_h;
+}
+
+// Return the table of used char codes collected for font_id,
+// or nullptr if no char of that font has been seen.
+const char * Preprocessor::get_code_map (long long font_id) const
+{
+    const auto found = code_maps.find(font_id);
+    if(found == code_maps.end())
+        return nullptr;
+    return found->second;
+}
+
+} // namespace pdf2htmlEX
diff --git a/src/Preprocessor.h b/src/Preprocessor.h
new file mode 100644
index 0000000..5b48e4f
--- /dev/null
+++ b/src/Preprocessor.h
@@ -0,0 +1,66 @@
+/*
+ * Preprocessor.h
+ *
+ * PDF is so complicated that we have to scan twice
+ *
+ * Check used codes for each font
+ * Collect all used link destinations
+ *
+ * by WangLu
+ * 2012.09.07
+ */
+
+
+#ifndef PREPROCESSOR_H__
+#define PREPROCESSOR_H__
+
+#include <unordered_map>
+
+#include <OutputDev.h>
+#include <PDFDoc.h>
+#include <Annot.h>
+#include "Param.h"
+
+namespace pdf2htmlEX {
+
+// Output device used for a first pass over the document: it records which
+// char codes are used with each font and the largest page size encountered.
+// NOTE(review): the object owns the arrays stored in code_maps (freed in the
+// destructor) but declares no copy operations -- confirm it is never copied.
+class Preprocessor : public OutputDev {
+public:
+    // param is stored by reference and must outlive the object
+    Preprocessor(const Param & param);
+    virtual ~Preprocessor(void);
+
+    // run the pass over pages param.first_page..param.last_page of doc
+    void process(PDFDoc * doc);
+
+    // Capability flags of this device
+    virtual GBool upsideDown() { return gFalse; }
+    virtual GBool useDrawChar() { return gTrue; }
+    virtual GBool interpretType3Chars() { return gFalse; }
+    virtual GBool needNonText() { return gFalse; }
+    virtual GBool needClipToCropBox() { return gTrue; }
+
+    // marks `code` as used for the current font of `state`
+    virtual void drawChar(GfxState *state, double x, double y,
+      double dx, double dy,
+      double originX, double originY,
+      CharCode code, int nBytes, Unicode *u, int uLen);
+
+    // Start a page.
+    // UGLY: These 2 versions are for different versions of poppler
+    virtual void startPage(int pageNum, GfxState *state);
+    virtual void startPage(int pageNum, GfxState *state, XRef * xref);
+
+    // table of used codes for font_id (non-zero entry == used), nullptr if unknown
+    const char * get_code_map (long long font_id) const;
+    double get_max_width (void) const { return max_width; }
+    double get_max_height (void) const { return max_height; }
+
+protected:
+    const Param & param;
+
+    // largest page dimensions seen by startPage()
+    double max_width, max_height;
+
+    // font most recently seen by drawChar() and its table (cache for code_maps)
+    long long cur_font_id;
+    char * cur_code_map;
+
+    // font id -> table allocated with new[]
+    std::unordered_map<long long, char*> code_maps;
+};
+
+} // namespace pdf2htmlEX
+
+#endif //PREPROCESSOR_H__
diff --git a/src/StateManager.h b/src/StateManager.h
new file mode 100644
index 0000000..0a19df0
--- /dev/null
+++ b/src/StateManager.h
@@ -0,0 +1,430 @@
+/*
+ * StateManager.h
+ *
+ * manage reusable CSS classes
+ *
+ * Copyright (C) 2013 Lu Wang <coolwanglu@gmail.com>
+ */
+
+#ifndef STATEMANAGER_H__
+#define STATEMANAGER_H__
+
+#include <iostream>
+#include <map>
+#include <unordered_map>
+
+#include "Color.h"
+
+#include "util/math.h"
+#include "util/css_const.h"
+
+namespace pdf2htmlEX {
+
+// Primary template: intentionally empty, only the specializations are usable.
+template<class ValueType, class Imp> class StateManager {};
+
+/*
+ * Manager for double-valued properties.
+ *
+ * Every distinct value (up to eps) gets a numeric id. Imp is the derived
+ * class (CRTP); it provides get_css_class_name(), dump_value() and
+ * dump_print_value().
+ */
+template<class Imp>
+class StateManager<double, Imp>
+{
+public:
+    StateManager()
+        : eps(0)
+        , imp(static_cast<Imp*>(this))
+    { }
+
+    // values no farther than eps are treated as equal
+    void set_eps (double eps) { this->eps = eps; }
+
+    double get_eps (void) const { return eps; }
+
+    // Look new_value up in the map, inserting it if no stored value is close
+    // enough. Returns the id of the value; when actual_value_ptr is given,
+    // the value that is really stored in the map is written to it.
+    long long install(double new_value, double * actual_value_ptr = nullptr) {
+        // first stored value that is not below new_value - eps
+        auto pos = value_map.lower_bound(new_value - eps);
+        const bool reuse = (pos != value_map.end()) && (std::abs(pos->first - new_value) <= eps);
+
+        long long id;
+        if(reuse)
+        {
+            id = pos->second;
+        }
+        else
+        {
+            // ids are handed out in insertion order
+            id = value_map.size();
+            pos = value_map.insert(pos, std::make_pair(new_value, id));
+        }
+
+        if(actual_value_ptr != nullptr)
+            *actual_value_ptr = pos->first;
+        return id;
+    }
+
+    // one CSS rule per stored value, in ascending order of the value
+    void dump_css(std::ostream & out) {
+        for(auto iter = value_map.begin(); iter != value_map.end(); ++iter)
+        {
+            out << "." << imp->get_css_class_name() << iter->second << "{";
+            imp->dump_value(out, iter->first);
+            out << "}" << std::endl;
+        }
+    }
+
+    // same as dump_css(), but through Imp::dump_print_value() with a scale
+    void dump_print_css(std::ostream & out, double scale) {
+        for(auto iter = value_map.begin(); iter != value_map.end(); ++iter)
+        {
+            out << "." << imp->get_css_class_name() << iter->second << "{";
+            imp->dump_print_value(out, iter->first, scale);
+            out << "}" << std::endl;
+        }
+    }
+
+protected:
+    double eps;
+    Imp * imp; // this object, seen as the derived class
+    std::map<double, long long> value_map; // value -> id
+};
+
+// Manager for transform matrices.
+//
+// Callers pass a plain const double *, whose content may change later, so the
+// values are copied into a Matrix. The map is keyed by Matrix (not by
+// pointer), which also releases the storage automatically on destruction.
+template <class Imp>
+class StateManager<Matrix, Imp>
+{
+public:
+    StateManager()
+        : imp(static_cast<Imp*>(this))
+    { }
+
+    // return id
+    long long install(const double * new_value) {
+        Matrix key;
+        memcpy(key.m, new_value, sizeof(key.m));
+
+        auto pos = value_map.lower_bound(key);
+        if((pos == value_map.end()) || (!tm_equal(key.m, pos->first.m, 4)))
+        {
+            // not known yet: the next free id is the current number of entries
+            const long long new_id = value_map.size();
+            value_map.insert(pos, std::make_pair(key, new_id));
+            return new_id;
+        }
+        return pos->second;
+    }
+
+    // one CSS rule per stored matrix
+    void dump_css(std::ostream & out) {
+        for(auto iter = value_map.begin(); iter != value_map.end(); ++iter)
+        {
+            out << "." << imp->get_css_class_name() << iter->second << "{";
+            imp->dump_value(out, iter->first);
+            out << "}" << std::endl;
+        }
+    }
+
+    // matrices have no print-specific CSS
+    void dump_print_css(std::ostream & out, double scale) {}
+
+protected:
+    Imp * imp;
+
+    // Lexicographic order on the first 4 elements of the matrix
+    struct Matrix_less
+    {
+        bool operator () (const Matrix & m1, const Matrix & m2) const
+        {
+            // Note that we only care about the first 4 elements
+            for(int i = 0; i < 4; ++i)
+            {
+                const auto & lhs = m1.m[i];
+                const auto & rhs = m2.m[i];
+                if(lhs < rhs)
+                    return true;
+                if(rhs < lhs)
+                    return false;
+            }
+            return false;
+        }
+    };
+
+    std::map<Matrix, long long, Matrix_less> value_map;
+};
+
+// Manager for colors: every distinct Color gets a numeric id.
+template <class Imp>
+class StateManager<Color, Imp>
+{
+public:
+    StateManager()
+        : imp(static_cast<Imp*>(this))
+    { }
+
+    // return the id of new_value, registering the color first if necessary
+    long long install(const Color & new_value) { 
+        const auto found = value_map.find(new_value);
+        if(found == value_map.end())
+        {
+            const long long new_id = value_map.size();
+            value_map.insert(std::make_pair(new_value, new_id));
+            return new_id;
+        }
+        return found->second;
+    }
+
+    // a rule for CSS::INVALID_ID (transparent) first, then one rule per color
+    void dump_css(std::ostream & out) {
+        out << "." << imp->get_css_class_name() << CSS::INVALID_ID << "{";
+        imp->dump_transparent(out);
+        out << "}" << std::endl;
+
+        for(auto iter = value_map.begin(); iter != value_map.end(); ++iter)
+        {
+            out << "." << imp->get_css_class_name() << iter->second << "{";
+            imp->dump_value(out, iter->first);
+            out << "}" << std::endl;
+        }
+    }
+
+    // colors have no print-specific CSS
+    void dump_print_css(std::ostream & out, double scale) {}
+
+protected:
+    Imp * imp;
+
+    // Hash: all bits set for a transparent color, otherwise the three
+    // channels converted to bytes and packed as 0xRRGGBB
+    struct Color_hash 
+    {
+        size_t operator () (const Color & color) const
+        {
+            if(color.transparent)
+                return (~((size_t)0));
+
+            const size_t r = ((size_t)colToByte(color.rgb.r)) & 0xff;
+            const size_t g = ((size_t)colToByte(color.rgb.g)) & 0xff;
+            const size_t b = ((size_t)colToByte(color.rgb.b)) & 0xff;
+            return ((r << 16) | (g << 8) | b);
+        }
+    };
+
+    std::unordered_map<Color, long long, Color_hash> value_map;
+};
+
+/////////////////////////////////////
+// Specific state managers
+
+// CSS classes for font sizes (class prefix CSS::FONT_SIZE_CN)
+class FontSizeManager : public StateManager<double, FontSizeManager>
+{
+public:
+    static const char * get_css_class_name (void)
+    {
+        return CSS::FONT_SIZE_CN;
+    }
+    double default_value(void)
+    {
+        return 0;
+    }
+    // screen CSS, in px
+    void dump_value(std::ostream & out, double value)
+    {
+        out << "font-size:" << round(value) << "px;";
+    }
+    // print CSS, scaled and in pt
+    void dump_print_value(std::ostream & out, double value, double scale)
+    {
+        out << "font-size:" << round(value*scale) << "pt;";
+    }
+};
+
+// CSS classes for letter spacing (class prefix CSS::LETTER_SPACE_CN)
+class LetterSpaceManager : public StateManager<double,  LetterSpaceManager>
+{
+public:
+    static const char * get_css_class_name (void)
+    {
+        return CSS::LETTER_SPACE_CN;
+    }
+    double default_value(void)
+    {
+        return 0;
+    }
+    // screen CSS, in px
+    void dump_value(std::ostream & out, double value)
+    {
+        out << "letter-spacing:" << round(value) << "px;";
+    }
+    // print CSS, scaled and in pt
+    void dump_print_value(std::ostream & out, double value, double scale)
+    {
+        out << "letter-spacing:" << round(value*scale) << "pt;";
+    }
+};
+
+// CSS classes for word spacing (class prefix CSS::WORD_SPACE_CN)
+class WordSpaceManager : public StateManager<double, WordSpaceManager>
+{
+public:
+    static const char * get_css_class_name (void)
+    {
+        return CSS::WORD_SPACE_CN;
+    }
+    double default_value(void)
+    {
+        return 0;
+    }
+    // screen CSS, in px
+    void dump_value(std::ostream & out, double value)
+    {
+        out << "word-spacing:" << round(value) << "px;";
+    }
+    // print CSS, scaled and in pt
+    void dump_print_value(std::ostream & out, double value, double scale)
+    {
+        out << "word-spacing:" << round(value*scale) << "pt;";
+    }
+};
+
+// CSS classes for vertical alignment (class prefix CSS::VERTICAL_ALIGN_CN)
+class VerticalAlignManager : public StateManager<double, VerticalAlignManager>
+{
+public:
+    static const char * get_css_class_name (void)
+    {
+        return CSS::VERTICAL_ALIGN_CN;
+    }
+    double default_value(void)
+    {
+        return 0;
+    }
+    // screen CSS, in px
+    void dump_value(std::ostream & out, double value)
+    {
+        out << "vertical-align:" << round(value) << "px;";
+    }
+    // print CSS, scaled and in pt
+    void dump_print_value(std::ostream & out, double value, double scale)
+    {
+        out << "vertical-align:" << round(value*scale) << "pt;";
+    }
+};
+
+// CSS classes for horizontal gaps (class prefix CSS::WHITESPACE_CN).
+// A positive gap is written as a width, anything else as a margin-left.
+class WhitespaceManager : public StateManager<double, WhitespaceManager>
+{
+public:
+    static const char * get_css_class_name (void)
+    {
+        return CSS::WHITESPACE_CN;
+    }
+    double default_value(void)
+    {
+        return 0;
+    }
+    // screen CSS, in px
+    void dump_value(std::ostream & out, double value)
+    {
+        const char * property = (value > 0) ? "width:" : "margin-left:";
+        out << property << round(value) << "px;";
+    }
+    // print CSS, in pt; the property is chosen from the scaled value
+    void dump_print_value(std::ostream & out, double value, double scale) 
+    {
+        const double scaled = value * scale;
+        const char * property = (scaled > 0) ? "width:" : "margin-left:";
+        out << property << round(scaled) << "pt;";
+    }
+};
+
+// CSS classes for widths (class prefix CSS::WIDTH_CN)
+class WidthManager : public StateManager<double, WidthManager>
+{
+public:
+    static const char * get_css_class_name (void)
+    {
+        return CSS::WIDTH_CN;
+    }
+    double default_value(void)
+    {
+        return 0;
+    }
+    // screen CSS, in px
+    void dump_value(std::ostream & out, double value)
+    {
+        out << "width:" << round(value) << "px;";
+    }
+    // print CSS, scaled and in pt
+    void dump_print_value(std::ostream & out, double value, double scale)
+    {
+        out << "width:" << round(value*scale) << "pt;";
+    }
+};
+
+// CSS classes for bottom positions (class prefix CSS::BOTTOM_CN)
+class BottomManager : public StateManager<double, BottomManager>
+{
+public:
+    static const char * get_css_class_name (void)
+    {
+        return CSS::BOTTOM_CN;
+    }
+    double default_value(void)
+    {
+        return 0;
+    }
+    // screen CSS, in px
+    void dump_value(std::ostream & out, double value)
+    {
+        out << "bottom:" << round(value) << "px;";
+    }
+    // print CSS, scaled and in pt
+    void dump_print_value(std::ostream & out, double value, double scale)
+    {
+        out << "bottom:" << round(value*scale) << "pt;";
+    }
+};
+
+// CSS classes for heights (class prefix CSS::HEIGHT_CN)
+class HeightManager : public StateManager<double, HeightManager>
+{
+public:
+    static const char * get_css_class_name (void)
+    {
+        return CSS::HEIGHT_CN;
+    }
+    double default_value(void)
+    {
+        return 0;
+    }
+    // screen CSS, in px
+    void dump_value(std::ostream & out, double value)
+    {
+        out << "height:" << round(value) << "px;";
+    }
+    // print CSS, scaled and in pt
+    void dump_print_value(std::ostream & out, double value, double scale)
+    {
+        out << "height:" << round(value*scale) << "pt;";
+    }
+};
+
+// CSS classes for left positions (class prefix CSS::LEFT_CN)
+class LeftManager : public StateManager<double, LeftManager>
+{
+public:
+    static const char * get_css_class_name (void)
+    {
+        return CSS::LEFT_CN;
+    }
+    double default_value(void)
+    {
+        return 0;
+    }
+    // screen CSS, in px
+    void dump_value(std::ostream & out, double value)
+    {
+        out << "left:" << round(value) << "px;";
+    }
+    // print CSS, scaled and in pt
+    void dump_print_value(std::ostream & out, double value, double scale)
+    {
+        out << "left:" << round(value*scale) << "pt;";
+    }
+};
+
+// CSS classes for transform matrices (class prefix CSS::TRANSFORM_MATRIX_CN)
+class TransformMatrixManager : public StateManager<Matrix, TransformMatrixManager>
+{
+public:
+    static const char * get_css_class_name (void)
+    {
+        return CSS::TRANSFORM_MATRIX_CN;
+    }
+    const double * default_value(void)
+    {
+        return ID_MATRIX;
+    }
+    // Write the transform declaration three times: unprefixed, -ms- and -webkit-
+    void dump_value(std::ostream & out, const Matrix & matrix) { 
+        // tm[4] and tm[5] are always ignored,
+        // because the origin has already been shifted
+        // TODO: recognize common matrices
+        const auto & m = matrix.m;
+        const bool is_identity = tm_equal(m, ID_MATRIX, 4);
+
+        for(const char * prefix : {"", "-ms-", "-webkit-"})
+        {
+            out << prefix;
+            if(is_identity)
+            {
+                out << "transform:none;";
+                continue;
+            }
+
+            // PDF use a different coordinate system from Web,
+            // hence the negated off-diagonal elements
+            out << "transform:matrix("
+                << round(m[0]) << ','
+                << round(-m[1]) << ','
+                << round(-m[2]) << ','
+                << round(m[3]) << ',';
+            out << "0,0);";
+        }
+    }
+};
+
+/*
+ * State manager for the text fill color.
+ * Passes itself to StateManager as the second template argument (CRTP).
+ */
+class FillColorManager : public StateManager<Color, FillColorManager>
+{
+public:
+    // CSS class name associated with this property
+    static const char * get_css_class_name (void)
+    {
+        return CSS::FILL_COLOR_CN;
+    }
+
+    /*
+     * override base's method, as we need some workaround in CSS
+     *
+     * Writes one rule per entry of value_map:
+     *   .<class name><mapped id>{color:<color>;}
+     */
+    void dump_css(std::ostream & out)
+    {
+        for(auto it = value_map.begin(); it != value_map.end(); ++it)
+        {
+            const auto & color = it->first;
+            const auto & id = it->second;
+            out << "." << get_css_class_name() << id;
+            out << "{color:" << color << ";}" << std::endl;
+        }
+    }
+};
+
+/*
+ * State manager for the text stroke color.
+ * Passes itself to StateManager as the second template argument (CRTP).
+ */
+class StrokeColorManager : public StateManager<Color, StrokeColorManager>
+{
+public:
+    // CSS class name associated with this property
+    static const char * get_css_class_name (void)
+    {
+        return CSS::STROKE_COLOR_CN;
+    }
+
+    /*
+     * override base's method, as we need some workaround in CSS
+     *
+     * Two groups of rules are written:
+     *  - plain rules, where the stroke is imitated with four text shadows
+     *  - rules wrapped in the CSS::WEBKIT_ONLY media query, which use
+     *    -webkit-text-stroke and switch the shadows off again
+     * Each group starts with a rule for the CSS::INVALID_ID class that disables the effect.
+     */
+    void dump_css(std::ostream & out)
+    {
+        // normal CSS
+        out << "." << get_css_class_name() << CSS::INVALID_ID;
+        out << "{text-shadow:none;}" << std::endl;
+        for(auto it = value_map.begin(); it != value_map.end(); ++it)
+        {
+            const auto & color = it->first;
+            const auto & id = it->second;
+            // TODO: take the stroke width from the graphics state,
+            //       currently using 0.015em as a good default
+            out << "." << get_css_class_name() << id << "{text-shadow:";
+            out << "-0.015em 0 "  << color << ",";
+            out << "0 0.015em "   << color << ",";
+            out << "0.015em 0 "   << color << ",";
+            out << "0 -0.015em  " << color << ";";
+            out << "}" << std::endl;
+        }
+
+        // webkit
+        out << CSS::WEBKIT_ONLY << "{" << std::endl;
+        out << "." << get_css_class_name() << CSS::INVALID_ID;
+        out << "{-webkit-text-stroke:0px transparent;}" << std::endl;
+        for(auto it = value_map.begin(); it != value_map.end(); ++it)
+        {
+            const auto & color = it->first;
+            const auto & id = it->second;
+            out << "." << get_css_class_name() << id;
+            out << "{-webkit-text-stroke:0.015em " << color << ";text-shadow:none;}" << std::endl;
+        }
+        out << "}" << std::endl;
+    }
+};
+
+/////////////////////////////////////
+/*
+ * Manage the background image sizes
+ *
+ * We don't merge similar values, since they are bound with PAGE_CONTENT_BOX_number
+ */
+class BGImageSizeManager
+{
+public:
+    // Remember the background size of a page.
+    // A page number that is already present keeps its first size.
+    void install(int page_no, double width, double height)
+    {
+        value_map.emplace(page_no, std::make_pair(width, height));
+    }
+
+    // One rule per installed page, sizes rounded and given in px:
+    //   .<PAGE_CONTENT_BOX_CN><page_no>{background-size:<w>px <h>px;}
+    void dump_css(std::ostream & out)
+    {
+        for(auto it = value_map.begin(); it != value_map.end(); ++it)
+        {
+            const double width = it->second.first;
+            const double height = it->second.second;
+            out << "." << CSS::PAGE_CONTENT_BOX_CN << it->first << "{"
+                << "background-size:" << round(width) << "px " << round(height) << "px;"
+                << "}" << std::endl;
+        }
+    }
+
+    // Same rules for printing: sizes multiplied by scale, rounded and given in pt
+    void dump_print_css(std::ostream & out, double scale)
+    {
+        for(auto it = value_map.begin(); it != value_map.end(); ++it)
+        {
+            const double width = it->second.first;
+            const double height = it->second.second;
+            out << "." << CSS::PAGE_CONTENT_BOX_CN << it->first << "{"
+                << "background-size:" << round(width * scale) << "pt " << round(height * scale) << "pt;"
+                << "}" << std::endl;
+        }
+    }
+
+private:
+    // page number -> (width, height)
+    std::unordered_map<int, std::pair<double,double>> value_map; 
+};
+
+/*
+ * Bundles one instance of every state manager, so that they can be
+ * handed around together as a single object.
+ */
+struct AllStateManager
+{
+    TransformMatrixManager transform_matrix;
+    VerticalAlignManager     vertical_align;
+    StrokeColorManager         stroke_color;
+    LetterSpaceManager         letter_space;
+    WhitespaceManager            whitespace;
+    WordSpaceManager             word_space;
+    FillColorManager             fill_color;
+    FontSizeManager               font_size;
+    BottomManager                    bottom;
+    HeightManager                    height;
+    WidthManager                      width;
+    LeftManager                        left;
+    BGImageSizeManager         bgimage_size; // per-page sizes, keyed by page number rather than by value
+};
+
+} // namespace pdf2htmlEX 
+
+#endif //STATEMANAGER_H__
diff --git a/src/StringFormatter.cc b/src/StringFormatter.cc
new file mode 100644
index 0000000..b361c2d
--- /dev/null
+++ b/src/StringFormatter.cc
@@ -0,0 +1,30 @@
+#include <cstdarg>
+#include <algorithm>
+#include <cassert>
+
+#include "StringFormatter.h"
+
+namespace pdf2htmlEX {
+
+/*
+ * printf-style formatting into the single internal buffer.
+ *
+ * format, ...: as for printf
+ * Returns a GuardedPointer that converts to the formatted, NUL-terminated string.
+ * The previous result must no longer be referenced (buf_cnt == 0), because
+ * the buffer is overwritten and may be reallocated.
+ */
+StringFormatter::GuardedPointer StringFormatter::operator () (const char * format, ...) 
+{
+    assert((buf_cnt == 0) && "StringFormatter: buffer is reused!");
+
+    // Only elements inside size() may be written to; reserve()d capacity is
+    // not enough. Turn any reserved capacity into real elements once.
+    if(buf.empty())
+        buf.resize(std::max<std::size_t>(buf.capacity(), 1), '\0');
+
+    va_list vlist;
+    va_start(vlist, format);
+    int l = vsnprintf(buf.data(), buf.size(), format, vlist);
+    va_end(vlist);
+    if((l >= 0) && ((std::size_t)l >= buf.size()))
+    {
+        // truncated: l is the length needed without the terminating NUL,
+        // grow to at least that plus one and at least to twice the old size
+        buf.resize(std::max<std::size_t>((std::size_t)l + 1, buf.size() * 2));
+        // a va_list cannot be reused after vsnprintf, start over
+        va_start(vlist, format);
+        l = vsnprintf(buf.data(), buf.size(), format, vlist);
+        va_end(vlist);
+    }
+    assert(l >= 0); // we should fail when vsnprintf fail
+    assert((std::size_t)l < buf.size());
+    if(l < 0)
+        buf[0] = '\0'; // builds without assertions: hand out an empty string
+    return GuardedPointer(this);
+}
+
+} //namespace pdf2htmlEX
+
diff --git a/src/StringFormatter.h b/src/StringFormatter.h
new file mode 100644
index 0000000..dd3f3c1
--- /dev/null
+++ b/src/StringFormatter.h
@@ -0,0 +1,43 @@
+/*
+ * Buffer reusing string formatter
+ *
+ * by WangLu
+ * 2012.11.29
+ */
+
+#ifndef STRINGFORMATTER_H__
+#define STRINGFORMATTER_H__
+
+#include <vector>
+#include <cstdio>
+
+namespace pdf2htmlEX {
+
+/*
+ * printf-style formatter that reuses one growing buffer for all results.
+ */
+class StringFormatter
+{
+public:
+    /*
+     * Handle to the formatter's buffer.
+     * Every live GuardedPointer is counted in buf_cnt, which lets
+     * operator() detect that an old result is still referenced.
+     */
+    struct GuardedPointer
+    {
+        GuardedPointer(StringFormatter * sf) : sf(sf) { ++(sf->buf_cnt); }
+        GuardedPointer(const GuardedPointer & gp) : sf(gp.sf) { ++(sf->buf_cnt); }
+        ~GuardedPointer(void) { --(sf->buf_cnt); }
+        // start of the formatted string
+        operator char* () const { return &(sf->buf.front()); }
+    private:
+        StringFormatter * sf;
+    };
+
+    // The buffer gets real elements (not just reserved capacity),
+    // so that taking the address of buf.front() is always valid.
+    StringFormatter() : buf(static_cast<std::vector<char>::size_type>(L_tmpnam), '\0'), buf_cnt(0) { }
+    /*
+     * Important:
+     * there is only one buffer, so new strings will replace old ones
+     */
+    GuardedPointer operator () (const char * format, ...);
+
+private:
+    friend struct GuardedPointer;
+    std::vector<char> buf;
+    int buf_cnt; // number of live GuardedPointer objects
+};
+
+} //namespace pdf2htmlEX
+#endif //STRINGFORMATTER_H__
diff --git a/src/TmpFiles.cc b/src/TmpFiles.cc
new file mode 100644
index 0000000..1184548
--- /dev/null
+++ b/src/TmpFiles.cc
@@ -0,0 +1,77 @@
+/*
+ * TmpFiles.cc
+ *
+ * Collect and clean-up temporary files
+ *
+ * implemented by WangLu
+ * split off by Filodej <philodej@gmail.com>
+ */
+
+#include <iostream>
+#include <cstdio>
+#include <sys/stat.h>
+#include <unistd.h>
+
+#include "TmpFiles.h"
+#include "Param.h"
+
+#ifdef __MINGW32__
+#include "util/mingw.h"
+#endif
+
+using namespace std;
+
+namespace pdf2htmlEX {
+
+// Only a reference to param is stored, no copy is made.
+TmpFiles::TmpFiles( const Param& param ) : param( param )
+{
+}
+
+// Clean up the collected temporary files when the collection goes away.
+TmpFiles::~TmpFiles()
+{
+    this->clean();
+}
+
+// Register fn as a temporary file.
+// Nothing is recorded when param.clean_tmp is off; a name that is seen
+// for the first time is reported on stderr in debug mode.
+void TmpFiles::add( const string & fn)
+{
+    if(param.clean_tmp)
+    {
+        const bool is_new = tmp_files.insert(fn).second;
+        if(is_new && param.debug)
+            cerr << "Add new temporary file: " << fn << endl;
+    }
+}
+
+// Return the total size of the temporary files in bytes.
+// Files that cannot be examined (stat() fails, e.g. because the file has
+// not been created or is already gone) do not contribute to the sum.
+double TmpFiles::get_total_size() const
+{
+    double total_size = 0;
+    for(auto & fn : tmp_files)
+    {
+        struct stat st;
+        // st is only filled in when stat() succeeds
+        if(stat(fn.c_str(), &st) == 0)
+            total_size += st.st_size;
+    }
+
+    return total_size;
+}
+
+
+// Remove every registered file and then the temporary directory itself.
+// Does nothing when param.clean_tmp is off. The results of remove() and
+// rmdir() are not checked; the debug messages are printed either way.
+void TmpFiles::clean()
+{
+    if(param.clean_tmp)
+    {
+        for(auto it = tmp_files.begin(); it != tmp_files.end(); ++it)
+        {
+            remove(it->c_str());
+            if(param.debug)
+                cerr << "Remove temporary file: " << *it << endl;
+        }
+
+        rmdir(param.tmp_dir.c_str());
+        if(param.debug)
+            cerr << "Remove temporary directory: " << param.tmp_dir << endl;
+    }
+}
+
+} // namespace pdf2htmlEX
+
diff --git a/src/TmpFiles.h b/src/TmpFiles.h
new file mode 100644
index 0000000..277281d
--- /dev/null
+++ b/src/TmpFiles.h
@@ -0,0 +1,28 @@
+#ifndef TMPFILES_H__
+#define TMPFILES_H__
+
+#include <string>
+#include <set>
+#include "Param.h"
+
+namespace pdf2htmlEX {
+
+/*
+ * Collects the names of temporary files and removes them on destruction
+ */
+class TmpFiles
+{
+public:
+    // only a reference to param is kept, so param has to outlive this object
+    explicit TmpFiles( const Param& param );
+    // runs clean()
+    ~TmpFiles();
+
+    // register fn for removal; ignored when param.clean_tmp is off
+    void add( const std::string& fn);
+    // total size of the registered files, in bytes
+    double get_total_size() const;
+
+private:
+    // remove the registered files and param.tmp_dir, unless param.clean_tmp is off
+    void clean();
+
+    const Param& param;
+    std::set<std::string> tmp_files; // names passed to add()
+};
+
+} // namespace pdf2htmlEX
+
+#endif //TMPFILES_H__
diff --git a/src/css_class_names.cmakelists.txt b/src/css_class_names.cmakelists.txt
new file mode 100644
index 0000000..067d95a
--- /dev/null
+++ b/src/css_class_names.cmakelists.txt
@@ -0,0 +1,39 @@
+# vim: filetype=cmake :
+# CSS class names
+
+# Note
+# Do not use the following single letters as class names: once an ID suffix
+# is appended they could collide with other names (e.g. "f" + "c" looks like "fc").
+# p f s
+
+set(CSS_INVALID_ID          "_")
+
+set(CSS_LINE_CN             "t") # Text 
+set(CSS_TRANSFORM_MATRIX_CN "m") # Matrix
+set(CSS_CLIP_CN             "c") # Clip
+
+set(CSS_PAGE_FRAME_CN       "pf") # Page Frame
+set(CSS_PAGE_CONTENT_BOX_CN "pc") # Page Content
+set(CSS_PAGE_DATA_CN        "pi") # Page Info
+
+set(CSS_BACKGROUND_IMAGE_CN "bi")      # Background Image
+set(CSS_FULL_BACKGROUND_IMAGE_CN "bf") # Background image (Full)
+
+set(CSS_FONT_FAMILY_CN      "ff") # Font Family
+set(CSS_FONT_SIZE_CN        "fs") # Font Size
+
+set(CSS_FILL_COLOR_CN       "fc") # Fill Color
+set(CSS_STROKE_COLOR_CN     "sc") # Stroke Color
+
+set(CSS_LETTER_SPACE_CN     "ls") # Letter Space
+set(CSS_WORD_SPACE_CN       "ws") # Word Space
+set(CSS_VERTICAL_ALIGN_CN   "v") # Vertical align
+set(CSS_WHITESPACE_CN       "_") # whitespace
+set(CSS_LEFT_CN             "x") # X
+set(CSS_HEIGHT_CN           "h") # Height
+set(CSS_WIDTH_CN            "w") # Width
+set(CSS_BOTTTOM_CN          "y") # Y
+set(CSS_CSS_DRAW_CN         "d") # Draw
+set(CSS_LINK_CN             "l") # Link
+set(CSS_INPUT_TEXT_CN       "it") # Text input
+set(CSS_INPUT_RADIO_CN      "ir") # Radio button
+set(CSS_RADIO_CHECKED_CN    "checked") # Show picture of checked radio button
diff --git a/src/pdf2htmlEX-config.h.in b/src/pdf2htmlEX-config.h.in
new file mode 100644
index 0000000..7c9b510
--- /dev/null
+++ b/src/pdf2htmlEX-config.h.in
@@ -0,0 +1,24 @@
+/*
+ * config.h
+ * Compile time constants
+ *
+ * Copyright (C) 2012-2014 Lu Wang <coolwanglu@gmail.com>
+ */
+
+
+#ifndef PDF2HTMLEX_CONFIG_H__
+#define PDF2HTMLEX_CONFIG_H__
+
+#include <string>
+
+#define ENABLE_SVG @ENABLE_SVG@
+
+namespace pdf2htmlEX {
+
+// The @...@ placeholders are filled in when this template is turned into
+// pdf2htmlEX-config.h (presumably by CMake's configure_file -- confirm in the build files).
+// NOTE(review): these are `static` objects in a header, so every translation
+// unit that includes it gets its own copy of each string.
+static const std::string PDF2HTMLEX_VERSION = "@PDF2HTMLEX_VERSION@";
+static const std::string PDF2HTMLEX_PREFIX = "@CMAKE_INSTALL_PREFIX@";
+// the two adjacent string literals are concatenated by the compiler
+static const std::string PDF2HTMLEX_DATA_PATH = "@CMAKE_INSTALL_PREFIX@""/share/pdf2htmlEX";
+
+} // namespace pdf2htmlEX
+
+#endif //PDF2HTMLEX_CONFIG_H__
diff --git a/src/pdf2htmlEX.cc b/src/pdf2htmlEX.cc
new file mode 100644
index 0000000..b56e8e9
--- /dev/null
+++ b/src/pdf2htmlEX.cc
@@ -0,0 +1,445 @@
+// pdf2htmlEX.cc
+//
+// Copyright (C) 2012-2015 Lu Wang <coolwanglu@gmail.com>
+
+#include <cstdio>
+#include <cstdlib>
+#include <cstddef>
+#include <cstring>
+#include <ctime>
+#include <string>
+#include <limits>
+#include <iostream>
+#include <memory>
+#include <errno.h>
+
+#include <getopt.h>
+
+#include <poppler-config.h>
+#include <goo/GooString.h>
+
+#include <Object.h>
+#include <PDFDoc.h>
+#include <PDFDocFactory.h>
+#include <GlobalParams.h>
+
+#include "pdf2htmlEX-config.h"
+
+#if ENABLE_SVG
+#include <cairo.h>
+#endif
+
+#include "ArgParser.h"
+#include "Param.h"
+#include "HTMLRenderer/HTMLRenderer.h"
+
+#include "util/path.h"
+#include "util/ffw.h"
+
+#ifdef __MINGW32__
+#include "util/mingw.h"
+#endif
+
+using namespace std;
+using namespace pdf2htmlEX;
+
+// Program-wide state: the conversion parameters, and the parser that
+// fills them in from the command line (see parse_options).
+Param param;
+ArgParser argparser;
+
+// Print the usage line followed by the option list to stderr and
+// terminate with EXIT_FAILURE.
+// dummy is not used; the parameter lets the function be registered
+// as an option callback in parse_options.
+void show_usage_and_exit(const char * dummy = nullptr)
+{
+    (void)dummy;
+
+    cerr << "Usage: pdf2htmlEX [options] <input.pdf> [<output.html>]" << endl;
+    argparser.show_usage(cerr);
+
+    exit(EXIT_FAILURE);
+}
+
+// Print version, copyright, library versions, the default data directory
+// and the compiled-in image formats to stderr, then terminate with EXIT_SUCCESS.
+// dummy is not used; the parameter lets the function be registered
+// as an option callback in parse_options.
+void show_version_and_exit(const char * dummy = nullptr)
+{
+    (void)dummy;
+
+    cerr << "pdf2htmlEX version " << PDF2HTMLEX_VERSION << endl
+         << "Copyright 2012-2015 Lu Wang <coolwanglu@gmail.com> and other contributors" << endl;
+
+    // versions of the libraries in use
+    cerr << "Libraries: " << endl
+         << "  poppler " << POPPLER_VERSION << endl;
+    cerr << "  libfontforge " << ffw_get_version() << endl;
+#if ENABLE_SVG
+    cerr << "  cairo " << cairo_version_string() << endl;
+#endif
+
+    cerr << "Default data-dir: " << param.data_dir << endl;
+
+    // one word per image format that was compiled in
+    cerr << "Supported image format:";
+#ifdef ENABLE_LIBPNG
+    cerr << " png";
+#endif
+#ifdef ENABLE_LIBJPEG
+    cerr << " jpg";
+#endif
+#if ENABLE_SVG
+    cerr << " svg";
+#endif
+    cerr << endl << endl;
+
+    exit(EXIT_SUCCESS);
+}
+
+// Callback for --embed: every character of str switches one kind of
+// embedding, a lower case letter turns it off and the upper case letter turns it on
+// (c: CSS, f: font, i: image, j: JavaScript, o: outline).
+// Any other character is reported on stderr and skipped.
+void embed_parser (const char * str)
+{
+    for(; *str != '\0'; ++str)
+    {
+        switch(*str)
+        {
+            case 'c':
+                param.embed_css = 0;
+                break;
+            case 'C':
+                param.embed_css = 1;
+                break;
+            case 'f':
+                param.embed_font = 0;
+                break;
+            case 'F':
+                param.embed_font = 1;
+                break;
+            case 'i':
+                param.embed_image = 0;
+                break;
+            case 'I':
+                param.embed_image = 1;
+                break;
+            case 'j':
+                param.embed_javascript = 0;
+                break;
+            case 'J':
+                param.embed_javascript = 1;
+                break;
+            case 'o':
+                param.embed_outline = 0;
+                break;
+            case 'O':
+                param.embed_outline = 1;
+                break;
+            default:
+                cerr << "Unknown character `" << (*str) << "` for --embed" << endl;
+                break;
+        }
+    }
+}
+
+// Create a fresh directory "pdf2htmlEX-XXXXXX" (XXXXXX replaced by mkdtemp)
+// below param.tmp_dir and store its path in param.tmp_dir.
+// Prints the reason and terminates with EXIT_FAILURE when that fails.
+void prepare_directories()
+{
+    // mkdtemp rewrites the trailing XXXXXX of the template in place, so it is
+    // given the string's own writable, NUL-terminated storage.
+    std::string tmp_dir = param.tmp_dir + "/pdf2htmlEX-XXXXXX";
+
+    errno = 0;
+
+    if(mkdtemp(&tmp_dir[0]) == nullptr)
+    {
+        const char * errmsg = strerror(errno);
+        if(!errmsg)
+        {
+            errmsg = "unknown error";
+        }
+        cerr << "Cannot create temp directory: " << errmsg << endl;
+        exit(EXIT_FAILURE);
+    }
+    param.tmp_dir = tmp_dir;
+}
+
+// Register all command line options with argparser, then parse argc/argv.
+// Options either store into a field of the global param (address, default
+// value and description are given) or run a callback (embed, version, help).
+// When the parser throws, the message (if any) is printed to stderr and the
+// program terminates with EXIT_FAILURE.
+// NOTE(review): the meaning of the trailing `true` passed to some add() calls
+// is defined in ArgParser.h -- confirm there.
+void parse_options (int argc, char **argv)
+{
+    argparser
+        // pages
+        .add("first-page,f", &param.first_page, 1, "first page to convert")
+        .add("last-page,l", &param.last_page, numeric_limits<int>::max(), "last page to convert")
+
+        // dimensions
+        .add("zoom", &param.zoom, 0, "zoom ratio", true)
+        .add("fit-width", &param.fit_width, 0, "fit width to <fp> pixels", true)
+        .add("fit-height", &param.fit_height, 0, "fit height to <fp> pixels", true)
+        .add("use-cropbox", &param.use_cropbox, 1, "use CropBox instead of MediaBox")
+        .add("hdpi", &param.h_dpi, 144.0, "horizontal resolution for graphics in DPI")
+        .add("vdpi", &param.v_dpi, 144.0, "vertical resolution for graphics in DPI")
+
+        // output files
+        .add("embed", "specify which elements should be embedded into output", embed_parser, true)
+        .add("embed-css", &param.embed_css, 1, "embed CSS files into output")
+        .add("embed-font", &param.embed_font, 1, "embed font files into output")
+        .add("embed-image", &param.embed_image, 1, "embed image files into output")
+        .add("embed-javascript", &param.embed_javascript, 1, "embed JavaScript files into output")
+        .add("embed-outline", &param.embed_outline, 1, "embed outlines into output")
+        .add("split-pages", &param.split_pages, 0, "split pages into separate files")
+        .add("dest-dir", &param.dest_dir, ".", "specify destination directory")
+        .add("css-filename", &param.css_filename, "", "filename of the generated css file")
+        .add("page-filename", &param.page_filename, "", "filename template for split pages ")
+        .add("outline-filename", &param.outline_filename, "", "filename of the generated outline file")
+        .add("process-nontext", &param.process_nontext, 1, "render graphics in addition to text")
+        .add("process-outline", &param.process_outline, 1, "show outline in HTML")
+        .add("process-annotation", &param.process_annotation, 0, "show annotation in HTML")
+        .add("process-form", &param.process_form, 0, "include text fields and radio buttons")
+        .add("printing", &param.printing, 1, "enable printing support")
+        .add("fallback", &param.fallback, 0, "output in fallback mode")
+        .add("tmp-file-size-limit", &param.tmp_file_size_limit, -1, "Maximum size (in KB) used by temporary files, -1 for no limit.")
+
+        // fonts
+        .add("embed-external-font", &param.embed_external_font, 1, "embed local match for external fonts")
+        .add("font-format", &param.font_format, "woff", "suffix for embedded font files (ttf,otf,woff,svg)")
+        .add("decompose-ligature", &param.decompose_ligature, 0, "decompose ligatures, such as \uFB01 -> fi")
+        .add("auto-hint", &param.auto_hint, 0, "use fontforge autohint on fonts without hints")
+        .add("external-hint-tool", &param.external_hint_tool, "", "external tool for hinting fonts (overrides --auto-hint)")
+        .add("stretch-narrow-glyph", &param.stretch_narrow_glyph, 0, "stretch narrow glyphs instead of padding them")
+        .add("squeeze-wide-glyph", &param.squeeze_wide_glyph, 1, "shrink wide glyphs instead of truncating them")
+        .add("override-fstype", &param.override_fstype, 0, "clear the fstype bits in TTF/OTF fonts")
+        .add("process-type3", &param.process_type3, 0, "convert Type 3 fonts for web (experimental)")
+
+        // text
+        .add("heps", &param.h_eps, 1.0, "horizontal threshold for merging text, in pixels")
+        .add("veps", &param.v_eps, 1.0, "vertical threshold for merging text, in pixels")
+        .add("space-threshold", &param.space_threshold, (1.0/8), "word break threshold (threshold * em)")
+        .add("font-size-multiplier", &param.font_size_multiplier, 4.0, "a value greater than 1 increases the rendering accuracy")
+        .add("space-as-offset", &param.space_as_offset, 0, "treat space characters as offsets")
+        .add("tounicode", &param.tounicode, 0, "how to handle ToUnicode CMaps (0=auto, 1=force, -1=ignore)")
+        .add("optimize-text", &param.optimize_text, 0, "try to reduce the number of HTML elements used for text")
+        .add("correct-text-visibility", &param.correct_text_visibility, 0, "try to detect texts covered by other graphics and properly arrange them")
+
+        // background image
+        .add("bg-format", &param.bg_format, "png", "specify background image format")
+        .add("svg-node-count-limit", &param.svg_node_count_limit, -1, "if node count in a svg background image exceeds this limit,"
+                " fall back this page to bitmap background; negative value means no limit.")
+        .add("svg-embed-bitmap", &param.svg_embed_bitmap, 1, "1: embed bitmaps in svg background; 0: dump bitmaps to external files if possible.")
+
+        // encryption
+        .add("owner-password,o", &param.owner_password, "", "owner password (for encrypted files)", true)
+        .add("user-password,u", &param.user_password, "", "user password (for encrypted files)", true)
+        .add("no-drm", &param.no_drm, 0, "override document DRM settings")
+
+        // misc.
+        // tmp-dir and data-dir use the values main() has already stored in param as their defaults
+        .add("clean-tmp", &param.clean_tmp, 1, "remove temporary files after conversion")
+        .add("tmp-dir", &param.tmp_dir, param.tmp_dir, "specify the location of temporary directory.")
+        .add("data-dir", &param.data_dir, param.data_dir, "specify data directory")
+        .add("debug", &param.debug, 0, "print debugging information")
+        .add("proof", &param.proof, 0, "texts are drawn on both text layer and background for proof.")
+
+        // meta
+        .add("version,v", "print copyright and version info", &show_version_and_exit)
+        .add("help,h", "print usage information", &show_usage_and_exit)
+
+        // entries without a name: the input and the output file name from the usage line
+        .add("", &param.input_filename, "", "")
+        .add("", &param.output_filename, "", "")
+        ;
+
+    // errors are thrown either as C strings or as std::string, both are handled alike
+    try
+    {
+        argparser.parse(argc, argv);
+    }
+    catch(const char * s)
+    {
+        // if s == "", getopt_long would have printed the error message
+        if(s && s[0])
+        {
+            cerr << "Error when parsing the arguments:" << endl;
+            cerr << s << endl;
+        }
+        exit(EXIT_FAILURE);
+    }
+    catch(const std::string & s)
+    {
+        // if s == "", getopt_long would have printed the error message
+        if(s != "")
+        {
+            cerr << "Error when parsing the arguments:" << endl;
+            cerr << s << endl;
+        }
+        exit(EXIT_FAILURE);
+    }
+}
+
+/*
+ * Check the parsed parameters and derive the file names that were not given.
+ *
+ * - without an input file name the usage is shown and the program exits
+ * - output, page, css and outline file names default to the input file name,
+ *   with a ".pdf" suffix replaced by the respective suffix
+ * - an unsupported background format or (without SVG support) process-type3
+ *   terminates the program with EXIT_FAILURE
+ * - questionable combinations only produce a warning on stderr
+ */
+void check_param()
+{
+    if (param.input_filename.empty())
+        show_usage_and_exit();
+
+    const auto input_is_pdf = []() -> bool {
+        return get_suffix(param.input_filename) == ".pdf";
+    };
+    // file name of the input, with the 4 characters of a ".pdf" suffix cut off
+    const auto input_stem = [&input_is_pdf]() -> string {
+        const string fn = get_filename(param.input_filename);
+        return input_is_pdf() ? fn.substr(0, fn.size() - 4) : fn;
+    };
+
+    if(param.output_filename.empty())
+        param.output_filename = input_stem() + ".html";
+
+    if(param.page_filename.empty())
+    {
+        param.page_filename = input_stem() + "%d.page";
+        sanitize_filename(param.page_filename);
+    }
+    else if(!sanitize_filename(param.page_filename))
+    {
+        // Need to make sure we have a page number placeholder in the filename:
+        // inject the placeholder just before the file extension
+        const string suffix = get_suffix(param.page_filename);
+        const string stem = param.page_filename.substr(0, param.page_filename.size() - suffix.size());
+        param.page_filename = stem + "%d" + suffix;
+        sanitize_filename(param.page_filename);
+    }
+
+    if(param.css_filename.empty())
+        param.css_filename = input_stem() + ".css";
+
+    // for an input without ".pdf" suffix the outline name is only derived
+    // when the pages are not split
+    if(param.outline_filename.empty())
+    {
+        if(input_is_pdf() || !param.split_pages)
+            param.outline_filename = input_stem() + ".outline";
+    }
+
+    // only the formats compiled in are accepted,
+    // the empty first branch lets every format start with `else if`
+    if(false) { }
+#ifdef ENABLE_LIBPNG
+    else if (param.bg_format == "png") { }
+#endif
+#ifdef ENABLE_LIBJPEG
+    else if (param.bg_format == "jpg") { }
+#endif
+#if ENABLE_SVG
+    else if(param.bg_format == "svg") { }
+#endif
+    else
+    {
+        cerr << "Image format not supported: " << param.bg_format << endl;
+        exit(EXIT_FAILURE);
+    }
+
+#if not ENABLE_SVG
+    if(param.process_type3)
+    {
+        cerr << "process-type3 is enabled, however SVG support is not built in this version of pdf2htmlEX." << endl;
+        exit(EXIT_FAILURE);
+    }
+#endif
+
+    const bool ttf_without_hint_tool = (param.font_format == "ttf") && (param.external_hint_tool == "");
+    if(ttf_without_hint_tool)
+        cerr << "Warning: No hint tool is specified for truetype fonts, the result may be rendered poorly in some circumstances." << endl;
+
+    if (param.embed_image && (param.bg_format == "svg") && !param.svg_embed_bitmap)
+    {
+        cerr << "Warning: --svg-embed-bitmap is forced on because --embed-image is on, or the dumped bitmaps can't be loaded." << endl;
+        param.svg_embed_bitmap = 1;
+    }
+}
+
+/*
+ * Program entry point:
+ *  1. choose the default temporary and data directories
+ *  2. parse and check the command line
+ *  3. create the temporary and the destination directory
+ *  4. open the PDF with poppler and run HTMLRenderer over it
+ * Exits with EXIT_SUCCESS when the conversion has finished, EXIT_FAILURE otherwise.
+ */
+int main(int argc, char **argv)
+{
+    // We need to adjust these directories before parsing the options.
+#if defined(__MINGW32__)
+    param.data_dir = get_exec_dir(argv[0]);
+    param.tmp_dir  = get_tmp_dir();
+#else
+    char const* tmp = getenv("TMPDIR");
+    // An empty TMPDIR counts as not set, otherwise the temporary directory
+    // would be created directly below the filesystem root.
+    if (tmp && !tmp[0])
+        tmp = nullptr;
+#ifdef P_tmpdir
+    if (!tmp)
+        tmp = P_tmpdir;
+#endif
+#ifdef _PATH_TMP
+    if (!tmp)
+        tmp = _PATH_TMP;
+#endif
+    if (!tmp)
+        tmp = "/tmp";
+    param.tmp_dir = string(tmp);
+    param.data_dir = PDF2HTMLEX_DATA_PATH;
+#endif
+
+    parse_options(argc, argv);
+    check_param();
+
+    //prepare the directories
+    prepare_directories();
+
+    if(param.debug)
+        cerr << "temporary dir: " << (param.tmp_dir) << endl;
+
+    try
+    {
+        create_directories(param.dest_dir);
+    }
+    catch (const string & s)
+    {
+        cerr << s << endl;
+        exit(EXIT_FAILURE);
+    }
+
+    bool finished = false;
+    // read config file
+    globalParams = new GlobalParams();
+    // open PDF file
+    PDFDoc * doc = nullptr;
+    try
+    {
+        {
+            // The passwords are only needed while the document is opened; they are
+            // released at the end of this scope, also when opening throws.
+            unique_ptr<GooString> ownerPW((param.owner_password == "") ? (nullptr) : (new GooString(param.owner_password.c_str())));
+            unique_ptr<GooString> userPW((param.user_password == "") ? (nullptr) : (new GooString(param.user_password.c_str())));
+            GooString fileName(param.input_filename.c_str());
+
+            doc = PDFDocFactory().createPDFDoc(fileName, ownerPW.get(), userPW.get());
+        }
+
+        if (!doc->isOk())
+            throw "Cannot read the file";
+
+        // check for copy permission
+        if (!doc->okToCopy())
+        {
+            if (param.no_drm == 0)
+                throw "Copying of text from this document is not allowed.";
+            cerr << "Document has copy-protection bit set." << endl;
+        }
+
+        // clamp the page range to [1, number of pages] and make sure last >= first
+        param.first_page = min<int>(max<int>(param.first_page, 1), doc->getNumPages());
+        param.last_page = min<int>(max<int>(param.last_page, param.first_page), doc->getNumPages());
+
+
+        unique_ptr<HTMLRenderer>(new HTMLRenderer(param))->process(doc);
+
+        finished = true;
+    }
+    catch (const char * s)
+    {
+        cerr << "Error: " << s << endl;
+    }
+    catch (const string & s)
+    {
+        cerr << "Error: " << s << endl;
+    }
+
+    // clean up
+    delete doc;
+    delete globalParams;
+
+    // check for memory leaks
+    Object::memCheck(stderr);
+    gMemReport(stderr);
+
+    exit(finished ? (EXIT_SUCCESS) : (EXIT_FAILURE));
+
+    return 0;
+}
diff --git a/src/util/const.cc b/src/util/const.cc
new file mode 100644
index 0000000..c85e0d5
--- /dev/null
+++ b/src/util/const.cc
@@ -0,0 +1,53 @@
+/*
+ * Constants
+ *
+ * by WangLu
+ * 2012.11.29
+ */
+
+#include "const.h"
+
+namespace pdf2htmlEX {
+
+using std::map;
+using std::string;
+
+// 6-element transform matrix: the first four entries form a 2x2 identity, the last two are zero
+const double ID_MATRIX[6] = {1.0, 0.0, 0.0, 1.0, 0.0, 0.0};
+
+// Font name lookup: the keys are raw byte strings (written as hex escapes,
+// GB encoded font names according to const.h), the values are ASCII font names.
+const map<string, string> GB_ENCODED_FONT_NAME_MAP({
+    {"\xCB\xCE\xCC\xE5", "SimSun"},
+    {"\xBA\xDA\xCC\xE5", "SimHei"},
+    {"\xBF\xAC\xCC\xE5_GB2312", "SimKai"},
+    {"\xB7\xC2\xCB\xCE_GB2312", "SimFang"},
+    {"\xC1\xA5\xCA\xE9", "SimLi"},
+});
+
+// How a file is put into the HTML output, keyed by file suffix (with the dot).
+// Each entry lists, in the member order of EmbedStringEntry:
+//   the Param flag choosing between embedding and linking,
+//   prefix and suffix wrapped around embedded content,
+//   whether embedded content is base64 encoded (only for .png),
+//   prefix and suffix wrapped around the reference to an external file.
+const std::map<std::string, EmbedStringEntry> EMBED_STRING_MAP({
+    {".css", {&Param::embed_css, 
+              "<style type=\"text/css\">", 
+              "</style>", false,
+              "<link rel=\"stylesheet\" href=\"", 
+              "\"/>" }},
+    {".js", {&Param::embed_javascript,
+             "<script>", 
+             "</script>", false,
+             "<script src=\"",
+             "\"></script>" }},
+    {".png", {&Param::embed_image,
+             "<img alt=\"\" src=\"data:image/png;base64,", 
+             "\"/>", true,
+             "<img alt=\"\" src=\"",
+             "\"/>" }}
+});
+
+// MIME type for a font or image format, keyed by the format name without a dot
+const std::map<std::string, std::string> FORMAT_MIME_TYPE_MAP({
+    {"eot", "application/vnd.ms-fontobject"},
+    {"jpg", "image/jpeg"},
+    {"otf", "application/x-font-otf"},
+    {"png", "image/png"},
+    {"svg", "image/svg+xml"},
+    {"ttf", "application/x-font-ttf"},
+    {"woff", "application/font-woff"},
+});
+
+} //namespace pdf2htmlEX
diff --git a/src/util/const.h b/src/util/const.h
new file mode 100644
index 0000000..db29a5c
--- /dev/null
+++ b/src/util/const.h
@@ -0,0 +1,46 @@
+/*
+ * Constants
+ *
+ * by WangLu
+ * 2012.11.29
+ */
+
+#ifndef CONST_H__
+#define CONST_H__
+
+#include <map>
+#include <string>
+
+#include "Param.h"
+
+namespace pdf2htmlEX {
+
+// Fallback for compilers without nullptr.
+// nullptr is a keyword since C++11, not a macro, so `#ifndef nullptr` alone is
+// always true and would replace the keyword everywhere; the language version
+// has to be checked as well.
+#if !defined(nullptr) && (__cplusplus < 201103L)
+#define nullptr (NULL)
+#endif
+
+// presumably the tolerance for floating point comparisons -- confirm against the users
+static const double EPS = 1e-6;
+// presumably the resolution PDF coordinates are given in -- confirm against the users
+static const double DEFAULT_DPI = 72.0;
+// defined in const.cc as {1, 0, 0, 1, 0, 0}
+extern const double ID_MATRIX[6];
+
+// For GB encoded font names
+extern const std::map<std::string, std::string> GB_ENCODED_FONT_NAME_MAP;
+// map to embed files into html
+// Describes how one kind of file is embedded into, or referenced from, the HTML output.
+// The member order matters: the entries of EMBED_STRING_MAP in const.cc are
+// brace-initialized in exactly this order.
+struct EmbedStringEntry
+{
+    // pointer to the member of Param that selects embedding (to be read as param.*embed_flag)
+    int Param::*embed_flag; 
+    // used when *embed_flag == true
+    std::string prefix_embed;
+    std::string suffix_embed;
+    bool base64_encode; // encode the embedded content with base64
+    // used when *embed_flag == false
+    std::string prefix_external;
+    std::string suffix_external;
+};
+extern const std::map<std::string, EmbedStringEntry> EMBED_STRING_MAP;
+
+extern const std::map<std::string, std::string> FORMAT_MIME_TYPE_MAP;
+
+} // namespace pdf2htmlEX
+
+#endif //CONST_H__
diff --git a/src/util/css_const.h.in b/src/util/css_const.h.in
new file mode 100644
index 0000000..08c23fc
--- /dev/null
+++ b/src/util/css_const.h.in
@@ -0,0 +1,67 @@
+/* vim: set filetype=cpp : */
+/*
+ * css_const.h
+ *
+ * Copyright (C) 2013 Lu Wang <coolwanglu@gmail.com>
+ */
+
+#ifndef CSS_CONST_H__
+#define CSS_CONST_H__
+
+
+/*
+ * should be consistent with base.css and pdf2htmlEX.js
+ */
+
+namespace pdf2htmlEX {
+namespace CSS {
+
+// The @CSS_...@ placeholders below are replaced by the values set in
+// css_class_names.cmakelists.txt when this template is configured.
+
+// work around strings
+const char * const WEBKIT_ONLY         = "@media screen and (-webkit-min-device-pixel-ratio:0)";
+const char * const PRINT_ONLY          = "@media print";
+
+// usually the class name is XXX_CN or XXX_CN<hex id>
+// sometimes we need a special one, e.g. transparent color, where the id is -1
+const char * const INVALID_ID          = "@CSS_INVALID_ID@";
+
+const char * const LINE_CN             = "@CSS_LINE_CN@";
+const char * const TRANSFORM_MATRIX_CN = "@CSS_TRANSFORM_MATRIX_CN@";
+const char * const CLIP_CN             = "@CSS_CLIP_CN@";
+
+// page_decoration is for shadow etc
+// page_frame cannot have margin or border-width, pdf2htmlEX.js will use it to determine the coordinates
+// page_content holds everything inside the page, could be hidden to speed up rendering
+// page_data holds data for pdf2htmlEX.js
+// NOTE(review): css_class_names.cmakelists.txt has no set(CSS_PAGE_DECORATION_CN ...),
+// so PAGE_DECORATION_CN presumably ends up as an empty string -- confirm.
+const char * const PAGE_DECORATION_CN  = "@CSS_PAGE_DECORATION_CN@";
+const char * const PAGE_FRAME_CN       = "@CSS_PAGE_FRAME_CN@";
+const char * const PAGE_CONTENT_BOX_CN = "@CSS_PAGE_CONTENT_BOX_CN@";
+const char * const PAGE_DATA_CN        = "@CSS_PAGE_DATA_CN@";
+
+const char * const BACKGROUND_IMAGE_CN = "@CSS_BACKGROUND_IMAGE_CN@";
+const char * const FULL_BACKGROUND_IMAGE_CN = "@CSS_FULL_BACKGROUND_IMAGE_CN@";
+
+const char * const FONT_FAMILY_CN      = "@CSS_FONT_FAMILY_CN@";
+const char * const FONT_SIZE_CN        = "@CSS_FONT_SIZE_CN@";
+const char * const FILL_COLOR_CN       = "@CSS_FILL_COLOR_CN@";
+const char * const STROKE_COLOR_CN     = "@CSS_STROKE_COLOR_CN@";
+const char * const LETTER_SPACE_CN     = "@CSS_LETTER_SPACE_CN@";
+const char * const WORD_SPACE_CN       = "@CSS_WORD_SPACE_CN@";
+const char * const VERTICAL_ALIGN_CN   = "@CSS_VERTICAL_ALIGN_CN@";
+const char * const WHITESPACE_CN       = "@CSS_WHITESPACE_CN@";
+const char * const LEFT_CN             = "@CSS_LEFT_CN@";
+const char * const HEIGHT_CN           = "@CSS_HEIGHT_CN@";
+const char * const WIDTH_CN            = "@CSS_WIDTH_CN@";
+// the placeholder is spelled BOTTTOM on purpose: it has to match the
+// variable name used in css_class_names.cmakelists.txt
+const char * const BOTTOM_CN           = "@CSS_BOTTTOM_CN@";
+
+const char * const CSS_DRAW_CN         = "@CSS_CSS_DRAW_CN@";
+const char * const LINK_CN             = "@CSS_LINK_CN@";
+
+const char * const INPUT_TEXT_CN       = "@CSS_INPUT_TEXT_CN@";
+const char * const INPUT_RADIO_CN      = "@CSS_INPUT_RADIO_CN@";
+const char * const RADIO_CHECKED_CN    = "@CSS_RADIO_CHECKED_CN@";
+
+}
+}
+
+
+#endif //CSS_CONST_H__
diff --git a/src/util/encoding.cc b/src/util/encoding.cc
new file mode 100644
index 0000000..6b600bc
--- /dev/null
+++ b/src/util/encoding.cc
@@ -0,0 +1,182 @@
+/*
+ * Encodings used in HTML
+ *
+ * by WangLu
+ * 2013.02.15
+ */
+
+#include <cstring>
+
+#include "encoding.h"
+#include "const.h" // for nullptr
+
+namespace pdf2htmlEX {
+
+using std::ostream;
+using std::string;
+
+/*
+ * Copied from UTF.h / UTF8.h in poppler
+ */
+// Encode the code point u as UTF-8 into buf, which can hold bufSize bytes.
+// Returns the number of bytes written (1 to 4), or 0 when bufSize is too
+// small for the encoding or u is greater than 0x10ffff.
+static int mapUTF8(Unicode u, char *buf, int bufSize)
+{
+    // how many bytes this code point needs
+    int needed;
+    if (u <= 0x0000007f)
+        needed = 1;
+    else if (u <= 0x000007ff)
+        needed = 2;
+    else if (u <= 0x0000ffff)
+        needed = 3;
+    else if (u <= 0x0010ffff)
+        needed = 4;
+    else
+        return 0;
+
+    if (bufSize < needed)
+        return 0;
+
+    switch (needed)
+    {
+        case 1:
+            buf[0] = (char)u;
+            break;
+        case 2:
+            buf[0] = (char)(0xc0 + (u >> 6));
+            buf[1] = (char)(0x80 + (u & 0x3f));
+            break;
+        case 3:
+            buf[0] = (char)(0xe0 + (u >> 12));
+            buf[1] = (char)(0x80 + ((u >> 6) & 0x3f));
+            buf[2] = (char)(0x80 + (u & 0x3f));
+            break;
+        default: // 4 bytes
+            buf[0] = (char)(0xf0 + (u >> 18));
+            buf[1] = (char)(0x80 + ((u >> 12) & 0x3f));
+            buf[2] = (char)(0x80 + ((u >> 6) & 0x3f));
+            buf[3] = (char)(0x80 + (u & 0x3f));
+            break;
+    }
+    return needed;
+}
+
+// Write the uLen code points in u to out.
+// The characters & " ' < > are written as the entities &amp; &quot; &apos;
+// &lt; &gt;; every other code point is written as its UTF-8 encoding
+// (nothing is written for a code point mapUTF8 cannot encode).
+void writeUnicodes(ostream & out, const Unicode * u, int uLen)
+{
+    int idx = 0;
+    while(idx < uLen)
+    {
+        const Unicode cp = u[idx++];
+
+        const char * entity = nullptr;
+        switch(cp)
+        {
+            case '&':  entity = "&amp;";  break;
+            case '\"': entity = "&quot;"; break;
+            case '\'': entity = "&apos;"; break;
+            case '<':  entity = "&lt;";   break;
+            case '>':  entity = "&gt;";   break;
+            default:   break;
+        }
+
+        if(entity)
+        {
+            out << entity;
+            continue;
+        }
+
+        char utf8[4];
+        auto len = mapUTF8(cp, utf8, 4);
+        out.write(utf8, len);
+    }
+}
+
+/*
+static void writeHEX(ostream & out, char c)
+{
+    static const char * hexchars = "0123456789abcdef";
+    out << hexchars[(c>>4)&0xf] << hexchars[c&0xf];
+}
+
+void writeURL(ostream & out, const string & s)
+{
+    static char * dont_escape = nullptr;
+    if(!dont_escape)
+    {
+        dont_escape = new char [256];
+        memset(dont_escape, 0, 256 * sizeof(char));
+        / *
+         * http://tools.ietf.org/html/rfc3986#section-2
+         *
+         * Also includes '%', in case that the original url has been escaped
+         * /
+        const char * no_escape_chars = ":/?#[]@!$&'()*+,;="
+            "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+            "abcdefghijklmnopqrstuvwxyz"
+            "0123456789"
+            "-._~"
+            "%";
+        while(*no_escape_chars)
+            dont_escape[(int)*(no_escape_chars++)] = 1;
+    }
+
+    for (auto iter = s.begin(); iter != s.end(); ++iter)
+    {
+        char c = *iter;
+        if(dont_escape[(int)c])
+            out << c;
+        else
+        {
+            out << '%';
+            writeHEX(out, c);
+        }
+    }
+}
+*/
+
+// Write s to out with backslash escapes for \ " ' / and for the control
+// characters \b \f \n \r \t. All other characters are copied unchanged.
+void writeJSON(ostream & out, const string & s)
+{
+    for(auto iter = s.begin(); iter != s.end(); ++iter)
+    {
+        const char ch = *iter;
+        const char * escaped = nullptr;
+
+        if(ch == '\\')      escaped = "\\\\";
+        else if(ch == '"')  escaped = "\\\"";
+        else if(ch == '\'') escaped = "\\\'";
+        else if(ch == '/')  escaped = "\\/";
+        else if(ch == '\b') escaped = "\\b";
+        else if(ch == '\f') escaped = "\\f";
+        else if(ch == '\n') escaped = "\\n";
+        else if(ch == '\r') escaped = "\\r";
+        else if(ch == '\t') escaped = "\\t";
+
+        if(escaped)
+            out << escaped;
+        else
+            out << ch;
+    }
+}
+
+// Write s to out for use inside an HTML attribute value: & " ' < > and the
+// backtick are replaced by character references, everything else is copied.
+void writeAttribute(std::ostream & out, const std::string & s)
+{
+    for(auto iter = s.begin(); iter != s.end(); ++iter)
+    {
+        const char ch = *iter;
+        const char * ref = nullptr;
+
+        if(ch == '&')       ref = "&amp;";
+        else if(ch == '\"') ref = "&quot;";
+        else if(ch == '\'') ref = "&apos;";
+        else if(ch == '<')  ref = "&lt;";
+        else if(ch == '>')  ref = "&gt;";
+        else if(ch == '`')  ref = "&#96;"; // for IE: http://html5sec.org/#59
+
+        if(ref)
+            out << ref;
+        else
+            out << ch;
+    }
+}
+
+} //namespace pdf2htmlEX
diff --git a/src/util/encoding.h b/src/util/encoding.h
new file mode 100644
index 0000000..c4d7732
--- /dev/null
+++ b/src/util/encoding.h
@@ -0,0 +1,41 @@
+/*
+ * Encodings used in HTML
+ *
+ * by WangLu
+ * 2013.02.15
+ */
+
+#ifndef ENCODING_H__
+#define ENCODING_H__
+
+#include <string>
+#include <iostream>
+
+#include <CharTypes.h>
+
+namespace pdf2htmlEX {
+
+/*
+ * Escape necessary characters, and map Unicode to UTF-8
+ */
+void writeUnicodes(std::ostream & out, const Unicode * u, int uLen);
+
+
+/*
+ * URL escaping
+ */
+//void writeURL(std::ostream & out, const std::string & s);
+
+/*
+ * JSON escaping
+ */
+void writeJSON(std::ostream & out, const std::string & s);
+
+/*
+ * HTML tag attribute escaping
+ */
+void writeAttribute(std::ostream & out, const std::string & s);
+
+} // namespace pdf2htmlEX
+
+#endif //ENCODING_H__
diff --git a/src/util/ffw.c b/src/util/ffw.c
new file mode 100644
index 0000000..b88efce
--- /dev/null
+++ b/src/util/ffw.c
@@ -0,0 +1,485 @@
+/*
+ * ffw.c: Fontforge wrapper
+ *
+ * Processing fonts using Fontforge
+ *
+ * Copyright (C) 2012-2014 Lu Wang <coolwanglu@gmail.com>
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdarg.h>
+#include <assert.h>
+#include <math.h>
+
+#include <fontforge.h>
+#include <baseviews.h>
+
+#include "ffw.h"
+
+static real EPS=1e-6;
+
+// Return the smaller of two ints (b when they are equal).
+static inline int min(int a, int b)
+{
+    if(a < b)
+        return a;
+    return b;
+}
+
+static FontViewBase * cur_fv = NULL;
+static Encoding * original_enc = NULL;
+static Encoding * unicodefull_enc = NULL;
+static Encoding * enc_head = NULL;
+
+// Print a printf-style message to stderr, then terminate with exit(-1).
+static void err(const char * format, ...)
+{
+    va_list args;
+
+    va_start(args, format);
+    vfprintf(stderr, format, args);
+    va_end(args);
+
+    exit(-1);
+}
+// Duplicate str with strdup; a NULL input yields NULL.
+// Calls err() (which exits) if the duplication fails.
+static char * strcopy(const char * str)
+{
+    char * dup = NULL;
+
+    if(str != NULL)
+    {
+        dup = strdup(str);
+        if(dup == NULL)
+            err("Not enough memory");
+    }
+    return dup;
+}
+
+// No-op replacement for ui_interface->logwarning (installed by ffw_init when debug is off).
+static void dumb_logwarning(const char * format, ...)
+{
+}
+
+// No-op replacement for ui_interface->post_error (installed by ffw_init when debug is off).
+static void dumb_post_error(const char * title, const char * error, ...)
+{
+}
+
+// One-time FontForge setup.
+// debug: when zero, FontForge's logwarning and post_error hooks are replaced
+// by no-op functions so that FontForge stays silent.
+void ffw_init(int debug)
+{
+    InitSimpleStuff();
+
+    if(default_encoding == NULL)
+    {
+        default_encoding = FindOrMakeEncoding("ISO8859-1");
+        if(default_encoding == NULL)
+            default_encoding = &custom; /* In case iconv is broken */
+    }
+
+    if(debug == 0)
+    {
+        // disable error output of Fontforge
+        ui_interface->logwarning = &dumb_logwarning;
+        ui_interface->post_error = &dumb_post_error;
+    }
+
+    // encodings used later by ffw_reencode_glyph_order / ffw_reencode_unicode_full
+    original_enc = FindOrMakeEncoding("original");
+    unicodefull_enc = FindOrMakeEncoding("UnicodeFull");
+
+    // set the integer preference "DetectDiagonalStems" to 1
+    Val pref;
+    pref.type = v_int;
+    pref.u.ival = 1;
+    SetPrefs("DetectDiagonalStems", &pref, NULL);
+}
+
+// Free every Encoding on the enc_head list (the ones created by
+// ffw_reencode_raw / ffw_reencode_raw2), including its name, its unicode
+// table and, when present, its psnames table. Leaves enc_head == NULL.
+void ffw_finalize(void)
+{
+    Encoding * cur = enc_head;
+
+    while(cur != NULL)
+    {
+        Encoding * following = cur->next;
+
+        free((void*)cur->enc_name);
+        free(cur->unicode);
+        if(cur->psnames != NULL)
+        {
+            int k;
+            for(k = 0; k < cur->char_cnt; ++k)
+                free((void*)cur->psnames[k]);
+            free(cur->psnames);
+        }
+        free(cur);
+
+        cur = following;
+    }
+    enc_head = NULL;
+}
+
+// Return the value of FontForge's FONTFORGE_VERSIONDATE_RAW macro.
+long ffw_get_version(void)
+{
+    const long version = FONTFORGE_VERSIONDATE_RAW;
+    return version;
+}
+
+// Create a new SplineFont with a font view and make it the current font.
+// The previous font must have been released with ffw_close() first.
+void ffw_new_font()
+{
+    assert((cur_fv == NULL) && "Previous font is not destroyed");
+
+    SplineFont * blank = SplineFontNew();
+    cur_fv = FVAppend(_FontViewCreate(blank));
+}
+
+// Load the font stored in filename and make it the current font.
+// Exits through err() when the font cannot be loaded. The previous font
+// must have been released with ffw_close() first.
+void ffw_load_font(const char * filename)
+{
+    assert((cur_fv == NULL) && "Previous font is not destroyed");
+
+    // LoadSplineFont is handed a private, writable copy of the name
+    char * name_copy = strcopy(filename);
+    SplineFont * loaded = LoadSplineFont(name_copy, 1);
+    free(name_copy);
+
+    if(loaded == NULL)
+        err("Cannot load font %s\n", filename);
+
+    // make sure the font has a font view attached
+    if(loaded->fv == NULL)
+        FVAppend(_FontViewCreate(loaded));
+    assert(loaded->fv);
+
+    cur_fv = loaded->fv;
+}
+
+/*
+ * Fight against dirty stuff
+ */
+// Clean up the current font: remove horizontal and vertical kerning, drop
+// every glyph's alternate unicode list, and replace the font name with "".
+void ffw_prepare_font(void)
+{
+    // mark every encoding slot as selected, then remove kerning
+    memset(cur_fv->selected, 1, cur_fv->map->enccount);
+    FVRemoveKerns(cur_fv);
+    FVRemoveVKerns(cur_fv);
+
+    /*
+     * Remove Alternate Unicodes
+     * We never use them because we will do a force encoding
+     */
+    SplineFont * font = cur_fv->sf;
+    int gid;
+    for(gid = 0; gid < font->glyphcnt; ++gid)
+    {
+        SplineChar * glyph = font->glyphs[gid];
+        if(glyph == NULL || glyph->altuni == NULL)
+            continue;
+
+        AltUniFree(glyph->altuni);
+        glyph->altuni = NULL;
+    }
+
+    /*
+     * Wipe out the font name:
+     * browsers may reject fonts with malformed font names
+     */
+    free(font->fontname);
+    font->fontname = strcopy("");
+}
+
+// Write the current font to filename through FontForge's GenerateScript.
+// Exits through err() when GenerateScript reports failure (returns 0).
+void ffw_save(const char * filename)
+{
+    // GenerateScript receives writable copies of both strings
+    char * out_name = strcopy(filename);
+    char * empty = strcopy("");
+
+    int ok = GenerateScript(cur_fv->sf, out_name, empty,
+            -1, -1, NULL, NULL, cur_fv->map, NULL, ly_fore);
+
+    free(empty);
+    free(out_name);
+
+    if(ok == 0)
+        err("Cannot save font to %s\n", filename);
+}
+// Close the current font view and forget it, so a new font can be opened.
+void ffw_close(void)
+{
+    FontViewBase * closing = cur_fv;
+
+    FontViewClose(closing);
+    cur_fv = NULL;
+}
+
+// Re-encode the current font with encoding.
+// force != 0: SFForceEncoding is applied to the existing map.
+// force == 0: the map is freed and rebuilt with EncMapFromEncoding.
+// Afterwards the "normal" map is dropped, the BDF properties are refreshed
+// and the selection array is reallocated, zeroed, for the new slot count.
+static void ffw_do_reencode(Encoding * encoding, int force)
+{
+    assert(encoding);
+
+    if(!force)
+    {
+        EncMapFree(cur_fv->map);
+        cur_fv->map = EncMapFromEncoding(cur_fv->sf, encoding);
+    }
+    else
+    {
+        SFForceEncoding(cur_fv->sf, cur_fv->map, encoding);
+    }
+
+    if(cur_fv->normal != NULL)
+    {
+        EncMapFree(cur_fv->normal);
+        cur_fv->normal = NULL;
+    }
+
+    SFReplaceEncodingBDFProps(cur_fv->sf, cur_fv->map);
+
+    // the selection array has one byte per encoding slot
+    free(cur_fv->selected);
+    cur_fv->selected = calloc(cur_fv->map->enccount, sizeof(char));
+}
+
+// Re-encode the current font with the "original" encoding, without forcing.
+void ffw_reencode_glyph_order(void)
+{
+    const int force = 0;
+    ffw_do_reencode(original_enc, force);
+}
+
+// Re-encode the current font with the "UnicodeFull" encoding, without forcing.
+void ffw_reencode_unicode_full(void)
+{
+    const int force = 0;
+    ffw_do_reencode(unicodefull_enc, force);
+}
+
+// Re-encode the current font with the encoding named encname.
+// Exits through err() when FindOrMakeEncoding does not return an encoding.
+void ffw_reencode(const char * encname, int force)
+{
+    Encoding * found = FindOrMakeEncoding(encname);
+
+    if(found == NULL)
+        err("Unknown encoding %s\n", encname);
+
+    ffw_do_reencode(found, force);
+}
+
+// Re-encode the current font with an ad-hoc encoding built from mapping,
+// an array of mapping_len unicode values indexed by slot.
+// The table always has at least 256 slots; slots beyond mapping_len are
+// filled with -1. The Encoding is pushed onto enc_head so ffw_finalize()
+// can free it. Exits through err() when memory cannot be allocated.
+void ffw_reencode_raw(int32 * mapping, int mapping_len, int force)
+{
+    Encoding * enc = calloc(1, sizeof(Encoding));
+    if(!enc)
+        err("Not enough memory");
+    enc->only_1byte = enc->has_1byte = true;
+
+    int len = (mapping_len < 256) ? 256 : mapping_len;
+    enc->char_cnt = len;
+    // len >= 256, so a NULL result can only mean allocation failure
+    enc->unicode = (int32_t*)malloc(len * sizeof(int32_t));
+    if(!enc->unicode)
+        err("Not enough memory");
+    memcpy(enc->unicode, mapping, mapping_len * sizeof(int32_t));
+
+    // pad the remainder (only non-empty when mapping_len < 256)
+    int i;
+    for(i = mapping_len; i < len; ++i)
+        enc->unicode[i] = -1;
+
+    enc->enc_name = strcopy("");
+
+    enc->next = enc_head;
+    enc_head = enc;
+
+    ffw_do_reencode(enc, force);
+}
+
+// Re-encode the current font with an ad-hoc encoding built from glyph names.
+// mapping holds mapping_len entries; a non-NULL entry is a glyph name that
+// is copied into psnames and converted to a unicode value with UniFromName,
+// a NULL entry leaves the slot unnamed with unicode -1.
+// The Encoding is pushed onto enc_head so ffw_finalize() can free it.
+// Exits through err() when memory cannot be allocated.
+void ffw_reencode_raw2(char ** mapping, int mapping_len, int force)
+{
+    Encoding * enc = calloc(1, sizeof(Encoding));
+    if(!enc)
+        err("Not enough memory");
+    enc->enc_name = strcopy("");
+    enc->char_cnt = mapping_len;
+    enc->unicode = (int32_t*)malloc(mapping_len * sizeof(int32_t));
+    enc->psnames = (char**)calloc(mapping_len, sizeof(char*));
+    // a zero-sized request may legitimately return NULL, so NULL is only
+    // treated as a failure when there is something to store
+    if((mapping_len > 0) && (!enc->unicode || !enc->psnames))
+        err("Not enough memory");
+
+    int i;
+    for(i = 0; i < mapping_len; ++i)
+    {
+        if(mapping[i])
+        {
+            enc->unicode[i] = UniFromName(mapping[i], ui_none, &custom);
+            enc->psnames[i] = strcopy(mapping[i]);
+        }
+        else
+        {
+            enc->unicode[i] = -1;
+        }
+    }
+
+    enc->next = enc_head;
+    enc_head = enc;
+
+    ffw_do_reencode(enc, force);
+}
+
+// Flatten the current font's CID master with SFFlatten.
+// When the font has no cidmaster, a message is printed to stderr and
+// nothing else happens.
+void ffw_cidflatten(void)
+{
+    SplineFont * master = cur_fv->sf->cidmaster;
+
+    if(master == NULL)
+    {
+        fprintf(stderr, "Cannot flatten a non-CID font\n");
+        return;
+    }
+
+    SFFlatten(master);
+}
+
+/*
+ * There is no check if a glyph with the same unicode exists!
+ * TODO: let FontForge fill in the standard glyph name <- or maybe this might cause collision?
+ */
+// Create a glyph in the slot just past the end of the current encoding map,
+// give it the standard glyph name for unicode, and set its width.
+void ffw_add_empty_char(int32_t unicode, int width)
+{
+    char name_buf[400];
+    SplineChar * glyph = SFMakeChar(cur_fv->sf, cur_fv->map, cur_fv->map->enccount);
+    SplineFont * font = cur_fv->sf;
+
+    // SCSetMetaData is handed a heap copy of the generated name
+    const char * std_name = StdGlyphName(name_buf, unicode,
+            font->uni_interp, font->for_new_glyphs);
+    SCSetMetaData(glyph, strcopy(std_name), unicode, glyph->comment);
+
+    SCSynchronizeWidth(glyph, width, glyph->width, cur_fv);
+}
+
+// Em size of the current font: its ascent plus its descent.
+int ffw_get_em_size(void)
+{
+    const SplineFont * font = cur_fv->sf;
+    return font->ascent + font->descent;
+}
+
+// Measure the font with ffw_get_metric() and write the measured values
+// back with ffw_set_metric().
+void ffw_fix_metric()
+{
+    double measured_ascent;
+    double measured_descent;
+
+    ffw_get_metric(&measured_ascent, &measured_descent);
+    ffw_set_metric(measured_ascent, measured_descent);
+}
+
+// Report the vertical extent of the current font as fractions of its em size
+// (ascent + descent): *ascent is the maxy and *descent the miny of the
+// bounds computed by SplineFontFindBounds, each divided by the em size.
+// Both outputs are 0 when the em size is not positive.
+void ffw_get_metric(double * ascent, double * descent)
+{
+    SplineFont * font = cur_fv->sf;
+    DBounds bounds;
+
+    SplineFontFindBounds(font, &bounds);
+
+    const int em = font->ascent + font->descent;
+    if(em <= 0)
+    {
+        *ascent = *descent = 0;
+        return;
+    }
+
+    *ascent = ((double)bounds.maxy) / em;
+    *descent = ((double)bounds.miny) / em;
+}
+
+// Overwrite the vertical metrics stored in the current font's pfminfo.
+// ascent and descent are fractions of the em size (ascent + descent of the
+// font). They are scaled to font units and rounded; a negative ascent or a
+// positive descent is then clamped to 0.
+void ffw_set_metric(double ascent, double descent)
+{
+    SplineFont * font = cur_fv->sf;
+    struct pfminfo * pfm = &font->pfminfo;
+
+    SFDefaultOS2Info(pfm, font, font->fontname);
+    pfm->pfmset = 1;
+    font->changed = 1;
+
+    const int em = font->ascent + font->descent;
+    int asc = floor(ascent * em + 0.5);
+    int desc = floor(descent * em + 0.5);
+
+    if(asc < 0)
+        asc = 0;
+    if(desc > 0)
+        desc = 0;
+
+    /*
+     * The embedded fonts are likely to have inconsistent values for the 3 sets of ascent/descent
+     * PDF viewers don't care, since they don't even use these values
+     * But we have to unify them, for different browsers on different platforms
+     * Things may become easier when there are CSS rules for baseline-based positioning.
+     */
+
+    // ascent: same value in all three sets, with no extra offsets
+    pfm->os2_winascent = asc;
+    pfm->os2_typoascent = asc;
+    pfm->hhead_ascent = asc;
+    pfm->winascent_add = 0;
+    pfm->typoascent_add = 0;
+    pfm->hheadascent_add = 0;
+
+    // descent: os2_windescent gets the negated value, the other two get it as is
+    pfm->os2_windescent = -desc;
+    pfm->os2_typodescent = desc;
+    pfm->hhead_descent = desc;
+    pfm->windescent_add = 0;
+    pfm->typodescent_add = 0;
+    pfm->hheaddescent_add = 0;
+
+    // no line gap
+    pfm->os2_typolinegap = 0;
+    pfm->linegap = 0;
+}
+
+/*
+ * TODO: bitmaps and references have not been considered in this function
+ */
+// Set glyph advance widths from width_list, indexed by encoding slot.
+// Only the first min(mapping_len, enccount) slots are visited. A slot is
+// skipped when its width is -1 or when the map has no glyph for it (-1).
+// A missing glyph is created; an existing glyph with a positive width is
+// scaled horizontally to the target width when it is wider and squeeze_wide
+// is set, or narrower and stretch_narrow is set. The width is then set.
+void ffw_set_widths(int * width_list, int mapping_len,
+        int stretch_narrow, int squeeze_wide)
+{
+    SplineFont * font = cur_fv->sf;
+
+    if(font->onlybitmaps && cur_fv->active_bitmap != NULL && font->bitmaps != NULL)
+        printf("TODO: width vs bitmap\n");
+
+    EncMap * encmap = cur_fv->map;
+    const int slot_cnt = min(mapping_len, encmap->enccount);
+    int slot;
+    for(slot = 0; slot < slot_cnt; ++slot)
+    {
+        // Don't touch glyphs that are not used
+        const int target = width_list[slot];
+        if(target == -1)
+            continue;
+
+        const int gid = encmap->map[slot];
+        if(gid == -1)
+            continue;
+
+        SplineChar * glyph = font->glyphs[gid];
+        if(glyph == NULL)
+        {
+            glyph = SFMakeChar(cur_fv->sf, cur_fv->map, gid);
+        }
+        else if(glyph->width > EPS)
+        {
+            const int too_wide = (glyph->width > target + EPS) && (squeeze_wide);
+            const int too_narrow = (glyph->width < target - EPS) && (stretch_narrow);
+            if(too_wide || too_narrow)
+            {
+                // horizontal scale only: x' = x * target / width
+                real scale[6];
+                scale[0] = ((double)target) / (glyph->width);
+                scale[3] = 1.0;
+                scale[1] = scale[2] = scale[4] = scale[5] = 0;
+                FVTrans(cur_fv, glyph, scale, NULL, fvt_alllayers | fvt_dontmovewidth);
+            }
+        }
+
+        SCSynchronizeWidth(glyph, target, glyph->width, cur_fv);
+    }
+}
+
+// Import the SVG file filename as the glyph for code in the current font.
+// ox, oy and width are fractions of the em size (ascent + descent): the
+// outline is shifted by (-ox * em, -oy * em + descent) and the advance
+// width is set to width * em, rounded. Returns silently when SFFindSlot
+// finds no slot for code; exits through err() when the import fails.
+void ffw_import_svg_glyph(int code, const char * filename, double ox, double oy, double width)
+{
+    const int slot = SFFindSlot(cur_fv->sf, cur_fv->map, code, "");
+    if(slot == -1)
+        return;
+
+    SplineChar * glyph = SFMakeChar(cur_fv->sf, cur_fv->map, slot);
+
+    // select only the target slot before importing
+    memset(cur_fv->selected, 0, cur_fv->map->enccount);
+    cur_fv->selected[slot] = 1;
+    if(!FVImportImages(cur_fv, (char*)filename, fv_svg, 0, -1))
+        err("Import SVG glyph failed");
+
+    // correct origin and width
+    const int asc = cur_fv->sf->ascent;
+    const int desc = cur_fv->sf->descent;
+    const int em = asc + desc;
+
+    real shift[6];
+    shift[0] = 1.0;
+    shift[3] = 1.0;
+    shift[1] = shift[2] = 0.0;
+    shift[4] = -ox * em;
+    shift[5] = -oy * em + desc;
+    FVTrans(cur_fv, glyph, shift, NULL, fvt_alllayers | fvt_dontmovewidth);
+
+    SCSynchronizeWidth(glyph, floor(width * em + 0.5), glyph->width, cur_fv);
+}
+
+// Auto-hint and auto-instruct every slot of the current font. The font is
+// first converted to order-2 (quadratic) splines when its foreground layer
+// is not already order-2.
+void ffw_auto_hint(void)
+{
+    SplineFont * font = cur_fv->sf;
+
+    // convert to quadratic
+    if(font->layers[ly_fore].order2 == 0)
+    {
+        SFCloseAllInstrs(font);
+        SFConvertToOrder2(font);
+    }
+
+    // select every encoding slot, then hint and instruct the selection
+    memset(cur_fv->selected, 1, cur_fv->map->enccount);
+    FVAutoHint(cur_fv);
+    FVAutoInstr(cur_fv);
+}
+
+// Set pfminfo.fstype of the current font to 0 and mark the pfminfo as set
+// and the font as changed.
+void ffw_override_fstype(void)
+{
+    struct pfminfo * pfm = &cur_fv->sf->pfminfo;
+
+    *(int16 *)(&pfm->fstype) = 0;
+    pfm->pfmset = true;
+    cur_fv->sf->changed = true;
+}
diff --git a/src/util/ffw.h b/src/util/ffw.h
new file mode 100644
index 0000000..a01ed79
--- /dev/null
+++ b/src/util/ffw.h
@@ -0,0 +1,74 @@
+/*
+ * ffw.h : Fontforge Wrapper
+ *
+ * Processing fonts using Fontforge
+ *
+ * fontforge.h cannot be included in C++
+ * So this wrapper in C publishes several functions we need
+ *
+ * by WangLu
+ * 2012.09.03
+ */
+
+
+#ifdef __cplusplus
+#include <cstdint>
+namespace pdf2htmlEX {
+extern "C" {
+#else
+#include <stdint.h>
+#endif
+
+////////////////////////
+// global
+void ffw_init(int debug);
+void ffw_finalize(void);
+long ffw_get_version(void);
+
+////////////////////////
+// load & save
+void ffw_new_font();
+void ffw_load_font(const char * filename);
+void ffw_prepare_font(void);
+
+void ffw_save(const char * filename);
+void ffw_close(void);
+
+////////////////////////
+// encoding
+void ffw_reencode_glyph_order(void);
+void ffw_reencode_unicode_full(void);
+void ffw_reencode_raw(int32_t * mapping, int mapping_len, int force);
+void ffw_reencode_raw2(char ** mapping, int mapping_len, int force);
+
+void ffw_cidflatten(void);
+// add a new empty char into the font
+void ffw_add_empty_char(int32_t unicode, int width);
+
+////////////////////////
+// metrics
+int ffw_get_em_size(void);
+// manipulate ascent and descent
+// ascent is between 0 and 1
+// descent is between -1 and 0
+void ffw_fix_metric();
+// get ascent/descent based on the shape
+void ffw_get_metric(double * ascent, double * descent);
+// set corresponding fields
+void ffw_set_metric(double ascent, double descent);
+
+void ffw_set_widths(int * width_list, int mapping_len, 
+        int stretch_narrow, int squeeze_wide);
+
+////////////////////////
+// others
+// (ox,oy) is the position of the true origin, fractions related to em_size
+// also true for glyph_width
+void ffw_import_svg_glyph(int code, const char * filename, double ox, double oy, double glyph_width);
+void ffw_auto_hint(void);
+void ffw_override_fstype(void);
+
+#ifdef __cplusplus
+}
+}
+#endif
diff --git a/src/util/math.cc b/src/util/math.cc
new file mode 100644
index 0000000..1ddabce
--- /dev/null
+++ b/src/util/math.cc
@@ -0,0 +1,90 @@
+#include <cstring>
+#include <limits>
+#include <algorithm>
+
+#include "math.h"
+
+using std::min;
+using std::max;
+
+namespace pdf2htmlEX {
+
+// Apply the 2x3 matrix tm to the point (x, y) in place:
+//   x' = tm[0]*x + tm[2]*y (+ tm[4]),  y' = tm[1]*x + tm[3]*y (+ tm[5])
+// The translation tm[4], tm[5] is skipped when is_delta is true.
+void tm_transform(const double * tm, double & x, double & y, bool is_delta)
+{
+    const double in_x = x;
+    const double in_y = y;
+
+    x = tm[0] * in_x + tm[2] * in_y;
+    y = tm[1] * in_x + tm[3] * in_y;
+
+    if(is_delta)
+        return;
+
+    x += tm[4];
+    y += tm[5];
+}
+
+// In-place matrix product: tm_left = tm_left * tm_right, both 2x3 matrices
+// stored as six doubles.
+void tm_multiply(double * tm_left, const double * tm_right)
+{
+    // the linear part of tm_left is needed after it starts being overwritten
+    const double a = tm_left[0];
+    const double b = tm_left[1];
+    const double c = tm_left[2];
+    const double d = tm_left[3];
+
+    tm_left[0] = a * tm_right[0] + c * tm_right[1];
+    tm_left[1] = b * tm_right[0] + d * tm_right[1];
+    tm_left[2] = a * tm_right[2] + c * tm_right[3];
+    tm_left[3] = b * tm_right[2] + d * tm_right[3];
+    tm_left[4] += a * tm_right[4] + c * tm_right[5];
+    tm_left[5] += b * tm_right[4] + d * tm_right[5];
+}
+
+/*
+ * Transform the box bbox = (x1, y1, x2, y2) by tm and replace it, in place,
+ * with the axis-aligned bounding box of its four transformed corners.
+ */
+void tm_transform_bbox(const double * tm, double * bbox)
+{
+    double & x1 = bbox[0];
+    double & y1 = bbox[1];
+    double & x2 = bbox[2];
+    double & y2 = bbox[3];
+
+    // the four corners of the input box, as (x, y) pairs
+    double corners[4][2];
+    corners[0][0] = corners[1][0] = x1;
+    corners[0][1] = corners[2][1] = y1;
+    corners[2][0] = corners[3][0] = x2;
+    corners[1][1] = corners[3][1] = y2;
+
+    // Seed the running extrema. The maxima must start from lowest(), the most
+    // negative finite double: min() is the smallest POSITIVE double and would
+    // leave x2/y2 wrong for a box that lies entirely in negative coordinates.
+    x1 = y1 = std::numeric_limits<double>::max();
+    x2 = y2 = std::numeric_limits<double>::lowest();
+    for(int i = 0; i < 4; ++i)
+    {
+        auto & x = corners[i][0];
+        auto & y = corners[i][1];
+        tm_transform(tm, x, y);
+        if(x < x1) x1 = x;
+        if(x > x2) x2 = x;
+        if(y < y1) y1 = y;
+        if(y > y2) y2 = y;
+    }
+}
+
+// Intersect two boxes given as (x0, y0, x1, y1); the corners of each box may
+// be given in either order. Returns false, leaving result untouched, when
+// the overlap is empty or has zero width or height. Otherwise returns true
+// and, if result is not null, stores the overlap there. result may be the
+// same array as bbox1 or bbox2, since it is written only after all reads.
+bool bbox_intersect(const double * bbox1, const double * bbox2, double * result)
+{
+    const double left  = max(min(bbox1[0], bbox1[2]), min(bbox2[0], bbox2[2]));
+    const double right = min(max(bbox1[0], bbox1[2]), max(bbox2[0], bbox2[2]));
+    if (left >= right)
+        return false;
+
+    const double low  = max(min(bbox1[1], bbox1[3]), min(bbox2[1], bbox2[3]));
+    const double high = min(max(bbox1[1], bbox1[3]), max(bbox2[1], bbox2[3]));
+    if (low >= high)
+        return false;
+
+    if (result != nullptr)
+    {
+        result[0] = left;
+        result[1] = low;
+        result[2] = right;
+        result[3] = high;
+    }
+    return true;
+}
+
+} //namespace pdf2htmlEX 
+
diff --git a/src/util/math.h b/src/util/math.h
new file mode 100644
index 0000000..8302a93
--- /dev/null
+++ b/src/util/math.h
@@ -0,0 +1,59 @@
+/*
+ * Math functions
+ *
+ * by WangLu
+ * 2012.11.29
+ */
+
+#ifndef MATH_H__
+#define MATH_H__
+
+#include <cmath>
+
+#include "const.h"
+
+namespace pdf2htmlEX {
+
+// Return x when its magnitude exceeds EPS, otherwise exactly 0.0.
+static inline double round(double x)
+{
+    if(std::abs(x) > EPS)
+        return x;
+    return 0.0;
+}
+// True when x and y differ by at most EPS.
+static inline bool equal(double x, double y)
+{
+    const double diff = std::abs(x-y);
+    return diff <= EPS;
+}
+// True when x is greater than EPS.
+static inline bool is_positive(double x)
+{
+    return x > EPS;
+}
+// True when the first size entries of tm1 and tm2 are pairwise equal()
+// (within EPS). size defaults to 6, a full matrix.
+static inline bool tm_equal(const double * tm1, const double * tm2, int size = 6)
+{
+    int i = 0;
+    // advance while the entries match; stop at the first mismatch
+    while(i < size && equal(tm1[i], tm2[i]))
+        ++i;
+    return i >= size;
+}
+
+// Set tm to the identity matrix (1, 0, 0, 1, 0, 0).
+static inline void tm_init(double * tm)
+{
+    for(int i = 0; i < 6; ++i)
+        tm[i] = 0;
+    tm[0] = 1;
+    tm[3] = 1;
+}
+
+// Matrix product result = m1 * m2 for 2x3 matrices stored as six doubles.
+// Entries 0-3 are the linear part; entries 4-5 are the translation, which
+// also picks up m1's own translation.
+static inline void tm_multiply(double * result, const double * m1, const double * m2)
+{
+    // first column of the linear part
+    result[0] = m1[0] * m2[0] + m1[2] * m2[1];
+    result[1] = m1[1] * m2[0] + m1[3] * m2[1];
+
+    // second column of the linear part
+    result[2] = m1[0] * m2[2] + m1[2] * m2[3];
+    result[3] = m1[1] * m2[2] + m1[3] * m2[3];
+
+    // translation
+    result[4] = m1[0] * m2[4] + m1[2] * m2[5] + m1[4];
+    result[5] = m1[1] * m2[4] + m1[3] * m2[5] + m1[5];
+}
+
+// Euclidean length of the vector (x, y), computed as sqrt(x*x + y*y).
+static inline double hypot(double x, double y)
+{
+    const double sum_sq = x*x+y*y;
+    return std::sqrt(sum_sq);
+}
+
+void tm_transform(const double * tm, double & x, double & y, bool is_delta = false);
+void tm_multiply(double * tm_left, const double * tm_right);
+void tm_transform_bbox(const double * tm, double * bbox);
+/**
+ * Calculate the intersection of 2 boxes.
+ * If they are intersecting, store the result to result (if not null) and return true.
+ * Otherwise return false, and result is not touched.
+ * Param result can be same as one of bbox1 and bbox2.
+ * Data in boxes are expected in the order of (x0, y0, x1, y1).
+ */
+bool bbox_intersect(const double * bbox1, const double * bbox2, double * result = nullptr);
+
+} //namespace pdf2htmlEX 
+#endif //MATH_H__
diff --git a/src/util/mingw.cc b/src/util/mingw.cc
new file mode 100644
index 0000000..5d75be0
--- /dev/null
+++ b/src/util/mingw.cc
@@ -0,0 +1,64 @@
+/*
+ * Win32 specific functions
+ *
+ * by MarcSanfacon
+ * 2014.01.13
+ */
+
+#ifdef __MINGW32__
+
+#include <string>
+#include <iostream>
+#include <cstdio>
+#include <cstdlib>
+#include <limits.h>
+#include <libgen.h>
+
+#include "mingw.h"
+
+using namespace std;
+
+// MinGW replacement for mkdtemp: run mktemp on the template temp, then
+// create a directory with the resulting name.
+// Returns the value returned by mktemp, or nullptr when temp is null,
+// mktemp fails, or the directory cannot be created.
+char* mkdtemp(char* temp)
+{
+    if (temp == nullptr)
+        return nullptr;
+
+    char *name = mktemp(temp);
+    if (name == nullptr)
+        return nullptr;
+
+    if (_mkdir(temp) != 0)
+        return nullptr;
+
+    return name;
+}
+
+namespace pdf2htmlEX {
+// Return the default data directory: "<directory of dir>/data".
+// dir is passed to dirname(). When dirname() yields ".", the current working
+// directory is used instead, if it can be obtained.
+string get_exec_dir(char *dir)
+{
+    // Under Windows, the default data_dir is under /data in the pdf2htmlEX directory
+    string s = dirname(dir);
+    if (s == ".") {
+        char* wd(getcwd(nullptr, PATH_MAX));
+        // getcwd returns null on failure; assigning a null char* to a
+        // std::string is undefined behaviour, so keep "." in that case
+        if (wd != nullptr) {
+            s = wd;
+            free(wd);
+        }
+    }
+    s += "/data";
+    return s;
+}
+
+// Return the temporary directory with a trailing '/': the value of the
+// environment variable TMP, else TEMP, else just "/".
+string get_tmp_dir()
+{
+    // Under Windows, the temp path is not under /tmp, find it.
+    const char *dir = getenv("TMP");
+    if (dir == nullptr)
+        dir = getenv("TEMP");
+
+    if (dir == nullptr)
+        return "/";
+
+    return string(dir) + "/";
+}
+
+} // namespace pdf2htmlEX;
+
+#endif //__MINGW32__
+
diff --git a/src/util/mingw.h b/src/util/mingw.h
new file mode 100644
index 0000000..89abf8a
--- /dev/null
+++ b/src/util/mingw.h
@@ -0,0 +1,29 @@
+/*
+ * Win32 specific functions
+ *
+ * by MarcSanfacon
+ * 2014.01.13
+ */
+
+#ifndef MINGW_H__
+#define MINGW_H__
+
+#ifdef __MINGW32__
+
+#include <io.h>
+
+char *mkdtemp(char *temp);
+
+#include <direct.h>
+#define mkdir(A, B) _mkdir(A)
+#define stat _stat
+
+namespace pdf2htmlEX {
+    std::string     get_exec_dir(char *dir);
+    std::string     get_tmp_dir();
+} // namespace pdf2htmlEX
+
+#endif //__MINGW32__
+
+#endif //MINGW_H__
+
diff --git a/src/util/misc.cc b/src/util/misc.cc
new file mode 100644
index 0000000..e2572c0
--- /dev/null
+++ b/src/util/misc.cc
@@ -0,0 +1,66 @@
+/*
+ * Misc functions
+ *
+ *
+ * by WangLu
+ * 2012.08.10
+ */
+
+#include <map>
+
+#include "misc.h"
+
+using std::cerr;
+using std::endl;
+using std::string;
+using std::map;
+using std::ostream;
+
+namespace pdf2htmlEX {
+
+// One axis of css_fix_rectangle_border_width: [lo, hi] is the extent of the
+// rectangle along the axis. Outputs the CSS position, the content length and
+// the width of the two borders perpendicular to the axis.
+static void css_fix_border_one_axis(double lo, double hi, double border_width,
+        double & pos, double & len, double & border)
+{
+    len = hi - lo;
+    if(len > border_width)
+    {
+        // room for both borders: shrink the content by one border width
+        len -= border_width;
+        border = border_width;
+    }
+    else
+    {
+        // no room left for content: the two borders absorb the extent
+        border = border_width + len/2;
+        len = 0;
+    }
+    pos = lo - border_width / 2;
+}
+
+// Convert a rectangle (x1, y1)-(x2, y2) drawn with a border of border_width
+// into a CSS box: position (x, y), content size (w, h) and the widths of the
+// left/right and top/bottom borders.
+void css_fix_rectangle_border_width(double x1, double y1, 
+        double x2, double y2, 
+        double border_width, 
+        double & x, double & y, double & w, double & h,
+        double & border_top_bottom_width, 
+        double & border_left_right_width)
+{
+    css_fix_border_one_axis(x1, x2, border_width, x, w, border_left_right_width);
+    css_fix_border_one_axis(y1, y2, border_width, y, h, border_top_bottom_width);
+}
+
+// Write rgb as "rgb(R,G,B)" with decimal byte values obtained from
+// colToByte. The stream's format flags are restored before returning.
+ostream & operator << (ostream & out, const GfxRGB & rgb)
+{
+    const auto saved_flags = out.flags();
+
+    out << std::dec;
+    out << "rgb(";
+    out << (int)colToByte(rgb.r) << ",";
+    out << (int)colToByte(rgb.g) << ",";
+    out << (int)colToByte(rgb.b) << ")";
+
+    out.flags(saved_flags);
+    return out;
+}
+
+} // namespace pdf2htmlEX
diff --git a/src/util/misc.h b/src/util/misc.h
new file mode 100644
index 0000000..9032e4e
--- /dev/null
+++ b/src/util/misc.h
@@ -0,0 +1,39 @@
+/*
+ * Help classes and Functions
+ *
+ * by WangLu
+ * 2012.08.10
+ */
+
+
+#ifndef UTIL_H__
+#define UTIL_H__
+
+#include <iostream>
+
+#include <GfxState.h>
+
+#include "util/const.h"
+
+namespace pdf2htmlEX {
+
+// Combine the two fields of a Ref into one key: num is shifted left by the
+// bit width of gen and then OR-ed with gen.
+static inline long long hash_ref(const Ref * id)
+{
+    const long long num_part = ((long long)(id->num)) << (sizeof(id->gen)*8);
+    return num_part | (id->gen);
+}
+
+/*
+ * In PDF, edges of the rectangle are in the middle of the borders
+ * In HTML, edges are completely outside the rectangle
+ */
+void css_fix_rectangle_border_width(double x1, double y1, double x2, double y2, 
+        double border_width, 
+        double & x, double & y, double & w, double & h,
+        double & border_top_bottom_width, 
+        double & border_left_right_width);
+
+std::ostream & operator << (std::ostream & out, const GfxRGB & rgb);
+
+} // namespace pdf2htmlEX
+
+#endif //UTIL_H__
diff --git a/src/util/namespace.h b/src/util/namespace.h
new file mode 100644
index 0000000..46dcd0f
--- /dev/null
+++ b/src/util/namespace.h
@@ -0,0 +1,21 @@
+/*
+ * namespace.h
+ *
+ * specifying commonly used names from namespace std
+ *
+ * by WangLu
+ */
+
+#ifndef NAMESPACE_H__
+#define NAMESPACE_H__
+
+using std::hex;
+using std::dec;
+using std::string;
+using std::endl;
+using std::make_pair;
+using std::ifstream;
+using std::ofstream;
+
+#endif // NAMESPACE_H__
+
diff --git a/src/util/path.cc b/src/util/path.cc
new file mode 100644
index 0000000..5abc7a5
--- /dev/null
+++ b/src/util/path.cc
@@ -0,0 +1,141 @@
+/*
+ * Functions manipulating filenames and paths
+ *
+ * by WangLu
+ * 2012.11.29
+ */
+
+#include <errno.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <cstring>
+
+#include "path.h"
+
+#ifdef __MINGW32__
+#include "util/mingw.h"
+#endif
+
+using std::string;
+
+namespace pdf2htmlEX {
+
+// Create the directory path and, recursively, every parent named before a
+// '/' in it. An empty path is a no-op and an already existing directory is
+// accepted. Throws a std::string message when a component cannot be created.
+void create_directories(const string & path)
+{
+    if(path.empty())
+        return;
+
+    // create the parent first
+    const size_t slash = path.rfind('/');
+    if(slash != string::npos)
+        create_directories(path.substr(0, slash));
+
+    if(mkdir(path.c_str(), S_IRWXU) == 0)
+        return;
+
+    // an existing entry is fine as long as it is a directory
+    if(errno == EEXIST)
+    {
+        struct stat info;
+        if((stat(path.c_str(), &info) == 0) && S_ISDIR(info.st_mode))
+            return;
+    }
+
+    throw string("Cannot create directory: ") + path;
+}
+
+// Make filename safe to use as a printf-style format with one integer
+// argument. The first '%' followed by optional digits and a 'd' is kept as
+// the format specifier; every other '%' is doubled to "%%".
+// filename is only modified when such a specifier was found; the return
+// value tells whether it was.
+bool sanitize_filename(string & filename)
+{
+    static const char * const digits = "0123456789";
+    const size_t len = filename.size();
+    string out;
+    bool have_spec = false;
+
+    for(size_t pos = 0; pos < len; ++pos)
+    {
+        const char ch = filename[pos];
+        if(ch != '%')
+        {
+            out.push_back(ch);
+            continue;
+        }
+
+        if(!have_spec)
+        {
+            // collect '%', the run of digits, and the character ending the run
+            string candidate(1, '%');
+            size_t scan = pos + 1;
+            for(; scan < len; ++scan)
+            {
+                candidate.push_back(filename[scan]);
+                if(!strchr(digits, filename[scan]))
+                    break;
+            }
+
+            if('d' == candidate[candidate.size() - 1])
+            {
+                // a valid integer format: keep it and resume after the 'd'
+                out.append(candidate);
+                have_spec = true;
+                pos = scan;
+                continue;
+            }
+        }
+
+        // not (or no longer) a usable specifier: protect the '%' and
+        // continue with the character right after it
+        out.append("%%");
+    }
+
+    // Only sanitize if it is a valid format.
+    if(have_spec)
+        filename.assign(out);
+
+    return have_spec;
+}
+
+// True when suffix is exactly ".ttf", ".ttc" or ".otf" (case-sensitive).
+bool is_truetype_suffix(const string & suffix)
+{
+    static const char * const known[] = { ".ttf", ".ttc", ".otf" };
+
+    for(const char * ext : known)
+    {
+        if(suffix == ext)
+            return true;
+    }
+    return false;
+}
+
+// Return the part of path after its last '/': the whole path when it has
+// no '/', and "" when the '/' is the last character.
+string get_filename (const string & path)
+{
+    const size_t slash = path.rfind('/');
+
+    if(slash == string::npos)
+        return path;
+
+    if(slash == path.size() - 1)
+        return string();
+
+    return path.substr(slash + 1);
+}
+
+// Return the lower-cased suffix of the file name in path, including the
+// leading '.', e.g. ".ttf". Returns "" when the file name has no '.'.
+string get_suffix(const string & path)
+{
+    string fn = get_filename(path);
+    size_t idx = fn.rfind('.');
+    if(idx == string::npos)
+        return "";
+    else
+    {
+        string s = fn.substr(idx);
+        // tolower takes an int that must be representable as unsigned char
+        // (or be EOF); a plain char may be negative for non-ASCII bytes, so
+        // convert through unsigned char first
+        for(auto & c : s)
+            c = static_cast<char>(tolower(static_cast<unsigned char>(c)));
+        return s;
+    }
+}
+
+
+} //namespace pdf2htmlEX
diff --git a/src/util/path.h b/src/util/path.h
new file mode 100644
index 0000000..2a2a685
--- /dev/null
+++ b/src/util/path.h
@@ -0,0 +1,33 @@
+/*
+ * Functions for handling filenames and paths
+ *
+ * by WangLu
+ * 2012.11.29
+ */
+
+#ifndef PATH_H__
+#define PATH_H__
+
+#include <string>
+
+namespace pdf2htmlEX {
+
+void create_directories(const std::string & path);
+
+bool is_truetype_suffix(const std::string & suffix); // exact, case-sensitive match of ".ttf", ".ttc" or ".otf"
+
+std::string get_filename(const std::string & path); // part after the last '/'; "" if path ends in '/'
+std::string get_suffix(const std::string & path); // lower-cased extension of get_filename(path), '.' included; "" if none
+
+/**
+ * Sanitize all occurrences of '%' except for the first valid format specifier. Filename
+ * is only sanitized if a formatter is found, and the function returns true.
+ *
+ * @param filename the filename to be sanitized. Value will be modified.
+ *
+ * @return true if a format specifier was found, false otherwise.
+ */ 
+bool sanitize_filename(std::string & filename);
+
+} //namespace pdf2htmlEX 
+#endif //PATH_H__
diff --git a/src/util/unicode.cc b/src/util/unicode.cc
new file mode 100644
index 0000000..4a2a034
--- /dev/null
+++ b/src/util/unicode.cc
@@ -0,0 +1,70 @@
+/*
+ * Unicode manipulation functions
+ *
+ * Copyright (C) 2012-2014 Lu Wang <coolwanglu@gmail.com>
+ */
+
+#include <iostream>
+
+#include <GlobalParams.h>
+
+#include "pdf2htmlEX-config.h"
+
+#include "unicode.h"
+
+namespace pdf2htmlEX {
+
+using std::cerr;
+using std::endl;
+using std::ostream;
+
+// Map code to a private-use value; warns on stderr if the result exceeds 0x10FFFD (it is still returned).
+Unicode map_to_private(CharCode code)
+{
+    // first choice: offset into the range starting at 0xE000, accepted up to 0xF8FF
+    Unicode mapped = (Unicode)(code + 0xE000);
+    if(mapped <= 0xF8FF)
+        return mapped;
+    // the excess is carried over to the range based at 0xF0000, accepted up to 0xFFFFD
+    mapped = (Unicode)((mapped - 0xF8FF) + 0xF0000);
+    if(mapped <= 0xFFFFD)
+        return mapped;
+    // and then to the range based at 0x100000, whose upper bound is 0x10FFFD
+    mapped = (Unicode)((mapped - 0xFFFFD) + 0x100000);
+    if(mapped > 0x10FFFD)
+        cerr << "Warning: all private use unicode are used" << endl;
+    return mapped;
+}
+
+// Resolve code through the character name stored in a non-CID font; when no
+// legal Unicode value results, fall back to map_to_private(code).
+Unicode unicode_from_font (CharCode code, GfxFont * font)
+{
+    if(!font->isCIDFont())
+    {
+        // dynamic_cast yields nullptr on a type mismatch: check before dereferencing
+        Gfx8BitFont * font8 = dynamic_cast<Gfx8BitFont*>(font);
+        // may be untranslated ligature
+        char * cname = font8 ? font8->getCharName(code) : nullptr;
+        Unicode ou = cname ? globalParams->mapNameToUnicodeText(cname) : 0;
+        if(cname && !is_illegal_unicode(ou))
+            return ou;
+    }
+    return map_to_private(code);
+}
+
+// Choose the one Unicode value used for code:
+//  - len == 0: nothing to choose from, use map_to_private();
+//  - len == 1: take *u unless is_illegal_unicode() rejects it;
+//  - any other case: defer to unicode_from_font().
+Unicode check_unicode(Unicode * u, int len, CharCode code, GfxFont * font)
+{
+    // *u is only read when exactly one value was supplied
+    const bool single_legal = (len == 1) && !is_illegal_unicode(*u);
+    if(single_legal)
+        return *u;
+
+    return (len == 0) ? map_to_private(code) : unicode_from_font(code, font);
+}
+
+} //namespace pdf2htmlEX
diff --git a/src/util/unicode.h b/src/util/unicode.h
new file mode 100644
index 0000000..2100695
--- /dev/null
+++ b/src/util/unicode.h
@@ -0,0 +1,84 @@
+/*
+ * Unicode manipulation functions
+ *
+ * by WangLu
+ * 2012.11.29
+ */
+
+#ifndef UNICODE_H__
+#define UNICODE_H__
+
+#include <GfxFont.h>
+#include <CharTypes.h>
+
+namespace pdf2htmlEX {
+
+/**
+ * Check whether a unicode character is illegal for the output HTML.
+ * Unlike PDF readers, browsers have special treatment for such characters (normally treating them
+ * as zero-width spaces), regardless of the metrics and glyphs provided by fonts. So these characters
+ * should be mapped to the Unicode private use area to "cheat" browsers, at the cost of losing the
+ * actual Unicode values in the HTML.
+ *
+ * The following chart shows illegal characters in HTML by webkit, mozilla, and pdf2htmlEX (p2h).
+ * pdf2htmlEX's illegal character set is the union of webkit's and mozilla's, plus illegal unicode
+ * characters. "[" and ")" surrounding ranges denote "inclusive" and "exclusive", respectively.
+ *
+ *         00(NUL)--09(\t)--0A(\n)--0D(\r)--20(SP)--7F(DEL)--9F(APC)--A0(NBSP)--AD(SHY)--061C(ALM)--1361(Ethiopic word space)
+ * webkit:   [--------------------------------)        [------------------)       [-]
+ * moz:      [--------------------------------)        [---------]                          [-]
+ * p2h:      [--------------------------------)        [------------------]       [-]       [-]         [-]
+ *
+ *         200B(ZWSP)--200C(ZWNJ)--200D(ZWJ)--200E(LRM)--200F(RLM)--2028(LSEP)--2029(PSEP)--202A(LRE)--202E(RLO)--2066(LRI)--2069(PDI)
+ * webkit:   [-----------------------------------------------]                                 [----------]
+ * moz:      [-]                                  [----------]         [-]         [-]         [----------]         [------------]
+ * p2h:      [-----------------------------------------------]         [-]         [-]         [----------]         [------------]
+ *
+ *         D800(surrogate)--DFFF(surrogate)--FEFF(ZWNBSP)--FFFC(ORC)--FFFE(non-char)--FFFF(non-char)
+ * webkit:                                      [-]           [-]
+ * moz:
+ * p2h:         [------------------]            [-]           [-]          [-----------------]
+ *
+ * Note: 0xA0 (no-break space) affects word-spacing; and if "white-space:pre" is specified,
+ * \n and \r can break line, \t can shift text, so they are considered illegal.
+ *
+ * Resources (retrieved at 2015-03-16)
+ * * webkit
+ *   * Avoid querying the font cache for the zero-width space glyph ( https://bugs.webkit.org/show_bug.cgi?id=90673 )
+ *   * treatAsZeroWidthSpace( https://github.com/WebKit/webkit/blob/17bbff7400393e9389b40cc84ce005f7cc954680/Source/WebCore/platform/graphics/FontCascade.h#L272 )
+ * * mozilla
+ *   * IsInvalidChar( http://mxr.mozilla.org/mozilla-central/source/gfx/thebes/gfxTextRun.cpp#1973 )
+ *   * IsBidiControl( http://mxr.mozilla.org/mozilla-central/source/intl/unicharutil/util/nsBidiUtils.h#114 )
+ * * Character encodings in HTML ( http://en.wikipedia.org/wiki/Character_encodings_in_HTML#HTML_character_references )
+ * * CSS Text Spec ( http://dev.w3.org/csswg/css-text/ )
+ * * unicode table ( http://unicode-table.com )
+ *
+ * TODO Web specs? IE?
+ *
+ */
+inline bool is_illegal_unicode(Unicode c)
+{
+    // tested in ascending code point order, mirroring the p2h rows of the chart above
+    if(c < 0x20 || (0x7F <= c && c <= 0xA0) || c == 0xAD) return true;
+    if(c == 0x061C || c == 0x1361 || (0x200B <= c && c <= 0x200F)) return true;
+    if(c == 0x2028 || c == 0x2029 || (0x202A <= c && c <= 0x202E)) return true;
+    if((0x2066 <= c && c <= 0x2069) || (0xD800 <= c && c <= 0xDFFF)) return true;
+    return c == 0xFEFF || c == 0xFFFC || c == 0xFFFE || c == 0xFFFF;
+}
+
+Unicode map_to_private(CharCode code); // offset code into the private-use ranges (0xE000, then 0xF0000, then 0x100000)
+
+/* Try to determine the Unicode value directly from the information in the font */
+Unicode unicode_from_font (CharCode code, GfxFont * font);
+
+/*
+ * We have to use a single Unicode value to re-encode fonts.
+ * If we got multiple Unicode values, it might be an expanded ligature; try to restore it.
+ * If we cannot figure it out in the end, use a private mapping.
+ */
+Unicode check_unicode(Unicode * u, int len, CharCode code, GfxFont * font);
+
+
+} // namespace pdf2htmlEX
+
+#endif //UNICODE_H__
author	Johannes Schauer <josch@debian.org>	2015-07-27 16:07:02 +0200
committer	Johannes Schauer <josch@debian.org>	2015-07-27 16:07:02 +0200
commit	385b4eca34c290f112d90e74925ba1963a4e0a94 (patch)
tree	5b23566049318adbdd0d26c82735fa9b4072aae5 /src