diff options
author | Johannes Schauer <josch@debian.org> | 2015-07-27 16:07:02 +0200 |
---|---|---|
committer | Johannes Schauer <josch@debian.org> | 2015-07-27 16:07:02 +0200 |
commit | 385b4eca34c290f112d90e74925ba1963a4e0a94 (patch) | |
tree | 5b23566049318adbdd0d26c82735fa9b4072aae5 /src |
Import pdf2htmlex_0.14.6+ds.orig.tar.gz
[dgit import orig pdf2htmlex_0.14.6+ds.orig.tar.gz]
Diffstat (limited to 'src')
60 files changed, 9571 insertions, 0 deletions
diff --git a/src/ArgParser.cc b/src/ArgParser.cc new file mode 100644 index 0000000..19dcf32 --- /dev/null +++ b/src/ArgParser.cc @@ -0,0 +1,176 @@ +/* + * A wrapper of getopt + * + * by WangLu + * 2012.09.10 + */ + +#include <iostream> +#include <unordered_map> +#include <cassert> + +#include <getopt.h> + +#include "ArgParser.h" + +namespace pdf2htmlEX { + +using std::ostream; +using std::cerr; +using std::endl; +using std::string; +using std::vector; +using std::unordered_map; +using std::make_pair; +using std::ostringstream; + +bool read_value(const char * arg, char * location) +{ + *location = arg[0]; + return (arg[1] == 0); +} + +bool read_value(const char * arg, std::string * location) +{ + *location = std::string(arg); + return true; +} + +void dump_value(std::ostream & out, const std::string & v) +{ + out << '"' << v << '"'; +} + +ArgParser & ArgParser::add(const char * optname, const char * description, ArgParserCallBack callback, bool need_arg) +{ + // ArgEntry does not accept nullptr as optname nor description + if((!optname) || (!optname[0])) + { + // when optname is nullptr or "", it's optional, and description is dropped + optional_arg_entries.emplace_back(new ArgEntry<string, string>("", "", callback, need_arg)); + } + else + { + arg_entries.emplace_back(new ArgEntry<string, string>(optname, (description ? 
description : ""), callback, need_arg)); + } + + return *this; +} + +void ArgParser::parse(int argc, char ** argv) const +{ + //prepare optstring and longopts + vector<char> optstring; + optstring.reserve(2*arg_entries.size() + 1); + vector<struct option> longopts; + longopts.reserve(arg_entries.size() + 1); + + unordered_map<int, const ArgEntryBase*> opt_map; + + for(auto iter = arg_entries.begin(); iter != arg_entries.end(); ++iter) + { + const auto * p = iter->get(); + if(p->shortname != 0) + { + optstring.push_back(p->shortname); + if(p->need_arg) + optstring.push_back(':'); + + int v = p->shortname; + if(!(opt_map.insert(make_pair(v, p)).second)) + { + cerr << "Warning: duplicated shortname: " << v << endl; + } + } + + if(p->name != "") + { + int v = (256 + (iter - arg_entries.begin())); + longopts.resize(longopts.size() + 1); + { + auto & cur = longopts.back(); + cur.name = p->name.c_str(); + cur.has_arg = ((p->need_arg) ? required_argument : no_argument); + cur.flag = nullptr; + cur.val = v; + } + if(!(opt_map.insert(make_pair(v, p)).second)) + { + cerr << "Warning: duplicated long name: " << (p->name) << endl; + } + } + } + + optstring.push_back(0); + longopts.resize(longopts.size() + 1); + { + auto & cur = longopts.back(); + cur.name = 0; + cur.has_arg = 0; + cur.flag = 0; + cur.val = 0; + } + + { + opterr = 1; + int r; + int idx; + while(true) + { + r = getopt_long(argc, argv, &optstring.front(), &longopts.front(), &idx); + if(r == -1) + break; + assert(r != ':'); + if(r == '?') + { + throw ""; + } + + auto iter = opt_map.find(r); + assert(iter != opt_map.end()); + iter->second->parse(optarg); + } + } + + { + auto iter = optional_arg_entries.begin(); + while((optind < argc) && (iter != optional_arg_entries.end())) + { + (*(iter++))->parse(argv[optind++]); + } + } +} + +void ArgParser::show_usage(ostream & out) const +{ + for(auto & entry : arg_entries) + { + entry->show_usage(out); + } +} + +template<> const char * ArgParser::get_type_name<int> (void) { 
return "int"; } +template<> const char * ArgParser::get_type_name<double> (void) { return "fp"; } +template<> const char * ArgParser::get_type_name<string> (void) { return "string"; } + +ArgParser::ArgEntryBase::ArgEntryBase(const char * name, const char * description, bool need_arg) + : shortname(0), name(name), description(description), need_arg(need_arg) +{ + size_t idx = this->name.rfind(','); + if(idx != string::npos) + { + if(idx+2 == this->name.size()) + { + shortname = this->name[this->name.size()-1]; + this->name = this->name.substr(0, idx); + } + else + { + cerr << "Warning: argument '" << this->name << "' cannot be parsed as a short option" << endl; + } + } +} + +const int ArgParser::arg_col_width = 31; + +} // namespace pdf2htmlEX diff --git a/src/ArgParser.h b/src/ArgParser.h new file mode 100644 index 0000000..c0f8cde --- /dev/null +++ b/src/ArgParser.h @@ -0,0 +1,219 @@ +/* + * A wrapper of getopt + * + * by WangLu + * 2012.09.10 + */ + + +#ifndef ARGPARSER_H__ +#define ARGPARSER_H__ + +#include <string> +#include <vector> +#include <ostream> +#include <sstream> +#include <memory> + +#ifndef nullptr +#define nullptr (NULL) +#endif + +namespace pdf2htmlEX { + +//helper +template<class T> +bool read_value(const char * arg, T * location) +{ + std::istringstream sin(arg); + return ((sin >> (*location)) && (sin.eof())); +} + +extern bool read_value(const char * arg, char * location); +extern bool read_value(const char * arg, std::string * location); + +template<class T> +void dump_value(std::ostream & out, const T & v) +{ + out << v; +} + +extern void dump_value(std::ostream & out, const std::string & v); + +class ArgParser +{ +public: + typedef void (*ArgParserCallBack) (const char * arg); + + /* + * The 1st is for arguments with callbacks(i.e. flags) + * The 2nd is for arguments linked to variables + * + * optname: + * - if not nullptr, it should be the name of the arg, should be in the format of "<long name>[,<short char>]", e.g. 
"help,h" + * - if nullptr, it denotes an optional arg, and description will be ignored + * description: + * - if description is nullptr or "", the argument won't be shown in show_usage() + * + * location: + * - if not nullptr, the argument for this arg is stored there + * - if nullptr, this arg does not need arguments + */ + ArgParser & add(const char * optname, const char * description, ArgParserCallBack callback, bool need_arg = false); + template <class T, class Tv> + ArgParser & add(const char * optname, T * location, const Tv & default_value, const char * description, bool dont_show_default = false); + + void parse(int argc, char ** argv) const; + void show_usage(std::ostream & out) const; + +private: + // type names helper + template<class> + static const char * get_type_name(void) { return "unknown"; } + + struct ArgEntryBase + { + /* name or description cannot be nullptr */ + ArgEntryBase(const char * name, const char * description, bool need_arg); + virtual ~ArgEntryBase() { } + char shortname; + std::string name; + std::string description; + bool need_arg; + virtual void parse (const char * arg) const = 0; + virtual void show_usage (std::ostream & out) const = 0; + }; + + template <class T, class Tv> + struct ArgEntry : public ArgEntryBase + { + ArgEntry(const char * name, + const char * description, + ArgParserCallBack callback, + bool need_arg); + + ArgEntry(const char * name, + T * location, const Tv & default_value, + const char * description, bool dont_show_default); + + virtual void parse (const char * arg) const; + virtual void show_usage (std::ostream & out) const; + + private: + T * location; + T default_value; + ArgParserCallBack callback; + bool dont_show_default; + }; + + std::vector<std::unique_ptr<ArgEntryBase>> arg_entries, optional_arg_entries; + static const int arg_col_width; +}; + +template<class T, class Tv> +ArgParser & ArgParser::add(const char * optname, T * location, const Tv & default_value, const char * description, bool 
dont_show_default) +{ + // ArgEntry does not accept nullptr as optname nor description + if((!optname) || (!optname[0])) + { + // when optname is nullptr or "", it's optional, and description is dropped + optional_arg_entries.emplace_back(new ArgEntry<T, Tv>("", location, default_value, "", dont_show_default)); + } + else + { + arg_entries.emplace_back(new ArgEntry<T, Tv>(optname, location, default_value, (description ? description : ""), dont_show_default)); + } + + return *this; +} + +// Known types +template<> const char * ArgParser::get_type_name<int> (void); +template<> const char * ArgParser::get_type_name<double> (void); +template<> const char * ArgParser::get_type_name<std::string> (void); + +template<class T, class Tv> +ArgParser::ArgEntry<T, Tv>::ArgEntry(const char * name, const char * description, ArgParserCallBack callback, bool need_arg) + : ArgEntryBase(name, description, need_arg) + , location(nullptr) + , default_value() + , callback(callback) + , dont_show_default(true) +{ +} + +template<class T, class Tv> +ArgParser::ArgEntry<T, Tv>::ArgEntry(const char * name, T * location, const Tv & default_value, const char * description, bool dont_show_default) + : ArgEntryBase(name, description, (location != nullptr)) + , location(location) + , default_value(default_value) + , callback(nullptr) + , dont_show_default(dont_show_default) +{ + if(need_arg) + *location = T(default_value); +} + +template<class T, class Tv> +void ArgParser::ArgEntry<T, Tv>::parse(const char * arg) const +{ + if(need_arg) + { + if(!arg) + throw std::string("Missing argument of option: --") + name; + + if((location != nullptr) && (!read_value(arg, location))) + throw std::string("Invalid argument: ") + arg; + } + + if(callback) + (*callback)(arg); +} + +template<class T, class Tv> +void ArgParser::ArgEntry<T, Tv>::show_usage(std::ostream & out) const +{ + if(description.empty()) + return; + + std::ostringstream sout; + sout << " "; + + if(shortname != 0) + { + sout << "-" << 
shortname; + } + + if(name != "") + { + if(shortname != 0) + sout << ","; + sout << "--" << name; + } + + if(need_arg) + { + sout << " <" << get_type_name<T>() << ">"; + } + + std::string s = sout.str(); + out << s; + + for(int i = s.size(); i < arg_col_width; ++i) + out << ' '; + + out << " " << description; + + if(need_arg && !dont_show_default) + { + out << " (default: "; + dump_value(out, default_value); + out << ")"; + } + + out << std::endl; +} + +} // namespace ArgParser + +#endif //ARGPARSER_H__ diff --git a/src/BackgroundRenderer/BackgroundRenderer.cc b/src/BackgroundRenderer/BackgroundRenderer.cc new file mode 100644 index 0000000..dbd7137 --- /dev/null +++ b/src/BackgroundRenderer/BackgroundRenderer.cc @@ -0,0 +1,130 @@ +/* + * Background renderer + * Render all those things not supported as Image + * + * Copyright (C) 2013 Lu Wang <coolwanglu@gmail.com> + */ + +#include <poppler-config.h> + +#include "HTMLRenderer/HTMLRenderer.h" +#include "Param.h" + +#include "BackgroundRenderer.h" +#include "SplashBackgroundRenderer.h" +#if ENABLE_SVG +#include "CairoBackgroundRenderer.h" +#endif + +namespace pdf2htmlEX { + +std::unique_ptr<BackgroundRenderer> BackgroundRenderer::getBackgroundRenderer(const std::string & format, HTMLRenderer * html_renderer, const Param & param) +{ +#ifdef ENABLE_LIBPNG + if(format == "png") + { + return std::unique_ptr<BackgroundRenderer>(new SplashBackgroundRenderer(format, html_renderer, param)); + } +#endif +#ifdef ENABLE_LIBJPEG + if(format == "jpg") + { + return std::unique_ptr<BackgroundRenderer>(new SplashBackgroundRenderer(format, html_renderer, param)); + } +#endif +#if ENABLE_SVG + if (format == "svg") + { + return std::unique_ptr<BackgroundRenderer>(new CairoBackgroundRenderer(html_renderer, param)); + } +#endif + + return nullptr; +} + +std::unique_ptr<BackgroundRenderer> BackgroundRenderer::getFallbackBackgroundRenderer(HTMLRenderer * html_renderer, const Param & param) +{ + if (param.bg_format == "svg" && 
param.svg_node_count_limit >= 0) + return std::unique_ptr<BackgroundRenderer>(new SplashBackgroundRenderer("", html_renderer, param)); + return nullptr; +} + +void BackgroundRenderer::proof_begin_text_object(GfxState *state, OutputDev * dev) +{ + if (!proof_state) + { + PDFRectangle rect(0, 0, state->getPageWidth(), state->getPageHeight()); + proof_state.reset(new GfxState(state->getHDPI(), state->getVDPI(), &rect, state->getRotate(), dev->upsideDown())); + proof_state->setFillColorSpace(new GfxDeviceRGBColorSpace()); + proof_state->setStrokeColorSpace(new GfxDeviceRGBColorSpace()); + } + + // Save original render mode in proof_state, and restore in proof_end_text_object() + // This is due to poppler's OutputDev::updateRender() actually has no effect, we have to + // modify state directly, see proof_begin_string(). + proof_state->setRender(state->getRender()); +} + +void BackgroundRenderer::proof_begin_string(GfxState *state, OutputDev * dev) +{ + int render = proof_state->getRender(); + if (render == 3) // hidden + return; + + double lx = state->getFontSize() / 70, ly = lx; + tm_transform(state->getTextMat(), lx, ly, true); + proof_state->setLineWidth(sqrt(lx * lx + ly * ly)); + + static const Color red(1, 0, 0), green(0, 1, 0), blue(0, 0, 1), yellow(1, 1, 0), white(1, 1, 1); + Color fc, sc; + const Color *pfc, *psc; + state->getFillRGB(&fc.rgb); + state->getStrokeRGB(&sc.rgb); + + if (render == 0 || render == 2) //has fill + pfc = fc.distance(red) > 0.4 ? &red : &green; + else + pfc = &red; + + if (render == 1 || render == 2) // has stroke + psc = sc.distance(blue) > 0.4 ? 
&blue : &yellow; + else if(render == 0) // fill only + psc = &white; + else + psc = &blue; + + GfxColor gfc, gsc; + pfc->get_gfx_color(gfc); + psc->get_gfx_color(gsc); + proof_state->setFillColor(&gfc); + proof_state->setStrokeColor(&gsc); + + if (state->getFillColorSpace()->getMode() != csDeviceRGB) + dev->updateFillColorSpace(proof_state.get()); + if (state->getStrokeColorSpace()->getMode() != csDeviceRGB) + dev->updateStrokeColorSpace(proof_state.get()); + + dev->updateLineWidth(proof_state.get()); + dev->updateFillColor(proof_state.get()); + dev->updateStrokeColor(proof_state.get()); + + state->setRender(2); // fill & stroke +} + +void BackgroundRenderer::proof_end_text_object(GfxState *state, OutputDev * dev) +{ + state->setRender(proof_state->getRender()); + dev->updateLineWidth(state); + dev->updateFillColorSpace(state); + dev->updateStrokeColorSpace(state); + dev->updateFillColor(state); + dev->updateStrokeColor(state); +} + +void BackgroundRenderer::proof_update_render(GfxState *state, OutputDev * dev) +{ + // Save render mode in proof_state in cases it is changed inside a text object + proof_state->setRender(state->getRender()); +} + +} // namespace pdf2htmlEX diff --git a/src/BackgroundRenderer/BackgroundRenderer.h b/src/BackgroundRenderer/BackgroundRenderer.h new file mode 100644 index 0000000..2927484 --- /dev/null +++ b/src/BackgroundRenderer/BackgroundRenderer.h @@ -0,0 +1,52 @@ +/* + * Background renderer + * Render all those things not supported as Image + * + * Copyright (C) 2012,2013 Lu Wang <coolwanglu@gmail.com> + */ + + +#ifndef BACKGROUND_RENDERER_H__ +#define BACKGROUND_RENDERER_H__ + +#include <string> +#include <memory> + +class PDFDoc; +class GfxState; +class OutputDev; + +namespace pdf2htmlEX { + +class Param; +class HTMLRenderer; +class BackgroundRenderer +{ +public: + // return nullptr upon failure + static std::unique_ptr<BackgroundRenderer> getBackgroundRenderer(const std::string & format, HTMLRenderer * html_renderer, const Param & 
param); + // Return a fallback bg renderer according to param.bg_format. + // Currently only svg bg format might need a bitmap fallback. + static std::unique_ptr<BackgroundRenderer> getFallbackBackgroundRenderer(HTMLRenderer * html_renderer, const Param & param); + + BackgroundRenderer() {} + virtual ~BackgroundRenderer() {} + + virtual void init(PDFDoc * doc) = 0; + //return true on success, false otherwise (e.g. need a fallback) + virtual bool render_page(PDFDoc * doc, int pageno) = 0; + virtual void embed_image(int pageno) = 0; + + // for proof output +protected: + void proof_begin_text_object(GfxState * state, OutputDev * dev); + void proof_begin_string(GfxState * state, OutputDev * dev); + void proof_end_text_object(GfxState * state, OutputDev * dev); + void proof_update_render(GfxState * state, OutputDev * dev); +private: + std::unique_ptr<GfxState> proof_state; +}; + +} // namespace pdf2htmlEX + +#endif //BACKGROUND_RENDERER_H__ diff --git a/src/BackgroundRenderer/CairoBackgroundRenderer.cc b/src/BackgroundRenderer/CairoBackgroundRenderer.cc new file mode 100644 index 0000000..1ce6eac --- /dev/null +++ b/src/BackgroundRenderer/CairoBackgroundRenderer.cc @@ -0,0 +1,311 @@ +/* + * CairoBackgroundRenderer.cc + * + * Copyright (C) 2012,2013 Lu Wang <coolwanglu@gmail.com> + */ + +#include <string> +#include <fstream> + + +#include "pdf2htmlEX-config.h" + +#include "Base64Stream.h" + +#if ENABLE_SVG + +#include "CairoBackgroundRenderer.h" +#include "SplashBackgroundRenderer.h" + +namespace pdf2htmlEX { + +using std::string; +using std::ifstream; +using std::ofstream; +using std::vector; +using std::unordered_map; + +CairoBackgroundRenderer::CairoBackgroundRenderer(HTMLRenderer * html_renderer, const Param & param) + : CairoOutputDev() + , html_renderer(html_renderer) + , param(param) + , surface(nullptr) +{ } + +CairoBackgroundRenderer::~CairoBackgroundRenderer() +{ + for(auto const& p : bitmaps_ref_count) + { + if (p.second == 0) + { + 
html_renderer->tmp_files.add(this->build_bitmap_path(p.first)); + } + } +} + +void CairoBackgroundRenderer::drawChar(GfxState *state, double x, double y, + double dx, double dy, + double originX, double originY, + CharCode code, int nBytes, Unicode *u, int uLen) +{ + // draw characters as image when + // - in fallback mode + // - OR there is special filling method + // - OR using a writing mode font + // - OR using a Type 3 font while param.process_type3 is not enabled + // - OR the text is used as path + if((param.fallback || param.proof) + || ( (state->getFont()) + && ( (state->getFont()->getWMode()) + || ((state->getFont()->getType() == fontType3) && (!param.process_type3)) + || (state->getRender() >= 4) + ) + ) + ) + { + CairoOutputDev::drawChar(state,x,y,dx,dy,originX,originY,code,nBytes,u,uLen); + } + // If a char is treated as image, it is not subject to cover test + // (see HTMLRenderer::drawString), so don't increase drawn_char_count. + else if (param.correct_text_visibility) { + if (html_renderer->is_char_covered(drawn_char_count)) + CairoOutputDev::drawChar(state,x,y,dx,dy,originX,originY,code,nBytes,u,uLen); + drawn_char_count++; + } +} + +void CairoBackgroundRenderer::beginTextObject(GfxState *state) +{ + if (param.proof == 2) + proof_begin_text_object(state, this); + CairoOutputDev::beginTextObject(state); +} + +void CairoBackgroundRenderer::beginString(GfxState *state, GooString * str) +{ + if (param.proof == 2) + proof_begin_string(state, this); + CairoOutputDev::beginString(state, str); +} + +void CairoBackgroundRenderer::endTextObject(GfxState *state) +{ + if (param.proof == 2) + proof_end_text_object(state, this); + CairoOutputDev::endTextObject(state); +} + +void CairoBackgroundRenderer::updateRender(GfxState *state) +{ + if (param.proof == 2) + proof_update_render(state, this); + CairoOutputDev::updateRender(state); +} + +void CairoBackgroundRenderer::init(PDFDoc * doc) +{ + startDoc(doc); +} + +static GBool annot_cb(Annot *, void * pflag) { + 
return (*((bool*)pflag)) ? gTrue : gFalse; +}; + +bool CairoBackgroundRenderer::render_page(PDFDoc * doc, int pageno) +{ + drawn_char_count = 0; + double page_width; + double page_height; + if(param.use_cropbox) + { + page_width = doc->getPageCropWidth(pageno); + page_height = doc->getPageCropHeight(pageno); + } + else + { + page_width = doc->getPageMediaWidth(pageno); + page_height = doc->getPageMediaHeight(pageno); + } + + if (doc->getPageRotate(pageno) == 90 || doc->getPageRotate(pageno) == 270) + std::swap(page_height, page_width); + + string fn = (char*)html_renderer->str_fmt("%s/bg%x.svg", (param.embed_image ? param.tmp_dir : param.dest_dir).c_str(), pageno); + if(param.embed_image) + html_renderer->tmp_files.add(fn); + + surface = cairo_svg_surface_create(fn.c_str(), page_width * param.h_dpi / DEFAULT_DPI, page_height * param.v_dpi / DEFAULT_DPI); + cairo_svg_surface_restrict_to_version(surface, CAIRO_SVG_VERSION_1_2); + cairo_surface_set_fallback_resolution(surface, param.h_dpi, param.v_dpi); + + cairo_t * cr = cairo_create(surface); + setCairo(cr); + + bitmaps_in_current_page.clear(); + + bool process_annotation = param.process_annotation; + doc->displayPage(this, pageno, param.h_dpi, param.v_dpi, + 0, + (!(param.use_cropbox)), + false, + false, + nullptr, nullptr, &annot_cb, &process_annotation); + + setCairo(nullptr); + + { + auto status = cairo_status(cr); + cairo_destroy(cr); + if(status) + throw string("Cairo error: ") + cairo_status_to_string(status); + } + + cairo_surface_finish(surface); + { + auto status = cairo_surface_status(surface); + cairo_surface_destroy(surface); + surface = nullptr; + if(status) + throw string("Error in cairo: ") + cairo_status_to_string(status); + } + + //check node count in the svg file, fall back to bitmap_renderer if necessary. + if (param.svg_node_count_limit >= 0) + { + int n = 0; + char c; + ifstream svgfile(fn); + //count of '<' in the file should be an approximation of node count. 
+ while(svgfile >> c) + { + if (c == '<') + ++n; + if (n > param.svg_node_count_limit) + { + html_renderer->tmp_files.add(fn); + return false; + } + } + } + + // the svg file is actually used, so add its bitmaps' ref count. + for (auto id : bitmaps_in_current_page) + ++bitmaps_ref_count[id]; + + return true; +} + +void CairoBackgroundRenderer::embed_image(int pageno) +{ + auto & f_page = *(html_renderer->f_curpage); + + // SVGs introduced by <img> or background-image can't have external resources; + // SVGs introduced by <embed> and <object> can, but they are more expensive for browsers. + // So we use <img> if the SVG contains no external bitmaps, and use <embed> otherwise. + // See also: + // https://developer.mozilla.org/en-US/docs/Web/SVG/SVG_as_an_Image + // http://stackoverflow.com/questions/4476526/do-i-use-img-object-or-embed-for-svg-files + + if (param.svg_embed_bitmap || bitmaps_in_current_page.empty()) + f_page << "<img"; + else + f_page << "<embed"; + + f_page << " class=\"" << CSS::FULL_BACKGROUND_IMAGE_CN + << "\" alt=\"\" src=\""; + + if(param.embed_image) + { + auto path = html_renderer->str_fmt("%s/bg%x.svg", param.tmp_dir.c_str(), pageno); + ifstream fin((char*)path, ifstream::binary); + if(!fin) + throw string("Cannot read background image ") + (char*)path; + f_page << "data:image/svg+xml;base64," << Base64Stream(fin); + } + else + { + f_page << (char*)html_renderer->str_fmt("bg%x.svg", pageno); + } + f_page << "\"/>"; +} + +string CairoBackgroundRenderer::build_bitmap_path(int id) +{ + // "o" for "PDF Object" + return string(html_renderer->str_fmt("%s/o%d.jpg", param.dest_dir.c_str(), id)); +} +// Override CairoOutputDev::setMimeData() and dump bitmaps in SVG to external files. +void CairoBackgroundRenderer::setMimeData(Stream *str, Object *ref, cairo_surface_t *image) +{ + if (param.svg_embed_bitmap) + { + CairoOutputDev::setMimeData(str, ref, image); + return; + } + + // TODO dump bitmaps in other formats. 
+ if (str->getKind() != strDCT) + return; + + // TODO inline image? + if (ref == nullptr || !ref->isRef()) + return; + + // We only dump rgb or gray jpeg without /Decode array. + // + // Although jpeg support CMYK, PDF readers do color conversion incompatibly with most other + // programs (including browsers): other programs invert CMYK color if 'Adobe' marker (app14) presents + // in a jpeg file; while PDF readers don't, they solely rely on /Decode array to invert color. + // It's a bit complicated to decide whether a CMYK jpeg is safe to dump, so we don't dump at all. + // See also: + // JPEG file embedded in PDF (CMYK) https://forums.adobe.com/thread/975777 + // http://stackoverflow.com/questions/3123574/how-to-convert-from-cmyk-to-rgb-in-java-correctly + // + // In PDF, jpeg stream objects can also specify other color spaces like DeviceN and Separation, + // It is also not safe to dump them directly. + Object obj; + str->getDict()->lookup("ColorSpace", &obj); + if (!obj.isName() || (strcmp(obj.getName(), "DeviceRGB") && strcmp(obj.getName(), "DeviceGray")) ) + { + obj.free(); + return; + } + obj.free(); + str->getDict()->lookup("Decode", &obj); + if (obj.isArray()) + { + obj.free(); + return; + } + obj.free(); + + int imgId = ref->getRef().num; + auto uri = strdup((char*) html_renderer->str_fmt("o%d.jpg", imgId)); + auto st = cairo_surface_set_mime_data(image, CAIRO_MIME_TYPE_URI, + (unsigned char*) uri, strlen(uri), free, uri); + if (st) + { + free(uri); + return; + } + bitmaps_in_current_page.push_back(imgId); + + if(bitmaps_ref_count.find(imgId) != bitmaps_ref_count.end()) + return; + + bitmaps_ref_count[imgId] = 0; + + char *strBuffer; + int len; + if (getStreamData(str->getNextStream(), &strBuffer, &len)) + { + ofstream imgfile(build_bitmap_path(imgId), ofstream::binary); + imgfile.write(strBuffer, len); + free(strBuffer); + } +} + +} // namespace pdf2htmlEX + +#endif // ENABLE_SVG + diff --git a/src/BackgroundRenderer/CairoBackgroundRenderer.h 
b/src/BackgroundRenderer/CairoBackgroundRenderer.h new file mode 100644 index 0000000..4ed9c86 --- /dev/null +++ b/src/BackgroundRenderer/CairoBackgroundRenderer.h @@ -0,0 +1,75 @@ +/* + * Cairo Background renderer + * Render all those things not supported as Image, with Cairo + * + * Copyright (C) 2012,2013 Lu Wang <coolwanglu@gmail.com> + */ + + +#ifndef CAIRO_BACKGROUND_RENDERER_H__ +#define CAIRO_BACKGROUND_RENDERER_H__ + +#include <CairoOutputDev.h> +#include <cairo.h> +#include <cairo-svg.h> +#include <unordered_map> +#include <vector> +#include <string> + +#include "pdf2htmlEX-config.h" + +#include "Param.h" +#include "HTMLRenderer/HTMLRenderer.h" + +namespace pdf2htmlEX { + +// Based on BackgroundRenderer from poppler +class CairoBackgroundRenderer : public BackgroundRenderer, CairoOutputDev +{ +public: + CairoBackgroundRenderer(HTMLRenderer * html_renderer, const Param & param); + + virtual ~CairoBackgroundRenderer(); + + virtual void init(PDFDoc * doc); + virtual bool render_page(PDFDoc * doc, int pageno); + virtual void embed_image(int pageno); + + // Does this device use beginType3Char/endType3Char? Otherwise, + // text in Type 3 fonts will be drawn with drawChar/drawString. + virtual GBool interpretType3Chars() { return !param.process_type3; } + + virtual void drawChar(GfxState *state, double x, double y, + double dx, double dy, + double originX, double originY, + CharCode code, int nBytes, Unicode *u, int uLen); + + //for proof + void beginTextObject(GfxState *state); + void beginString(GfxState *state, GooString * str); + void endTextObject(GfxState *state); + void updateRender(GfxState *state); + +protected: + virtual void setMimeData(Stream *str, Object *ref, cairo_surface_t *image); + +protected: + HTMLRenderer * html_renderer; + const Param & param; + cairo_surface_t * surface; + +private: + // convert bitmap stream id to bitmap file name. No pageno prefix, + // because a bitmap may be shared by multiple pages. 
+ std::string build_bitmap_path(int id); + // map<id_of_bitmap_stream, usage_count_in_all_svgs> + // note: if a svg bg fallbacks to bitmap bg, its bitmaps are not taken into account. + std::unordered_map<int, int> bitmaps_ref_count; + // id of bitmaps' stream used by current page + std::vector<int> bitmaps_in_current_page; + int drawn_char_count; +}; + +} + +#endif //CAIRO_BACKGROUND_RENDERER_H__ diff --git a/src/BackgroundRenderer/SplashBackgroundRenderer.cc b/src/BackgroundRenderer/SplashBackgroundRenderer.cc new file mode 100644 index 0000000..55b5322 --- /dev/null +++ b/src/BackgroundRenderer/SplashBackgroundRenderer.cc @@ -0,0 +1,261 @@ +/* + * SplashBackgroundRenderer.cc + * + * Copyright (C) 2012,2013 Lu Wang <coolwanglu@gmail.com> + */ + +#include <fstream> +#include <vector> +#include <memory> + +#include <poppler-config.h> +#include <PDFDoc.h> +#include <goo/ImgWriter.h> +#include <goo/PNGWriter.h> +#include <goo/JpegWriter.h> + +#include "Base64Stream.h" +#include "util/const.h" + +#include "SplashBackgroundRenderer.h" + +namespace pdf2htmlEX { + +using std::string; +using std::ifstream; +using std::vector; +using std::unique_ptr; + +const SplashColor SplashBackgroundRenderer::white = {255,255,255}; + +SplashBackgroundRenderer::SplashBackgroundRenderer(const string & imgFormat, HTMLRenderer * html_renderer, const Param & param) + : SplashOutputDev(splashModeRGB8, 4, gFalse, (SplashColorPtr)(&white)) + , html_renderer(html_renderer) + , param(param) + , format(imgFormat) +{ + bool supported = false; +#ifdef ENABLE_LIBPNG + if (format.empty()) + format = "png"; + supported = supported || format == "png"; +#endif +#ifdef ENABLE_LIBJPEG + if (format.empty()) + format = "jpg"; + supported = supported || format == "jpg"; +#endif + if (!supported) + { + throw string("Image format not supported: ") + format; + } +} + +/* + * SplashOutputDev::startPage would paint the whole page with the background color + * And thus have modified region set to the whole page 
area + * We do not want that. + */ +void SplashBackgroundRenderer::startPage(int pageNum, GfxState *state, XRef *xrefA) +{ + SplashOutputDev::startPage(pageNum, state, xrefA); + clearModRegion(); +} + +void SplashBackgroundRenderer::drawChar(GfxState *state, double x, double y, + double dx, double dy, + double originX, double originY, + CharCode code, int nBytes, Unicode *u, int uLen) +{ + // draw characters as image when + // - in fallback mode + // - OR there is special filling method + // - OR using a writing mode font + // - OR using a Type 3 font while param.process_type3 is not enabled + // - OR the text is used as path + if((param.fallback || param.proof) + || ( (state->getFont()) + && ( (state->getFont()->getWMode()) + || ((state->getFont()->getType() == fontType3) && (!param.process_type3)) + || (state->getRender() >= 4) + ) + ) + ) + { + SplashOutputDev::drawChar(state,x,y,dx,dy,originX,originY,code,nBytes,u,uLen); + } + // If a char is treated as image, it is not subject to cover test + // (see HTMLRenderer::drawString), so don't increase drawn_char_count. 
+ else if (param.correct_text_visibility) { + if (html_renderer->is_char_covered(drawn_char_count)) + SplashOutputDev::drawChar(state,x,y,dx,dy,originX,originY,code,nBytes,u,uLen); + drawn_char_count++; + } +} + +void SplashBackgroundRenderer::beginTextObject(GfxState *state) +{ + if (param.proof == 2) + proof_begin_text_object(state, this); + SplashOutputDev::beginTextObject(state); +} + +void SplashBackgroundRenderer::beginString(GfxState *state, GooString * str) +{ + if (param.proof == 2) + proof_begin_string(state, this); + SplashOutputDev::beginString(state, str); +} + +void SplashBackgroundRenderer::endTextObject(GfxState *state) +{ + if (param.proof == 2) + proof_end_text_object(state, this); + SplashOutputDev::endTextObject(state); +} + +void SplashBackgroundRenderer::updateRender(GfxState *state) +{ + if (param.proof == 2) + proof_update_render(state, this); + SplashOutputDev::updateRender(state); +} + +void SplashBackgroundRenderer::init(PDFDoc * doc) +{ + startDoc(doc); +} + +static GBool annot_cb(Annot *, void * pflag) { + return (*((bool*)pflag)) ? gTrue : gFalse; +}; + +bool SplashBackgroundRenderer::render_page(PDFDoc * doc, int pageno) +{ + drawn_char_count = 0; + bool process_annotation = param.process_annotation; + doc->displayPage(this, pageno, param.h_dpi, param.v_dpi, + 0, + (!(param.use_cropbox)), + false, false, + nullptr, nullptr, &annot_cb, &process_annotation); + return true; +} + +void SplashBackgroundRenderer::embed_image(int pageno) +{ + // xmin->xmax is top->bottom + int xmin, xmax, ymin, ymax; + getModRegion(&xmin, &ymin, &xmax, &ymax); + + // dump the background image only when it is not empty + if((xmin <= xmax) && (ymin <= ymax)) + { + { + auto fn = html_renderer->str_fmt("%s/bg%x.%s", (param.embed_image ? 
param.tmp_dir : param.dest_dir).c_str(), pageno, format.c_str()); + if(param.embed_image) + html_renderer->tmp_files.add((char*)fn); + + dump_image((char*)fn, xmin, ymin, xmax, ymax); + } + + double h_scale = html_renderer->text_zoom_factor() * DEFAULT_DPI / param.h_dpi; + double v_scale = html_renderer->text_zoom_factor() * DEFAULT_DPI / param.v_dpi; + + auto & f_page = *(html_renderer->f_curpage); + auto & all_manager = html_renderer->all_manager; + + f_page << "<img class=\"" << CSS::BACKGROUND_IMAGE_CN + << " " << CSS::LEFT_CN << all_manager.left.install(((double)xmin) * h_scale) + << " " << CSS::BOTTOM_CN << all_manager.bottom.install(((double)getBitmapHeight() - 1 - ymax) * v_scale) + << " " << CSS::WIDTH_CN << all_manager.width.install(((double)(xmax - xmin + 1)) * h_scale) + << " " << CSS::HEIGHT_CN << all_manager.height.install(((double)(ymax - ymin + 1)) * v_scale) + << "\" alt=\"\" src=\""; + + if(param.embed_image) + { + auto path = html_renderer->str_fmt("%s/bg%x.%s", param.tmp_dir.c_str(), pageno, format.c_str()); + ifstream fin((char*)path, ifstream::binary); + if(!fin) + throw string("Cannot read background image ") + (char*)path; + + auto iter = FORMAT_MIME_TYPE_MAP.find(format); + if(iter == FORMAT_MIME_TYPE_MAP.end()) + throw string("Image format not supported: ") + format; + + string mime_type = iter->second; + f_page << "data:" << mime_type << ";base64," << Base64Stream(fin); + } + else + { + f_page << (char*)html_renderer->str_fmt("bg%x.%s", pageno, format.c_str()); + } + f_page << "\"/>"; + } +} + +// There might be mem leak when exception is thrown ! 
+void SplashBackgroundRenderer::dump_image(const char * filename, int x1, int y1, int x2, int y2) +{ + int width = x2 - x1 + 1; + int height = y2 - y1 + 1; + if((width <= 0) || (height <= 0)) + throw "Bad metric for background image"; + + FILE * f = fopen(filename, "wb"); + if(!f) + throw string("Cannot open file for background image " ) + filename; + + // use unique_ptr to auto delete the object upon exception + unique_ptr<ImgWriter> writer; + + if(false) { } +#ifdef ENABLE_LIBPNG + else if(format == "png") + { + writer = unique_ptr<ImgWriter>(new PNGWriter); + } +#endif +#ifdef ENABLE_LIBJPEG + else if(format == "jpg") + { + writer = unique_ptr<ImgWriter>(new JpegWriter); + } +#endif + else + { + throw string("Image format not supported: ") + format; + } + + if(!writer->init(f, width, height, param.h_dpi, param.v_dpi)) + throw "Cannot initialize image writer"; + + auto * bitmap = getBitmap(); + assert(bitmap->getMode() == splashModeRGB8); + + SplashColorPtr data = bitmap->getDataPtr(); + int row_size = bitmap->getRowSize(); + + vector<unsigned char*> pointers; + pointers.reserve(height); + SplashColorPtr p = data + y1 * row_size + x1 * 3; + for(int i = 0; i < height; ++i) + { + pointers.push_back(p); + p += row_size; + } + + if(!writer->writePointers(pointers.data(), height)) + { + throw "Cannot write background image"; + } + + if(!writer->close()) + { + throw "Cannot finish background image"; + } + + fclose(f); +} + +} // namespace pdf2htmlEX diff --git a/src/BackgroundRenderer/SplashBackgroundRenderer.h b/src/BackgroundRenderer/SplashBackgroundRenderer.h new file mode 100644 index 0000000..067de28 --- /dev/null +++ b/src/BackgroundRenderer/SplashBackgroundRenderer.h @@ -0,0 +1,65 @@ +/* + * Splash Background renderer + * Render all those things not supported as Image, with Splash + * + * Copyright (C) 2012,2013 Lu Wang <coolwanglu@gmail.com> + */ + + +#ifndef SPLASH_BACKGROUND_RENDERER_H__ +#define SPLASH_BACKGROUND_RENDERER_H__ + +#include <string> + +#include 
<splash/SplashBitmap.h> +#include <SplashOutputDev.h> + +#include "pdf2htmlEX-config.h" + +#include "Param.h" +#include "HTMLRenderer/HTMLRenderer.h" + +namespace pdf2htmlEX { + +// Based on BackgroundRenderer from poppler +class SplashBackgroundRenderer : public BackgroundRenderer, SplashOutputDev +{ +public: + static const SplashColor white; + //format: "png" or "jpg", or "" for a default format + SplashBackgroundRenderer(const std::string & format, HTMLRenderer * html_renderer, const Param & param); + + virtual ~SplashBackgroundRenderer() { } + + virtual void init(PDFDoc * doc); + virtual bool render_page(PDFDoc * doc, int pageno); + virtual void embed_image(int pageno); + + // Does this device use beginType3Char/endType3Char? Otherwise, + // text in Type 3 fonts will be drawn with drawChar/drawString. + virtual GBool interpretType3Chars() { return !param.process_type3; } + + virtual void startPage(int pageNum, GfxState *state, XRef *xrefA); + + virtual void drawChar(GfxState *state, double x, double y, + double dx, double dy, + double originX, double originY, + CharCode code, int nBytes, Unicode *u, int uLen); + + //for proof + void beginTextObject(GfxState *state); + void beginString(GfxState *state, GooString * str); + void endTextObject(GfxState *state); + void updateRender(GfxState *state); + +protected: + void dump_image(const char * filename, int x1, int y1, int x2, int y2); + HTMLRenderer * html_renderer; + const Param & param; + std::string format; + int drawn_char_count; +}; + +} // namespace pdf2htmlEX + +#endif // SPLASH_BACKGROUND_RENDERER_H__ diff --git a/src/Base64Stream.cc b/src/Base64Stream.cc new file mode 100644 index 0000000..5d02aae --- /dev/null +++ b/src/Base64Stream.cc @@ -0,0 +1,42 @@ +#include "Base64Stream.h" + +namespace pdf2htmlEX { + +using std::ostream; + +ostream & Base64Stream::dumpto(ostream & out) +{ + unsigned char buf[3]; + while(in->read((char*)buf, 3)) + { + out << base64_encoding[(buf[0] & 0xfc)>>2] + << 
base64_encoding[((buf[0] & 0x03)<<4) | ((buf[1] & 0xf0)>>4)] + << base64_encoding[((buf[1] & 0x0f)<<2) | ((buf[2] & 0xc0)>>6)] + << base64_encoding[(buf[2] & 0x3f)]; + } + auto cnt = in->gcount(); + if(cnt > 0) + { + for(int i = cnt; i < 3; ++i) + buf[i] = 0; + + out << base64_encoding[(buf[0] & 0xfc)>>2] + << base64_encoding[((buf[0] & 0x03)<<4) | ((buf[1] & 0xf0)>>4)]; + + if(cnt > 1) + { + out << base64_encoding[(buf[1] & 0x0f)<<2]; + } + else + { + out << '='; + } + out << '='; + } + + return out; +} + +const char * Base64Stream::base64_encoding = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; + +} //namespace pdf2htmlEX diff --git a/src/Base64Stream.h b/src/Base64Stream.h new file mode 100644 index 0000000..759515f --- /dev/null +++ b/src/Base64Stream.h @@ -0,0 +1,34 @@ +/* + * Base64 Encoding + * + * by WangLu + * 2012.11.29 + */ + +#ifndef BASE64STREAM_H__ +#define BASE64STREAM_H__ + +#include <iostream> + +namespace pdf2htmlEX { + +class Base64Stream +{ +public: + Base64Stream(std::istream & in) : in(&in) { } + + std::ostream & dumpto(std::ostream & out); + +private: + std::istream * in; + static const char * base64_encoding; +}; + +inline +std::ostream & operator << (std::ostream & out, Base64Stream bs) +{ + return bs.dumpto(out); +} + +} //namespace pdf2htmlEX +#endif //BASE64STREAM_H__ diff --git a/src/Color.cc b/src/Color.cc new file mode 100644 index 0000000..6a344e5 --- /dev/null +++ b/src/Color.cc @@ -0,0 +1,51 @@ +#include <cmath> + +#include "Color.h" + +#include "util/misc.h" + +namespace pdf2htmlEX { + +using std::ostream; + +Color::Color() +{ + memset(this, 0, sizeof(Color)); +} + +Color::Color(double r, double g, double b, bool transparent) + :transparent(transparent) +{ + rgb.r = (GfxColorComp)(r * gfxColorComp1); + rgb.g = (GfxColorComp)(g * gfxColorComp1); + rgb.b = (GfxColorComp)(b * gfxColorComp1); +} + +Color::Color(const GfxRGB& rgb) + :transparent(false), rgb(rgb) { } + +ostream & operator << (ostream & out, const 
Color & color) +{ + if(color.transparent) + out << "transparent"; + else + out << color.rgb; + return out; +} + +void Color::get_gfx_color(GfxColor & gc) const +{ + gc.c[0] = rgb.r; + gc.c[1] = rgb.g; + gc.c[2] = rgb.b; +} + +double Color::distance(const Color & other) const +{ + double dr = (double)rgb.r - other.rgb.r, + dg = (double)rgb.g - other.rgb.g, + db = (double)rgb.b - other.rgb.b; + return sqrt((dr * dr + dg * dg + db * db) / (3.0 * gfxColorComp1 * gfxColorComp1)); +} + +} // namespace pdf2htmlEX diff --git a/src/Color.h b/src/Color.h new file mode 100644 index 0000000..a2d2415 --- /dev/null +++ b/src/Color.h @@ -0,0 +1,38 @@ +/* + * Header file for Color + * Copyright (C) 2013 Lu Wang <coolwanglu@gmail.com> + */ + +#ifndef COLOR_H__ +#define COLOR_H__ + +#include <ostream> + +#include <GfxState.h> + +namespace pdf2htmlEX { + +struct Color +{ + bool transparent; + GfxRGB rgb; + Color(); + Color(double r, double g, double b, bool transparent = false); + Color(const GfxRGB& rgb); + bool operator == (const Color & c) const { + if(transparent != c.transparent) + return false; + if(transparent) + return true; + return ((rgb.r == c.rgb.r) && (rgb.g == c.rgb.g) && (rgb.b == c.rgb.b)); + } + void get_gfx_color(GfxColor & gc) const; + // Color distance, [0,1]. 
+ double distance(const Color & other) const; +}; + +std::ostream & operator << (std::ostream & out, const Color & color); + +} // namespace pdf2htmlEX + +#endif // COLOR_H__ diff --git a/src/CoveredTextDetector.cc b/src/CoveredTextDetector.cc new file mode 100644 index 0000000..e109b3f --- /dev/null +++ b/src/CoveredTextDetector.cc @@ -0,0 +1,51 @@ +/* + * CoveredTextDetector.cc + * + * Created on: 2014-6-14 + * Author: duanyao + */ + +#include "CoveredTextDetector.h" + +#include "util/math.h" + +namespace pdf2htmlEX { + +void CoveredTextDetector::reset() +{ + char_bboxes.clear(); + chars_covered.clear(); +} + +void CoveredTextDetector::add_char_bbox(double * bbox) +{ + char_bboxes.insert(char_bboxes.end(), bbox, bbox + 4); + chars_covered.push_back(false); +} + +void CoveredTextDetector::add_char_bbox_clipped(double * bbox, bool patially) +{ + char_bboxes.insert(char_bboxes.end(), bbox, bbox + 4); + chars_covered.push_back(true); + if (patially) + add_non_char_bbox(bbox, chars_covered.size() - 1); +} + +void CoveredTextDetector::add_non_char_bbox(double * bbox, int index) +{ + if (index < 0) + index = chars_covered.size(); + for (int i = 0; i < index; i++) + { + if (chars_covered[i]) + continue; + double * cbbox = &char_bboxes[i * 4]; + if (bbox_intersect(cbbox, bbox)) + { + chars_covered[i] = true; + add_non_char_bbox(cbbox, i); + } + } +} + +} diff --git a/src/CoveredTextDetector.h b/src/CoveredTextDetector.h new file mode 100644 index 0000000..bee6c17 --- /dev/null +++ b/src/CoveredTextDetector.h @@ -0,0 +1,61 @@ +/* + * CoveredTextDetector.h + * + * Created on: 2014-6-14 + * Author: duanyao + */ + +#ifndef COVEREDTEXTDETECTOR_H__ +#define COVEREDTEXTDETECTOR_H__ + +#include <vector> + +namespace pdf2htmlEX { + +/** + * Detect characters that are covered by non-char graphics on a page. + */ +class CoveredTextDetector +{ +public: + + /** + * Reset to initial state. Should be called when start drawing a page. 
+ */ + void reset(); + + /** + * Add a drawn character's bounding box. + * @param bbox (x0, y0, x1, y1) + */ + void add_char_bbox(double * bbox); + + void add_char_bbox_clipped(double * bbox, bool patially); + + /** + * Add a drawn non-char graphics' bounding box. + * If it intersects any previously drawn char's bbox, the char is marked as covered + * and treated as an non-char. + * @param bbox (x0, y0, x1, y1) + * @param index this graphics' drawing order: assume it is drawn after (index-1)th + * char. -1 means after the last char. + */ + void add_non_char_bbox(double * bbox, int index = -1); + + /** + * An array of flags indicating whether a char is covered by any non-char graphics. + * Index by the order that these chars are added. + * This vector grows as add_char_bbox() is called, so its size is the count + * of currently drawn chars. + */ + const std::vector<bool> & get_chars_covered() { return chars_covered; } + +private: + std::vector<bool> chars_covered; + // x00, y00, x01, y01; x10, y10, x11, y11;... + std::vector<double> char_bboxes; +}; + +} + +#endif /* COVEREDTEXTDETECTOR_H__ */ diff --git a/src/DrawingTracer.cc b/src/DrawingTracer.cc new file mode 100644 index 0000000..ffabad0 --- /dev/null +++ b/src/DrawingTracer.cc @@ -0,0 +1,400 @@ +/* + * DrawingTracer.cc + * + * Created on: 2014-6-15 + * Author: duanyao + */ + +#include "GfxFont.h" + +#include "util/math.h" +#include "DrawingTracer.h" + +#if !ENABLE_SVG +#warning "Cairo is disabled because ENABLE_SVG is off, --correct-text-visibility has limited functionality." 
+#endif + +static constexpr bool DT_DEBUG = false; + +namespace pdf2htmlEX +{ + +DrawingTracer::DrawingTracer(const Param & param): param(param) +#if ENABLE_SVG +, cairo(nullptr) +#endif +{ +} + +DrawingTracer::~DrawingTracer() +{ + finish(); +} + +void DrawingTracer::reset(GfxState *state) +{ + if (!param.correct_text_visibility) + return; + finish(); + +#if ENABLE_SVG + // pbox is defined in device space, which is affected by zooming; + // We want to trace in page space which is stable, so invert pbox by ctm. + double pbox[] { 0, 0, state->getPageWidth(), state->getPageHeight() }; + Matrix ctm, ictm; + state->getCTM(&ctm); + ctm.invertTo(&ictm); + tm_transform_bbox(ictm.m, pbox); + cairo_rectangle_t page_box { pbox[0], pbox[1], pbox[2] - pbox[0], pbox[3] - pbox[1] }; + cairo_surface_t * surface = cairo_recording_surface_create(CAIRO_CONTENT_COLOR_ALPHA, &page_box); + cairo = cairo_create(surface); + if (DT_DEBUG) + printf("DrawingTracer::reset:page bbox:[%f,%f,%f,%f]\n",pbox[0], pbox[1], pbox[2], pbox[3]); +#endif +} + +void DrawingTracer::finish() +{ +#if ENABLE_SVG + if (cairo) + { + cairo_destroy(cairo); + cairo = nullptr; + } +#endif +} + +// Poppler won't inform us its initial CTM, and the initial CTM is affected by zoom level. +// OutputDev::clip() may be called before OutputDev::updateCTM(), so we can't rely on GfxState::getCTM(), +// and should trace ctm changes ourself (via cairo). 
+void DrawingTracer::update_ctm(GfxState *state, double m11, double m12, double m21, double m22, double m31, double m32) +{ + if (!param.correct_text_visibility) + return; + +#if ENABLE_SVG + cairo_matrix_t matrix; + matrix.xx = m11; + matrix.yx = m12; + matrix.xy = m21; + matrix.yy = m22; + matrix.x0 = m31; + matrix.y0 = m32; + cairo_transform(cairo, &matrix); + + if (DT_DEBUG) + { + cairo_matrix_t mat; + cairo_get_matrix(cairo, &mat); + printf("DrawingTracer::update_ctm:ctm:[%f,%f,%f,%f,%f,%f]\n", mat.xx, mat.yx, mat.xy, mat.yy, mat.x0, mat.y0); + } +#endif +} + +void DrawingTracer::clip(GfxState * state, bool even_odd) +{ + if (!param.correct_text_visibility) + return; +#if ENABLE_SVG + do_path(state, state->getPath()); + cairo_set_fill_rule(cairo, even_odd? CAIRO_FILL_RULE_EVEN_ODD : CAIRO_FILL_RULE_WINDING); + cairo_clip (cairo); + + if (DT_DEBUG) + { + double cbox[4]; + cairo_clip_extents(cairo, cbox, cbox + 1, cbox + 2, cbox + 3); + printf("DrawingTracer::clip:extents:[%f,%f,%f,%f]\n", cbox[0],cbox[1],cbox[2],cbox[3]); + } +#endif +} + +void DrawingTracer::clip_to_stroke_path(GfxState * state) +{ + if (!param.correct_text_visibility) + return; + // TODO cairo_stroke_to_path() ? 
+} + +void DrawingTracer::save() +{ + if (!param.correct_text_visibility) + return; +#if ENABLE_SVG + cairo_save(cairo); + if (DT_DEBUG) + printf("DrawingTracer::save\n"); +#endif +} +void DrawingTracer::restore() +{ + if (!param.correct_text_visibility) + return; +#if ENABLE_SVG + cairo_restore(cairo); + if (DT_DEBUG) + printf("DrawingTracer::restore\n"); +#endif +} + +void DrawingTracer::do_path(GfxState * state, GfxPath * path) +{ +#if ENABLE_SVG + //copy from CairoOutputDev::doPath + GfxSubpath *subpath; + int i, j; + double x, y; + cairo_new_path(cairo); + if (DT_DEBUG) + printf("DrawingTracer::do_path:new_path\n"); + for (i = 0; i < path->getNumSubpaths(); ++i) { + subpath = path->getSubpath(i); + if (subpath->getNumPoints() > 0) { + x = subpath->getX(0); + y = subpath->getY(0); + cairo_move_to(cairo, x, y); + if (DT_DEBUG) + printf("DrawingTracer::do_path:move_to[%f,%f]\n",x,y); + j = 1; + while (j < subpath->getNumPoints()) { + if (subpath->getCurve(j)) { + x = subpath->getX(j+2); + y = subpath->getY(j+2); + cairo_curve_to(cairo, + subpath->getX(j), subpath->getY(j), + subpath->getX(j+1), subpath->getY(j+1), + x, y); + if (DT_DEBUG) + printf("DrawingTracer::do_path:curve_to[%f,%f]\n",x,y); + j += 3; + } else { + x = subpath->getX(j); + y = subpath->getY(j); + cairo_line_to(cairo, x, y); + if (DT_DEBUG) + printf("DrawingTracer::do_path:line_to[%f,%f]\n",x,y); + ++j; + } + } + if (subpath->isClosed()) { + cairo_close_path (cairo); + if (DT_DEBUG) + printf("DrawingTracer::do_path:close\n"); + } + } + } +#endif +} + +void DrawingTracer::stroke(GfxState * state) +{ +#if ENABLE_SVG + if (!param.correct_text_visibility) + return; + + if (DT_DEBUG) + printf("DrawingTracer::stroke\n"); + + cairo_set_line_width(cairo, state->getLineWidth()); + + // GfxPath is broken into steps, each step makes up a cairo path and its bbox is used for covering test. + // TODO + // 1. 
path steps that are not vertical or horizontal lines may still falsely "cover" many chars, + // can we slice those steps further? + // 2. if the line width is small, can we just ignore the path? + // 3. line join feature can't be retained. We use line-cap-square to minimize the problem that + // some chars actually covered by a line join are missed. However chars covered by a acute angle + // with line-join-miter may be still recognized as not covered. + cairo_set_line_cap(cairo, CAIRO_LINE_CAP_SQUARE); + GfxPath * path = state->getPath(); + for (int i = 0; i < path->getNumSubpaths(); ++i) { + GfxSubpath * subpath = path->getSubpath(i); + if (subpath->getNumPoints() <= 0) + continue; + double x = subpath->getX(0); + double y = subpath->getY(0); + //p: loop cursor; j: next point index + int p =1, j = 1; + int n = subpath->getNumPoints(); + while (p <= n) { + cairo_new_path(cairo); + cairo_move_to(cairo, x, y); + if (subpath->getCurve(j)) { + x = subpath->getX(j+2); + y = subpath->getY(j+2); + cairo_curve_to(cairo, + subpath->getX(j), subpath->getY(j), + subpath->getX(j+1), subpath->getY(j+1), + x, y); + p += 3; + } else { + x = subpath->getX(j); + y = subpath->getY(j); + cairo_line_to(cairo, x, y); + ++p; + } + + if (DT_DEBUG) + printf("DrawingTracer::stroke:new box:\n"); + double sbox[4]; + cairo_stroke_extents(cairo, sbox, sbox + 1, sbox + 2, sbox + 3); + if (sbox[0] != sbox[2] && sbox[1] != sbox[3]) + draw_non_char_bbox(state, sbox); + else if (DT_DEBUG) + printf("DrawingTracer::stroke:zero box!\n"); + + if (p == n) + { + if (subpath->isClosed()) + j = 0; // if sub path is closed, go back to starting point + else + break; + } + else + j = p; + } + } +#endif +} + +void DrawingTracer::fill(GfxState * state, bool even_odd) +{ + if (!param.correct_text_visibility) + return; + +#if ENABLE_SVG + do_path(state, state->getPath()); + //cairo_fill_extents don't take fill rule into account. + //cairo_set_fill_rule (cairo, even_odd? 
CAIRO_FILL_RULE_EVEN_ODD : CAIRO_FILL_RULE_WINDING); + double fbox[4]; + cairo_fill_extents(cairo, fbox, fbox + 1, fbox + 2, fbox + 3); + draw_non_char_bbox(state, fbox); +#endif +} + +void DrawingTracer::draw_non_char_bbox(GfxState * state, double * bbox) +{ +#if ENABLE_SVG + double cbox[4]; + cairo_clip_extents(cairo, cbox, cbox + 1, cbox + 2, cbox + 3); + if(bbox_intersect(cbox, bbox, bbox)) +#endif + { + transform_bbox_by_ctm(bbox, state); + if (DT_DEBUG) + printf("DrawingTracer::draw_non_char_bbox:[%f,%f,%f,%f]\n", bbox[0],bbox[1],bbox[2],bbox[3]); + if (on_non_char_drawn) + on_non_char_drawn(bbox); + } +} + +void DrawingTracer::draw_char_bbox(GfxState * state, double * bbox) +{ +#if ENABLE_SVG + // Note: even if 4 corners of the char are all in or all out of the clip area, + // it could still be partially clipped. + // TODO better solution? + int pt_in = 0; + if (cairo_in_clip(cairo, bbox[0], bbox[1])) + ++pt_in; + if (cairo_in_clip(cairo, bbox[2], bbox[3])) + ++pt_in; + if (cairo_in_clip(cairo, bbox[2], bbox[1])) + ++pt_in; + if (cairo_in_clip(cairo, bbox[0], bbox[3])) + ++pt_in; + + if (pt_in == 0) + { + transform_bbox_by_ctm(bbox); + if(on_char_clipped) + on_char_clipped(bbox, false); + } + else + { + if (pt_in < 4) + { + double cbox[4]; + cairo_clip_extents(cairo, cbox, cbox + 1, cbox + 2, cbox + 3); + bbox_intersect(cbox, bbox, bbox); + } + transform_bbox_by_ctm(bbox); + if (pt_in < 4) + { + if(on_char_clipped) + on_char_clipped(bbox, true); + } + else + { + if (on_char_drawn) + on_char_drawn(bbox); + } + } +#else + transform_bbox_by_ctm(bbox, state); + if (on_char_drawn) + on_char_drawn(bbox); +#endif + if (DT_DEBUG) + printf("DrawingTracer::draw_char_bbox:[%f,%f,%f,%f]\n",bbox[0],bbox[1],bbox[2],bbox[3]); +} + +void DrawingTracer::draw_image(GfxState *state) +{ + if (!param.correct_text_visibility) + return; + double bbox[4] {0, 0, 1, 1}; + draw_non_char_bbox(state, bbox); +} + +void DrawingTracer::draw_char(GfxState *state, double x, double y, double 
ax, double ay) +{ + if (!param.correct_text_visibility) + return; + + Matrix tm, itm; + memcpy(tm.m, state->getTextMat(), sizeof(tm.m)); + + double cx = state->getCurX(), cy = state->getCurY(), fs = state->getFontSize(), + ry = state->getRise(), h = state->getHorizScaling(); + + //cx and cy has been transformed by text matrix, we need to reverse them. + tm.invertTo(&itm); + double char_cx, char_cy; + itm.transform(cx, cy, &char_cx, &char_cy); + + //TODO Vertical? Currently vertical/type3 chars are treated as non-chars. + double char_m[6] {fs * h, 0, 0, fs, char_cx + x, char_cy + y + ry}; + + double final_m[6]; + tm_multiply(final_m, tm.m, char_m); + + auto font = state->getFont(); + double bbox[4] {0, 0, ax, ay}; + double desc = font->getDescent(), asc = font->getAscent(); + if (font->getWMode() == 0) + { + bbox[1] += desc; + bbox[3] += asc; + } + else + {//TODO Vertical? + } + tm_transform_bbox(final_m, bbox); + draw_char_bbox(state, bbox); +} + + +void DrawingTracer::transform_bbox_by_ctm(double * bbox, GfxState * state) +{ +#if ENABLE_SVG + cairo_matrix_t mat; + cairo_get_matrix(cairo, &mat); + double mat_a[6] {mat.xx, mat.yx, mat.xy, mat.yy, mat.x0, mat.y0}; + tm_transform_bbox(mat_a, bbox); +#else + tm_transform_bbox(state->getCTM(), bbox); +#endif +} + +} /* namespace pdf2htmlEX */ diff --git a/src/DrawingTracer.h b/src/DrawingTracer.h new file mode 100644 index 0000000..2e3159d --- /dev/null +++ b/src/DrawingTracer.h @@ -0,0 +1,79 @@ +/* + * DrawingTracer.h + * + * Created on: 2014-6-15 + * Author: duanyao + */ + +#ifndef DRAWINGTRACER_H__ +#define DRAWINGTRACER_H__ + +#include <functional> + +#include <GfxState.h> + +#include "pdf2htmlEX-config.h" + +#if ENABLE_SVG +#include <cairo.h> +#endif + +#include "Param.h" + +namespace pdf2htmlEX +{ + +class DrawingTracer +{ +public: + /* + * The callback to receive drawn event. + * bbox in device space. 
+ */ + // a non-char graphics is drawn + std::function<void(double * bbox)> on_non_char_drawn; + // a char is drawn in the clip area + std::function<void(double * bbox)> on_char_drawn; + // a char is drawn out of/partially in the clip area + std::function<void(double * bbox, bool patially)> on_char_clipped; + + DrawingTracer(const Param & param); + virtual ~DrawingTracer(); + void reset(GfxState * state); + + /* + * A character is drawing + * x, y: glyph-drawing position, in PDF text object space. + * ax, ay: glyph advance, in glyph space. + */ + void draw_char(GfxState * state, double x, double y, double ax, double ay); + /* + * An image is drawing + */ + void draw_image(GfxState * state); + void update_ctm(GfxState * state, double m11, double m12, double m21, double m22, double m31, double m32); + void clip(GfxState * state, bool even_odd = false); + void clip_to_stroke_path(GfxState * state); + void fill(GfxState * state, bool even_odd = false); + void stroke(GfxState * state); + void save(); + void restore(); + +private: + void finish(); + // Following methods operate in user space (just before CTM is applied) + void do_path(GfxState * state, GfxPath * path); + void draw_non_char_bbox(GfxState * state, double * bbox); + void draw_char_bbox(GfxState * state, double * bbox); + // If cairo is available, parameter state is ignored + void transform_bbox_by_ctm(double * bbox, GfxState * state = nullptr); + + const Param & param; + +#if ENABLE_SVG + cairo_t * cairo; +#endif +}; + +} /* namespace pdf2htmlEX */ +#endif /* DRAWINGTRACER_H__ */ diff --git a/src/HTMLRenderer/HTMLRenderer.h b/src/HTMLRenderer/HTMLRenderer.h new file mode 100644 index 0000000..18e395d --- /dev/null +++ b/src/HTMLRenderer/HTMLRenderer.h @@ -0,0 +1,348 @@ +/* + * HTMLRenderer.h + * + * Copyright (C) 2012,2013 Lu Wang <coolwanglu@gmail.com> + */ + +#ifndef HTMLRENDERER_H_ +#define HTMLRENDERER_H_ + +#include <unordered_map> +#include <cstdint> +#include <fstream> +#include <memory> + +#include 
<OutputDev.h> +#include <GfxState.h> +#include <Stream.h> +#include <PDFDoc.h> +#include <goo/gtypes.h> +#include <Object.h> +#include <GfxFont.h> +#include <Annot.h> + +// for form.cc +#include <Page.h> +#include <Form.h> + +#include "pdf2htmlEX-config.h" + +#include "Param.h" +#include "Preprocessor.h" +#include "StringFormatter.h" +#include "TmpFiles.h" +#include "Color.h" +#include "StateManager.h" +#include "HTMLTextPage.h" + +#include "BackgroundRenderer/BackgroundRenderer.h" +#include "CoveredTextDetector.h" +#include "DrawingTracer.h" + +#include "util/const.h" +#include "util/misc.h" + + +namespace pdf2htmlEX { + +struct HTMLRenderer : OutputDev +{ + HTMLRenderer(const Param & param); + virtual ~HTMLRenderer(); + + void process(PDFDoc * doc); + + //////////////////////////////////////////////////// + // OutputDev interface + //////////////////////////////////////////////////// + + // Does this device use upside-down coordinates? + // (Upside-down means (0,0) is the top left corner of the page.) + virtual GBool upsideDown() { return gFalse; } + + // Does this device use drawChar() or drawString()? + virtual GBool useDrawChar() { return gFalse; } + + // Does this device use functionShadedFill(), axialShadedFill(), and + // radialShadedFill()? If this returns false, these shaded fills + // will be reduced to a series of other drawing operations. + virtual GBool useShadedFills(int type) { return (type == 2) ? gTrue: gFalse; } + + // Does this device use beginType3Char/endType3Char? Otherwise, + // text in Type 3 fonts will be drawn with drawChar/drawString. + virtual GBool interpretType3Chars() { return gFalse; } + + // Does this device need non-text content? + virtual GBool needNonText() { return (param.process_nontext) ? gTrue: gFalse; } + + // Does this device need to clip pages to the crop box even when the + // box is the crop box? + virtual GBool needClipToCropBox() { return gTrue; } + + virtual void setDefaultCTM(double *ctm); + + // Start a page. 
+ virtual void startPage(int pageNum, GfxState *state, XRef * xref); + + // End a page. + virtual void endPage(); + + /* + * To optimize false alarms + * We just mark as changed, and recheck if they have been changed when we are about to output a new string + */ + + virtual void restoreState(GfxState * state); + + virtual void saveState(GfxState *state); + + virtual void updateAll(GfxState * state); + + virtual void updateRise(GfxState * state); + virtual void updateTextPos(GfxState * state); + virtual void updateTextShift(GfxState * state, double shift); + + virtual void updateFont(GfxState * state); + virtual void updateCTM(GfxState * state, double m11, double m12, double m21, double m22, double m31, double m32); + virtual void updateTextMat(GfxState * state); + virtual void updateHorizScaling(GfxState * state); + + virtual void updateCharSpace(GfxState * state); + virtual void updateWordSpace(GfxState * state); + + virtual void updateRender(GfxState * state); + + virtual void updateFillColorSpace(GfxState * state); + virtual void updateStrokeColorSpace(GfxState * state); + virtual void updateFillColor(GfxState * state); + virtual void updateStrokeColor(GfxState * state); + + + /* + * Rendering + */ + + virtual void clip(GfxState * state); + virtual void eoClip(GfxState * state); + virtual void clipToStrokePath(GfxState * state); + + virtual void drawString(GfxState * state, GooString * s); + + virtual void drawImage(GfxState * state, Object * ref, Stream * str, int width, int height, GfxImageColorMap * colorMap, GBool interpolate, int *maskColors, GBool inlineImg); + + virtual void drawSoftMaskedImage(GfxState *state, Object *ref, Stream *str, + int width, int height, + GfxImageColorMap *colorMap, + GBool interpolate, + Stream *maskStr, + int maskWidth, int maskHeight, + GfxImageColorMap *maskColorMap, + GBool maskInterpolate); + + virtual void stroke(GfxState *state); + virtual void fill(GfxState *state); + virtual void eoFill(GfxState *state); + virtual GBool 
axialShadedFill(GfxState *state, GfxAxialShading *shading, double tMin, double tMax); + + virtual void processLink(AnnotLink * al); + + /* + * Covered text handling. + */ + // Is a char (actually a glyph) covered by non-char's. Index in drawing order in current page. + // Does not fail on out-of-bound conditions, but return false. + bool is_char_covered(int index); + // Currently drawn char (glyph) count in current page. + int get_char_count() { return (int)covered_text_detector.get_chars_covered().size(); } + +protected: + //////////////////////////////////////////////////// + // misc + //////////////////////////////////////////////////// + void pre_process(PDFDoc * doc); + void post_process(void); + + void process_outline(void); + void process_outline_items(GooList * items); + + void process_form(std::ofstream & out); + + void set_stream_flags (std::ostream & out); + + void dump_css(void); + + // convert a LinkAction to a string that our Javascript code can understand + std::string get_linkaction_str(LinkAction *, std::string & detail); + + //////////////////////////////////////////////////// + /* + * manage fonts + * + * In PDF: (install_*) + * embedded font: fonts embedded in PDF + * external font: fonts that have only names provided in PDF, the viewer should find a local font to match with + * + * In HTML: (export_*) + * remote font: to be retrieved from the web server + * remote default font: fallback styles for invalid fonts + * local font: to be substituted with a local (client side) font + */ + //////////////////////////////////////////////////// + std::string dump_embedded_font(GfxFont * font, FontInfo & info); + std::string dump_type3_font(GfxFont * font, FontInfo & info); + void embed_font(const std::string & filepath, GfxFont * font, FontInfo & info, bool get_metric_only = false); + const FontInfo * install_font(GfxFont * font); + void install_embedded_font(GfxFont * font, FontInfo & info); + void install_external_font (GfxFont * font, FontInfo & 
info); + void export_remote_font(const FontInfo & info, const std::string & suffix, GfxFont * font); + void export_remote_default_font(long long fn_id); + void export_local_font(const FontInfo & info, GfxFont * font, const std::string & original_font_name, const std::string & cssfont); + + // depending on --embed***, to embed the content or add a link to it + // "type": specify the file type, usually it's the suffix, in which case this parameter could be "" + // "copy": indicates whether to copy the file into dest_dir, if not embedded + void embed_file(std::ostream & out, const std::string & path, const std::string & type, bool copy); + + //////////////////////////////////////////////////// + // state tracking + //////////////////////////////////////////////////// + // reset all states + void reset_state(); + // reset all ***_changed flags + void reset_state_change(); + // check updated states, and determine new_line_status + // make sure this function can be called several times consecutively without problem + void check_state_change(GfxState * state); + // prepare the line context, (close old tags, open new tags) + // make sure the current HTML style consistent with PDF + void prepare_text_line(GfxState * state); + + //////////////////////////////////////////////////// + // PDF stuffs + //////////////////////////////////////////////////// + + XRef * xref; + PDFDoc * cur_doc; + Catalog * cur_catalog; + int pageNum; + + double default_ctm[6]; + + /* + * The content of each page is first scaled with factor1 (>=1), then scale back with factor2(<=1) + * + * factor1 is use to multiplied with all metrics (height/width/font-size...), in order to improve accuracy + * factor2 is applied with css transform, and is exposed to Javascript + * + * factor1 & factor 2 are determined according to zoom and font-size-multiplier + * + */ + double text_zoom_factor (void) const { return text_scale_factor1 * text_scale_factor2; } + double text_scale_factor1; + double text_scale_factor2; 
+ + // 1px on screen should be printed as print_scale()pt + double print_scale (void) const { return 96.0 / DEFAULT_DPI / text_zoom_factor(); } + + + const Param & param; + + //////////////////////////////////////////////////// + // PDF states + //////////////////////////////////////////////////// + // track the original (unscaled) values to determine scaling and merge lines + // current position + double cur_tx, cur_ty; // real text position, in text coords + double cur_font_size; + // this is CTM * TextMAT in PDF + // as we'll calculate the position of the origin separately + double cur_text_tm[6]; // unscaled + + bool all_changed; + bool ctm_changed; + bool rise_changed; + bool font_changed; + bool text_pos_changed; + bool text_mat_changed; + bool fill_color_changed; + bool hori_scale_changed; + bool word_space_changed; + bool letter_space_changed; + bool stroke_color_changed; + bool clip_changed; + + //////////////////////////////////////////////////// + // HTML states + //////////////////////////////////////////////////// + + // optimize for web + // we try to render the final font size directly + // to reduce the effect of ctm as much as possible + + // the actual tm used is `real tm in PDF` scaled by 1/draw_text_scale, + // so everything rendered should be multiplied by draw_text_scale + double draw_text_scale; + + // the position of next char, in text coords + // this is actual position (in HTML), which might be different from cur_tx/ty (in PDF) + // also keep in mind that they are not the final position, as they will be transform by CTM (also true for cur_tx/ty) + double draw_tx, draw_ty; + + + //////////////////////////////////////////////////// + // styles & resources + //////////////////////////////////////////////////// + // managers store values actually used in HTML (i.e. 
scaled) + std::unordered_map<long long, FontInfo> font_info_map; + AllStateManager all_manager; + HTMLTextState cur_text_state; + HTMLLineState cur_line_state; + HTMLClipState cur_clip_state; + + HTMLTextPage html_text_page; + + enum NewLineState + { + NLS_NONE, + NLS_NEWSTATE, + NLS_NEWLINE, + NLS_NEWCLIP + } new_line_state; + + // for font reencoding + std::vector<int32_t> cur_mapping; + std::vector<char*> cur_mapping2; + std::vector<int> width_list; // width of each char + + Preprocessor preprocessor; + + // manage temporary files + TmpFiles tmp_files; + + // for string formatting + StringFormatter str_fmt; + + // render background image + friend class SplashBackgroundRenderer; // ugly! +#if ENABLE_SVG + friend class CairoBackgroundRenderer; // ugly! +#endif + + std::unique_ptr<BackgroundRenderer> bg_renderer, fallback_bg_renderer; + + struct { + std::ofstream fs; + std::string path; + } f_outline, f_pages, f_css; + std::ofstream * f_curpage; + std::string cur_page_filename; + + static const std::string MANIFEST_FILENAME; + + CoveredTextDetector covered_text_detector; + DrawingTracer tracer; +}; + +} //namespace pdf2htmlEX + +#endif /* HTMLRENDERER_H_ */ diff --git a/src/HTMLRenderer/draw.cc b/src/HTMLRenderer/draw.cc new file mode 100644 index 0000000..6529418 --- /dev/null +++ b/src/HTMLRenderer/draw.cc @@ -0,0 +1,65 @@ +/* + * Draw.cc + * + * Handling path drawing + * + * by WangLu + * 2012.10.01 + */ + +#include <algorithm> +#include <cmath> +#include <sstream> +#include <vector> +#include <iostream> + +#include "HTMLRenderer.h" +#include "util/misc.h" +#include "util/math.h" +#include "util/namespace.h" + +namespace pdf2htmlEX { + +using std::swap; +using std::min; +using std::max; +using std::acos; +using std::asin; +using std::ostringstream; +using std::sqrt; +using std::vector; +using std::ostream; + +void HTMLRenderer::restoreState(GfxState * state) +{ + updateAll(state); + tracer.restore(); +} + +void HTMLRenderer::saveState(GfxState *state) +{ + 
tracer.save(); +} + +void HTMLRenderer::stroke(GfxState * state) +{ + tracer.stroke(state); +} + +void HTMLRenderer::fill(GfxState * state) +{ + tracer.fill(state); +} + +void HTMLRenderer::eoFill(GfxState * state) +{ + tracer.fill(state, true); +} + +GBool HTMLRenderer::axialShadedFill(GfxState *state, GfxAxialShading *shading, double tMin, double tMax) +{ + tracer.fill(state); //TODO correct? + return true; +} + +} // namespace pdf2htmlEX diff --git a/src/HTMLRenderer/font.cc b/src/HTMLRenderer/font.cc new file mode 100644 index 0000000..10ff215 --- /dev/null +++ b/src/HTMLRenderer/font.cc @@ -0,0 +1,1089 @@ +/* + * font.cc + * + * Font processing + * + * Copyright (C) 2012,2013 Lu Wang <coolwanglu@gmail.com> + */ + +#include <iostream> +#include <cmath> +#include <algorithm> +#include <sstream> +#include <cctype> +#include <unordered_set> + +#include <GlobalParams.h> +#include <fofi/FoFiTrueType.h> +#include <CharCodeToUnicode.h> + +#include "Param.h" +#include "HTMLRenderer.h" +#include "Base64Stream.h" + +#include "pdf2htmlEX-config.h" + +#include "util/namespace.h" +#include "util/math.h" +#include "util/misc.h" +#include "util/ffw.h" +#include "util/path.h" +#include "util/unicode.h" +#include "util/css_const.h" + +#if ENABLE_SVG +#include <cairo.h> +#include <cairo-ft.h> +#include <cairo-svg.h> +#include "CairoFontEngine.h" +#include "CairoOutputDev.h" +#include <Gfx.h> +#endif + +namespace pdf2htmlEX { + +using std::min; +using std::unordered_set; +using std::cerr; +using std::endl; + +string HTMLRenderer::dump_embedded_font (GfxFont * font, FontInfo & info) +{ + if(info.is_type3) + return dump_type3_font(font, info); + + Object obj, obj1, obj2; + Object font_obj, font_obj2, fontdesc_obj; + string suffix; + string filepath; + + long long fn_id = info.id; + + try + { + // inspired by mupdf + string subtype; + + auto * id = font->getID(); + + Object ref_obj; + ref_obj.initRef(id->num, id->gen); + ref_obj.fetch(xref, &font_obj); + ref_obj.free(); + + 
if(!font_obj.isDict()) + { + cerr << "Font object is not a dictionary" << endl; + throw 0; + } + + Dict * dict = font_obj.getDict(); + if(dict->lookup("DescendantFonts", &font_obj2)->isArray()) + { + if(font_obj2.arrayGetLength() == 0) + { + cerr << "Warning: empty DescendantFonts array" << endl; + } + else + { + if(font_obj2.arrayGetLength() > 1) + cerr << "TODO: multiple entries in DescendantFonts array" << endl; + + if(font_obj2.arrayGet(0, &obj2)->isDict()) + { + dict = obj2.getDict(); + } + } + } + + if(!dict->lookup("FontDescriptor", &fontdesc_obj)->isDict()) + { + cerr << "Cannot find FontDescriptor " << endl; + throw 0; + } + + dict = fontdesc_obj.getDict(); + + if(dict->lookup("FontFile3", &obj)->isStream()) + { + if(obj.streamGetDict()->lookup("Subtype", &obj1)->isName()) + { + subtype = obj1.getName(); + if(subtype == "Type1C") + { + suffix = ".cff"; + } + else if (subtype == "CIDFontType0C") + { + suffix = ".cid"; + } + else if (subtype == "OpenType") + { + suffix = ".otf"; + } + else + { + cerr << "Unknown subtype: " << subtype << endl; + throw 0; + } + } + else + { + cerr << "Invalid subtype in font descriptor" << endl; + throw 0; + } + } + else if (dict->lookup("FontFile2", &obj)->isStream()) + { + suffix = ".ttf"; + } + else if (dict->lookup("FontFile", &obj)->isStream()) + { + suffix = ".pfa"; + } + else + { + cerr << "Cannot find FontFile for dump" << endl; + throw 0; + } + + if(suffix == "") + { + cerr << "Font type unrecognized" << endl; + throw 0; + } + + obj.streamReset(); + + filepath = (char*)str_fmt("%s/f%llx%s", param.tmp_dir.c_str(), fn_id, suffix.c_str()); + tmp_files.add(filepath); + + ofstream outf(filepath, ofstream::binary); + if(!outf) + throw string("Cannot open file ") + filepath + " for writing"; + + char buf[1024]; + int len; + while((len = obj.streamGetChars(1024, (Guchar*)buf)) > 0) + { + outf.write(buf, len); + } + obj.streamClose(); + } + catch(int) + { + cerr << "Something wrong when trying to dump font " << hex << fn_id << 
dec << endl; + } + + obj2.free(); + obj1.free(); + obj.free(); + + fontdesc_obj.free(); + font_obj2.free(); + font_obj.free(); + + return filepath; +} + +string HTMLRenderer::dump_type3_font (GfxFont * font, FontInfo & info) +{ + assert(info.is_type3); + +#if ENABLE_SVG + long long fn_id = info.id; + + FT_Library ft_lib; + FT_Init_FreeType(&ft_lib); + CairoFontEngine font_engine(ft_lib); + auto * cur_font = font_engine.getFont(font, cur_doc, true, xref); + auto used_map = preprocessor.get_code_map(hash_ref(font->getID())); + + //calculate transformed metrics + double * font_bbox = font->getFontBBox(); + double * font_matrix = font->getFontMatrix(); + double transformed_bbox[4]; + memcpy(transformed_bbox, font_bbox, 4 * sizeof(double)); + /* + // add the origin to the bbox + if(transformed_bbox[0] > 0) transformed_bbox[0] = 0; + if(transformed_bbox[1] > 0) transformed_bbox[1] = 0; + if(transformed_bbox[2] < 0) transformed_bbox[2] = 0; + if(transformed_bbox[3] < 0) transformed_bbox[3] = 0; + */ + tm_transform_bbox(font_matrix, transformed_bbox); + double transformed_bbox_width = transformed_bbox[2] - transformed_bbox[0]; + double transformed_bbox_height = transformed_bbox[3] - transformed_bbox[1]; + info.font_size_scale = std::max(transformed_bbox_width, transformed_bbox_height); + + // we want the glyphs is rendered in a box of size around GLYPH_DUMP_EM_SIZE x GLYPH_DUMP_EM_SIZE + // for rectangles, the longer edge should be GLYPH_DUMP_EM_SIZE + const double GLYPH_DUMP_EM_SIZE = 100.0; + double scale = GLYPH_DUMP_EM_SIZE / info.font_size_scale; + + // we choose ttf as it does not use char names + // or actually we don't use char names for ttf (see embed_font) + ffw_new_font(); + // dump each glyph into svg and combine them + for(int code = 0; code < 256; ++code) + { + if(!used_map[code]) continue; + + cairo_surface_t * surface = nullptr; + + string glyph_filename = (char*)str_fmt("%s/f%llx-%x.svg", param.tmp_dir.c_str(), fn_id, code); + 
tmp_files.add(glyph_filename); + + surface = cairo_svg_surface_create(glyph_filename.c_str(), transformed_bbox_width * scale, transformed_bbox_height * scale); + + cairo_svg_surface_restrict_to_version(surface, CAIRO_SVG_VERSION_1_2); + cairo_surface_set_fallback_resolution(surface, param.h_dpi, param.v_dpi); + cairo_t * cr = cairo_create(surface); + + // track the position of the origin + double ox, oy; + ox = oy = 0.0; + + auto glyph_width = ((Gfx8BitFont*)font)->getWidth(code); + +#if 1 + { + // pain the glyph + cairo_set_font_face(cr, cur_font->getFontFace()); + + cairo_matrix_t m1, m2, m3; + // set up m1 + // m1 shift the bottom-left corner of the glyph bbox to the origin + // also set font size to scale + cairo_matrix_init_translate(&m1, -transformed_bbox[0], transformed_bbox[1]); + cairo_matrix_init_scale(&m2, scale, scale); + cairo_matrix_multiply(&m1, &m1, &m2); + cairo_set_font_matrix(cr, &m1); + + cairo_glyph_t glyph; + glyph.index = cur_font->getGlyph(code, nullptr, 0); + glyph.x = 0; + glyph.y = GLYPH_DUMP_EM_SIZE; + cairo_show_glyphs(cr, &glyph, 1); + + + // apply the type 3 font's font matrix before m1 + // such that we got the mapping from type 3 font space to user space, then we will be able to calculate mapped position for ox,oy and glyph_width + cairo_matrix_init(&m2, font_matrix[0], font_matrix[1], font_matrix[2], font_matrix[3], font_matrix[4], font_matrix[5]); + cairo_matrix_init_scale(&m3, 1, -1); + cairo_matrix_multiply(&m2, &m2, &m3); + cairo_matrix_multiply(&m2, &m2, &m1); + + cairo_matrix_transform_point(&m2, &ox, &oy); + double dummy = 0; + cairo_matrix_transform_distance(&m2, &glyph_width, &dummy); + } +#else + { + // manually draw the char to get the metrics + // adapted from _render_type3_glyph of poppler + cairo_matrix_t ctm, m, m1; + cairo_matrix_init_identity(&ctm); + + // apply font-matrix + cairo_matrix_init(&m, font_matrix[0], font_matrix[1], font_matrix[2], font_matrix[3], font_matrix[4], font_matrix[5]); + 
cairo_matrix_multiply(&ctm, &ctm, &m); + + // shift origin + cairo_matrix_init_translate(&m1, -transformed_bbox[0], -transformed_bbox[1]); + cairo_matrix_multiply(&ctm, &ctm, &m1); + + // make it upside down since the difference between the glyph coordination and cairo coordination + cairo_matrix_init_scale(&m1, 1, -1); + cairo_matrix_multiply(&ctm, &ctm, &m1); + // save m*m1 to m1 for later use + cairo_matrix_multiply(&m1, &m, &m1); + + // shift up to the bounding box + cairo_matrix_init_translate(&m, 0.0, transformed_bbox_height); + cairo_matrix_multiply(&ctm, &ctm, &m); + + // scale up + cairo_matrix_init_scale(&m, scale, scale); + cairo_matrix_multiply(&ctm, &ctm, &m); + + // set ctm + cairo_set_matrix(cr, &ctm); + + // calculate the position of origin + cairo_matrix_transform_point(&ctm, &ox, &oy); + oy -= transformed_bbox_height * scale; + // calculate glyph width + double dummy = 0; + cairo_matrix_transform_distance(&ctm, &glyph_width, &dummy); + + // draw the glyph + auto output_dev = new CairoOutputDev(); + output_dev->setCairo(cr); + output_dev->setPrinting(true); + + PDFRectangle box; + box.x1 = font_bbox[0]; + box.y1 = font_bbox[1]; + box.x2 = font_bbox[2]; + box.y2 = font_bbox[3]; + auto gfx = new Gfx(cur_doc, output_dev, + ((Gfx8BitFont*)font)->getResources(), + &box, nullptr); + output_dev->startDoc(cur_doc, &font_engine); + output_dev->startPage(1, gfx->getState(), gfx->getXRef()); + output_dev->setInType3Char(gTrue); + auto char_procs = ((Gfx8BitFont*)font)->getCharProcs(); + Object char_proc_obj; + auto glyph_index = cur_font->getGlyph(code, nullptr, 0); + gfx->display(char_procs->getVal(glyph_index, &char_proc_obj)); + + char_proc_obj.free(); + delete gfx; + delete output_dev; + } +#endif + + { + auto status = cairo_status(cr); + cairo_destroy(cr); + if(status) + throw string("Cairo error: ") + cairo_status_to_string(status); + } + cairo_surface_finish(surface); + { + auto status = cairo_surface_status(surface); + cairo_surface_destroy(surface); 
+ surface = nullptr; + if(status) + throw string("Error in cairo: ") + cairo_status_to_string(status); + } + + ffw_import_svg_glyph(code, glyph_filename.c_str(), ox / GLYPH_DUMP_EM_SIZE, -oy / GLYPH_DUMP_EM_SIZE, glyph_width / GLYPH_DUMP_EM_SIZE); + } + + string font_filename = (char*)str_fmt("%s/f%llx.ttf", param.tmp_dir.c_str(), fn_id); + tmp_files.add(font_filename); + ffw_save(font_filename.c_str()); + ffw_close(); + + return font_filename; +#else + return ""; +#endif +} + +void HTMLRenderer::embed_font(const string & filepath, GfxFont * font, FontInfo & info, bool get_metric_only) +{ + if(param.debug) + { + cerr << "Embed font: " << filepath << " " << info.id << endl; + } + + ffw_load_font(filepath.c_str()); + ffw_prepare_font(); + + if(param.debug) + { + auto fn = str_fmt("%s/__raw_font_%llx%s", param.tmp_dir.c_str(), info.id, get_suffix(filepath).c_str()); + tmp_files.add((char*)fn); + ofstream((char*)fn, ofstream::binary) << ifstream(filepath).rdbuf(); + } + + int * code2GID = nullptr; + int code2GID_len = 0; + int maxcode = 0; + + Gfx8BitFont * font_8bit = nullptr; + GfxCIDFont * font_cid = nullptr; + + string suffix = get_suffix(filepath); + for(auto & c : suffix) + c = tolower(c); + + /* + * if parm->tounicode is 0, try the provided tounicode map first + */ + info.use_tounicode = (param.tounicode >= 0); + bool has_space = false; + + const char * used_map = nullptr; + + info.em_size = ffw_get_em_size(); + + if(param.debug) + { + cerr << "em size: " << info.em_size << endl; + } + + info.space_width = 0; + + if(!font->isCIDFont()) + { + font_8bit = dynamic_cast<Gfx8BitFont*>(font); + } + else + { + font_cid = dynamic_cast<GfxCIDFont*>(font); + } + + if(get_metric_only) + { + ffw_fix_metric(); + ffw_get_metric(&info.ascent, &info.descent); + ffw_close(); + return; + } + + used_map = preprocessor.get_code_map(hash_ref(font->getID())); + + /* + * Step 1 + * dump the font file directly from the font descriptor and put the glyphs into the correct slots * + * + * 
for 8bit + nonTrueType + * re-encoding the font by glyph names + * + * for 8bit + TrueType + * sort the glpyhs as the original order, and load the code2GID table + * later we will map GID (instead of char code) to Unicode + * + * for CID + nonTrueType + * Flatten the font + * + * for CID Truetype + * same as 8bitTrueType, except for that we have to check 65536 charcodes + * use the embedded code2GID table if there is, otherwise use the one in the font + */ + if(font_8bit) + { + maxcode = 0xff; + if(is_truetype_suffix(suffix)) + { + if(info.is_type3) + { + /* + * Type 3 fonts are saved and converted into ttf fonts + * encoded based on code points instead of GID + * + * I thought code2GID would work but it never works, and I don't know why + * Anyway we can disable code2GID such that the following procedure will be working based on code points instead of GID + */ + } + else + { + ffw_reencode_glyph_order(); + if(FoFiTrueType * fftt = FoFiTrueType::load((char*)filepath.c_str())) + { + code2GID = font_8bit->getCodeToGIDMap(fftt); + code2GID_len = 256; + delete fftt; + } + } + } + else + { + // move the slot such that it's consistent with the encoding seen in PDF + unordered_set<string> nameset; + bool name_conflict_warned = false; + + std::fill(cur_mapping2.begin(), cur_mapping2.end(), (char*)nullptr); + + for(int i = 0; i < 256; ++i) + { + if(!used_map[i]) continue; + + auto cn = font_8bit->getCharName(i); + if(cn == nullptr) + { + continue; + } + else + { + if(nameset.insert(string(cn)).second) + { + cur_mapping2[i] = cn; + } + else + { + if(!name_conflict_warned) + { + name_conflict_warned = true; + //TODO: may be resolved using advanced font properties? 
+ cerr << "Warning: encoding conflict detected in font: " << hex << info.id << dec << endl; + } + } + } + } + + ffw_reencode_raw2(cur_mapping2.data(), 256, 0); + } + } + else + { + maxcode = 0xffff; + + if(is_truetype_suffix(suffix)) + { + ffw_reencode_glyph_order(); + + GfxCIDFont * _font = dynamic_cast<GfxCIDFont*>(font); + + // To locate CID2GID for the font + // as in CairoFontEngine.cc + if((code2GID = _font->getCIDToGID())) + { + // use the mapping stored in _font + code2GID_len = _font->getCIDToGIDLen(); + } + else + { + // use the mapping stored in the file + if(FoFiTrueType * fftt = FoFiTrueType::load((char*)filepath.c_str())) + { + code2GID = _font->getCodeToGIDMap(fftt, &code2GID_len); + delete fftt; + } + } + } + else + { + // TODO: add an option to load the table? + ffw_cidflatten(); + } + } + + /* + * Step 2 + * - map charcode (or GID for CID truetype) + * + * -> Always map to Unicode for 8bit TrueType fonts and CID fonts + * + * -> For 8bit nonTruetype fonts: + * Try to calculate the correct Unicode value from the glyph names, when collision is detected in ToUnicode Map + * + * - Fill in the width_list, and set widths accordingly + */ + + + { + string map_filename; + ofstream map_outf; + if(param.debug) + { + map_filename = (char*)str_fmt("%s/f%llx.map", param.tmp_dir.c_str(), info.id); + tmp_files.add(map_filename); + map_outf.open(map_filename); + } + + unordered_set<int> codeset; + bool name_conflict_warned = false; + + auto ctu = font->getToUnicode(); + std::fill(cur_mapping.begin(), cur_mapping.end(), -1); + std::fill(width_list.begin(), width_list.end(), -1); + + if(code2GID) + maxcode = min<int>(maxcode, code2GID_len - 1); + + bool is_truetype = is_truetype_suffix(suffix); + int max_key = maxcode; + /* + * Traverse all possible codes + */ + bool retried = false; // avoid infinite loop + for(int cur_code = 0; cur_code <= maxcode; ++cur_code) + { + if(!used_map[cur_code]) + continue; + + /* + * Skip glyphs without names (only for non-ttf fonts) 
+ */ + if(!is_truetype && (font_8bit != nullptr) + && (font_8bit->getCharName(cur_code) == nullptr)) + { + continue; + } + + int mapped_code = cur_code; + if(code2GID) + { + // for fonts with GID (e.g. TTF) we need to map GIDs instead of codes + if((mapped_code = code2GID[cur_code]) == 0) continue; + } + + if(mapped_code > max_key) + max_key = mapped_code; + + Unicode u, *pu=&u; + if(info.use_tounicode) + { + int n = ctu ? (ctu->mapToUnicode(cur_code, &pu)) : 0; + u = check_unicode(pu, n, cur_code, font); + } + else + { + u = unicode_from_font(cur_code, font); + } + + if(codeset.insert(u).second) + { + cur_mapping[mapped_code] = u; + } + else + { + // collision detected + if(param.tounicode == 0) + { + // in auto mode, just drop the tounicode map + if(!retried) + { + cerr << "ToUnicode CMap is not valid and got dropped for font: " << hex << info.id << dec << endl; + retried = true; + codeset.clear(); + info.use_tounicode = false; + std::fill(cur_mapping.begin(), cur_mapping.end(), -1); + std::fill(width_list.begin(), width_list.end(), -1); + cur_code = -1; + if(param.debug) + { + map_outf.close(); + map_outf.open(map_filename); + } + continue; + } + } + if(!name_conflict_warned) + { + name_conflict_warned = true; + //TODO: may be resolved using advanced font properties? 
+ cerr << "Warning: encoding confliction detected in font: " << hex << info.id << dec << endl; + } + } + + { + double cur_width = 0; + if(font_8bit) + { + cur_width = font_8bit->getWidth(cur_code); + } + else + { + char buf[2]; + buf[0] = (cur_code >> 8) & 0xff; + buf[1] = (cur_code & 0xff); + cur_width = font_cid->getWidth(buf, 2) ; + } + + cur_width /= info.font_size_scale; + + if(u == ' ') + { + /* + * Internet Explorer will ignore `word-spacing` if + * the width of the 'space' glyph is 0 + * + * space_width==0 often means no spaces are used in the PDF + * so setting it to be 0.001 should be safe + */ + if(equal(cur_width, 0)) + cur_width = 0.001; + + info.space_width = cur_width; + has_space = true; + } + + width_list[mapped_code] = (int)floor(cur_width * info.em_size + 0.5); + } + + if(param.debug) + { + map_outf << hex << cur_code << ' ' << mapped_code << ' ' << u << endl; + } + } + + ffw_set_widths(width_list.data(), max_key + 1, param.stretch_narrow_glyph, param.squeeze_wide_glyph); + + ffw_reencode_raw(cur_mapping.data(), max_key + 1, 1); + + // In some space offsets in HTML, we insert a ' ' there in order to improve text copy&paste + // We need to make sure that ' ' is in the font, otherwise it would be very ugly if you select the text + // Might be a problem if ' ' is in the font, but not empty + if(!has_space) + { + if(font_8bit) + { + info.space_width = font_8bit->getWidth(' '); + } + else + { + char buf[2] = {0, ' '}; + info.space_width = font_cid->getWidth(buf, 2); + } + info.space_width /= info.font_size_scale; + + /* See comments above */ + if(equal(info.space_width,0)) + info.space_width = 0.001; + + ffw_add_empty_char((int32_t)' ', (int)floor(info.space_width * info.em_size + 0.5)); + if(param.debug) + { + cerr << "Missing space width in font " << hex << info.id << ": set to " << dec << info.space_width << endl; + } + } + + if(param.debug) + { + cerr << "space width: " << info.space_width << endl; + } + + if(ctu) + ctu->decRefCnt(); + } + + /* + 
* Step 3 + * Generate the font as desired + */ + + // Reencode to Unicode Full such that FontForge won't ditch unicode values larger than 0xFFFF + ffw_reencode_unicode_full(); + + // Due to a bug of Fontforge about pfa -> woff conversion + // we always generate TTF first, instead of the format specified by user + string cur_tmp_fn = (char*)str_fmt("%s/__tmp_font1.%s", param.tmp_dir.c_str(), "ttf"); + tmp_files.add(cur_tmp_fn); + string other_tmp_fn = (char*)str_fmt("%s/__tmp_font2.%s", param.tmp_dir.c_str(), "ttf"); + tmp_files.add(other_tmp_fn); + + ffw_save(cur_tmp_fn.c_str()); + + ffw_close(); + + /* + * Step 4 + * Font Hinting + */ + bool hinted = false; + + // Call external hinting program if specified + if(param.external_hint_tool != "") + { + hinted = (system((char*)str_fmt("%s \"%s\" \"%s\"", param.external_hint_tool.c_str(), cur_tmp_fn.c_str(), other_tmp_fn.c_str())) == 0); + } + + // Call internal hinting procedure if specified + if((!hinted) && (param.auto_hint)) + { + ffw_load_font(cur_tmp_fn.c_str()); + ffw_auto_hint(); + ffw_save(other_tmp_fn.c_str()); + ffw_close(); + hinted = true; + } + + if(hinted) + { + swap(cur_tmp_fn, other_tmp_fn); + } + + /* + * Step 5 + * Generate the font, load the metrics and set the embedding bits (fstype) + * + * Ascent/Descent are not used in PDF, and the values in PDF may be wrong or inconsistent (there are 3 sets of them) + * We need to reload in order to retrieve/fix accurate ascent/descent, some info won't be written to the font by fontforge until saved. + */ + string fn = (char*)str_fmt("%s/f%llx.%s", + (param.embed_font ? 
param.tmp_dir : param.dest_dir).c_str(), + info.id, param.font_format.c_str()); + + if(param.embed_font) + tmp_files.add(fn); + + ffw_load_font(cur_tmp_fn.c_str()); + ffw_fix_metric(); + ffw_get_metric(&info.ascent, &info.descent); + if(param.override_fstype) + ffw_override_fstype(); + ffw_save(fn.c_str()); + + ffw_close(); +} + + +const FontInfo * HTMLRenderer::install_font(GfxFont * font) +{ + assert(sizeof(long long) == 2*sizeof(int)); + + long long fn_id = (font == nullptr) ? 0 : hash_ref(font->getID()); + + auto iter = font_info_map.find(fn_id); + if(iter != font_info_map.end()) + return &(iter->second); + + long long new_fn_id = font_info_map.size(); + + auto cur_info_iter = font_info_map.insert(make_pair(fn_id, FontInfo())).first; + + FontInfo & new_font_info = cur_info_iter->second; + new_font_info.id = new_fn_id; + new_font_info.use_tounicode = true; + new_font_info.font_size_scale = 1.0; + + if(font == nullptr) + { + new_font_info.em_size = 0; + new_font_info.space_width = 0; + new_font_info.ascent = 0; + new_font_info.descent = 0; + new_font_info.is_type3 = false; + + export_remote_default_font(new_fn_id); + + return &(new_font_info); + } + + new_font_info.ascent = font->getAscent(); + new_font_info.descent = font->getDescent(); + new_font_info.is_type3 = (font->getType() == fontType3); + + if(param.debug) + { + cerr << "Install font " << hex << new_fn_id << dec + << ": (" << (font->getID()->num) << ' ' << (font->getID()->gen) << ") " + << (font->getName() ? 
font->getName()->getCString() : "") + << endl; + } + + if(new_font_info.is_type3) + { +#if ENABLE_SVG + if(param.process_type3) + { + install_embedded_font(font, new_font_info); + } + else + { + export_remote_default_font(new_fn_id); + } +#else + cerr << "Type 3 fonts are unsupported and will be rendered as Image" << endl; + export_remote_default_font(new_fn_id); +#endif + return &new_font_info; + } + if(font->getWMode()) { + cerr << "Writing mode is unsupported and will be rendered as Image" << endl; + export_remote_default_font(new_fn_id); + return &new_font_info; + } + + /* + * The 2nd parameter of locateFont should be true only for PS + * which does not make much sense in our case + * If we specify gFalse here, font_loc->locaType cannot be gfxFontLocResident + */ + if(auto * font_loc = font->locateFont(xref, nullptr)) + { + switch(font_loc -> locType) + { + case gfxFontLocEmbedded: + install_embedded_font(font, new_font_info); + break; + case gfxFontLocResident: + std::cerr << "Warning: Base 14 fonts should not be specially handled now. Please report a bug!" 
<< std::endl; + /* fall through */ + case gfxFontLocExternal: + install_external_font(font, new_font_info); + break; + default: + cerr << "TODO: other font loc" << endl; + export_remote_default_font(new_fn_id); + break; + } + delete font_loc; + } + else + { + export_remote_default_font(new_fn_id); + } + + return &new_font_info; +} + +void HTMLRenderer::install_embedded_font(GfxFont * font, FontInfo & info) +{ + auto path = dump_embedded_font(font, info); + + if(path != "") + { + embed_font(path, font, info); + export_remote_font(info, param.font_format, font); + } + else + { + export_remote_default_font(info.id); + } +} + +void HTMLRenderer::install_external_font(GfxFont * font, FontInfo & info) +{ + string fontname(font->getName()->getCString()); + + // resolve bad encodings in GB + auto iter = GB_ENCODED_FONT_NAME_MAP.find(fontname); + if(iter != GB_ENCODED_FONT_NAME_MAP.end()) + { + fontname = iter->second; + cerr << "Warning: workaround for font names in bad encodings." << endl; + } + + GfxFontLoc * localfontloc = font->locateFont(xref, nullptr); + + if(param.embed_external_font) + { + if(localfontloc != nullptr) + { + embed_font(string(localfontloc->path->getCString()), font, info); + export_remote_font(info, param.font_format, font); + delete localfontloc; + return; + } + else + { + cerr << "Cannot embed external font: f" << hex << info.id << dec << ' ' << fontname << endl; + // fallback to exporting by name + } + } + + // still try to get an idea of read ascent/descent + if(localfontloc != nullptr) + { + // fill in ascent/descent only, do not embed + embed_font(string(localfontloc->path->getCString()), font, info, true); + delete localfontloc; + } + else + { + info.ascent = font->getAscent(); + info.descent = font->getDescent(); + } + + export_local_font(info, font, fontname, ""); +} + +void HTMLRenderer::export_remote_font(const FontInfo & info, const string & format, GfxFont * font) +{ + string css_font_format; + if(format == "ttf") + { + css_font_format = 
"truetype"; + } + else if(format == "otf") + { + css_font_format = "opentype"; + } + else if(format == "woff") + { + css_font_format = "woff"; + } + else if(format == "eot") + { + css_font_format = "embedded-opentype"; + } + else if(format == "svg") + { + css_font_format = "svg"; + } + else + { + throw string("Warning: unknown font format: ") + format; + } + auto iter = FORMAT_MIME_TYPE_MAP.find(format); + if(iter == FORMAT_MIME_TYPE_MAP.end()) + { + throw string("Warning: unknown font format: ") + format; + } + string mime_type = iter->second; + + f_css.fs << "@font-face{" + << "font-family:" << CSS::FONT_FAMILY_CN << info.id << ";" + << "src:url("; + + { + auto fn = str_fmt("f%llx.%s", info.id, format.c_str()); + if(param.embed_font) + { + auto path = param.tmp_dir + "/" + (char*)fn; + ifstream fin(path, ifstream::binary); + if(!fin) + throw "Cannot locate font file: " + path; + f_css.fs << "'data:" + mime_type + ";base64," << Base64Stream(fin) << "'"; + } + else + { + f_css.fs << (char*)fn; + } + } + + f_css.fs << ")" + << "format(\"" << css_font_format << "\");" + << "}" // end of @font-face + << "." << CSS::FONT_FAMILY_CN << info.id << "{" + << "font-family:" << CSS::FONT_FAMILY_CN << info.id << ";" + << "line-height:" << round(info.ascent - info.descent) << ";" + << "font-style:normal;" + << "font-weight:normal;" + << "visibility:visible;" + << "}" + << endl; +} + +static string general_font_family(GfxFont * font) +{ + if(font->isFixedWidth()) + return "monospace"; + else if (font->isSerif()) + return "serif"; + else + return "sans-serif"; +} + +// TODO: this function is called when some font is unable to process, may use the name there as a hint +void HTMLRenderer::export_remote_default_font(long long fn_id) +{ + f_css.fs << "." 
<< CSS::FONT_FAMILY_CN << fn_id << "{font-family:sans-serif;visibility:hidden;}" << endl; +} + +void HTMLRenderer::export_local_font(const FontInfo & info, GfxFont * font, const string & original_font_name, const string & cssfont) +{ + f_css.fs << "." << CSS::FONT_FAMILY_CN << info.id << "{"; + f_css.fs << "font-family:" << ((cssfont == "") ? (original_font_name + "," + general_font_family(font)) : cssfont) << ";"; + + string fn = original_font_name; + for(auto & c : fn) + c = tolower(c); + + if(font->isBold() || (fn.find("bold") != string::npos)) + f_css.fs << "font-weight:bold;"; + else + f_css.fs << "font-weight:normal;"; + + if(fn.find("oblique") != string::npos) + f_css.fs << "font-style:oblique;"; + else if(font->isItalic() || (fn.find("italic") != string::npos)) + f_css.fs << "font-style:italic;"; + else + f_css.fs << "font-style:normal;"; + + f_css.fs << "line-height:" << round(info.ascent - info.descent) << ";"; + + f_css.fs << "visibility:visible;"; + + f_css.fs << "}" << endl; +} + +} //namespace pdf2htmlEX diff --git a/src/HTMLRenderer/form.cc b/src/HTMLRenderer/form.cc new file mode 100644 index 0000000..6b51622 --- /dev/null +++ b/src/HTMLRenderer/form.cc @@ -0,0 +1,76 @@ +/* + * form.cc + * + * Handling Forms + * + * by Simon Chenard + * 2014.07.25 + */ + +#include <iostream> +#include <sstream> +#include <string> + +#include "HTMLRenderer.h" +#include "util/namespace.h" +#include "util/misc.h" + +namespace pdf2htmlEX { + +using std::ofstream; +using std::cerr; + +void HTMLRenderer::process_form(ofstream & out) +{ + FormPageWidgets * widgets = cur_catalog->getPage(pageNum)->getFormWidgets(); + int num = widgets->getNumWidgets(); + + for(int i = 0; i < num; i++) + { + FormWidget * w = widgets->getWidget(i); + double x1, y1, x2, y2; + + w->getRect(&x1, &y1, &x2, &y2); + x1 = x1 * param.zoom; + x2 = x2 * param.zoom; + y1 = y1 * param.zoom; + y2 = y2 * param.zoom; + + double width = x2 - x1; + double height = y2 - y1; + + if(w->getType() == formText) + { 
+ double font_size = height / 2; + + out << "<input id=\"text-" << pageNum << "-" << i + << "\" class=\"" << CSS::INPUT_TEXT_CN + << "\" type=\"text\" value=\"\"" + << " style=\"position: absolute; left: " << x1 + << "px; bottom: " << y1 << "px;" + << " width: " << width << "px; height: " << std::to_string(height) + << "px; line-height: " << std::to_string(height) << "px; font-size: " + << font_size << "px;\" />" << endl; + } + else if(w->getType() == formButton) + { + //Ideally would check w->getButtonType() + //for more specific rendering + width += 3; + height += 3; + + out << "<div id=\"cb-" << pageNum << "-" << i + << "\" class=\"" << CSS::INPUT_RADIO_CN + << "\" style=\"position: absolute; left: " << x1 + << "px; bottom: " << y1 << "px;" + << " width: " << width << "px; height: " + << std::to_string(height) << "px; background-size: cover;\" ></div>" << endl; + } + else + { + cerr << "Unsupported form field detected" << endl; + } + } +} + +} diff --git a/src/HTMLRenderer/general.cc b/src/HTMLRenderer/general.cc new file mode 100644 index 0000000..6a54194 --- /dev/null +++ b/src/HTMLRenderer/general.cc @@ -0,0 +1,592 @@ +/* + * general.cc + * + * Handling general stuffs + * + * Copyright (C) 2012,2013,2014 Lu Wang <coolwanglu@gmail.com> + */ + +#include <cstdio> +#include <ostream> +#include <cmath> +#include <algorithm> +#include <vector> +#include <functional> + +#include <GlobalParams.h> + +#include "pdf2htmlEX-config.h" +#include "HTMLRenderer.h" +#include "HTMLTextLine.h" +#include "Base64Stream.h" + +#include "BackgroundRenderer/BackgroundRenderer.h" + +#include "util/namespace.h" +#include "util/ffw.h" +#include "util/math.h" +#include "util/path.h" +#include "util/css_const.h" +#include "util/encoding.h" + +namespace pdf2htmlEX { + +using std::fixed; +using std::flush; +using std::ostream; +using std::max; +using std::min_element; +using std::vector; +using std::abs; +using std::cerr; +using std::endl; + +HTMLRenderer::HTMLRenderer(const Param & param) 
+ :OutputDev() + ,param(param) + ,html_text_page(param, all_manager) + ,preprocessor(param) + ,tmp_files(param) + ,tracer(param) +{ + if(!(param.debug)) + { + //disable error messages of poppler + globalParams->setErrQuiet(gTrue); + } + + ffw_init(param.debug); + + cur_mapping.resize(0x10000); + cur_mapping2.resize(0x100); + width_list.resize(0x10000); + + /* + * For these states, usually the error will not be accumulated + * or may be handled well (whitespace_manager) + * So we can set a large eps here + */ + all_manager.vertical_align.set_eps(param.v_eps); + all_manager.whitespace .set_eps(param.h_eps); + all_manager.left .set_eps(param.h_eps); + /* + * For other states, we need accurate values + * optimization will be done separately + */ + all_manager.font_size .set_eps(EPS); + all_manager.letter_space.set_eps(EPS); + all_manager.word_space .set_eps(EPS); + all_manager.height .set_eps(EPS); + all_manager.width .set_eps(EPS); + all_manager.bottom .set_eps(EPS); + + tracer.on_char_drawn = + [this](double * box) { covered_text_detector.add_char_bbox(box); }; + tracer.on_char_clipped = + [this](double * box, bool partial) { covered_text_detector.add_char_bbox_clipped(box, partial); }; + tracer.on_non_char_drawn = + [this](double * box) { covered_text_detector.add_non_char_bbox(box); }; +} + +HTMLRenderer::~HTMLRenderer() +{ + ffw_finalize(); +} + +void HTMLRenderer::process(PDFDoc *doc) +{ + cur_doc = doc; + cur_catalog = doc->getCatalog(); + xref = doc->getXRef(); + + pre_process(doc); + + /////////////////// + // Process pages + + if(param.process_nontext) + { + bg_renderer = BackgroundRenderer::getBackgroundRenderer(param.bg_format, this, param); + if(!bg_renderer) + throw "Cannot initialize background renderer, unsupported format"; + bg_renderer->init(doc); + + fallback_bg_renderer = BackgroundRenderer::getFallbackBackgroundRenderer(this, param); + if (fallback_bg_renderer) + fallback_bg_renderer->init(doc); + } + + int page_count = (param.last_page - 
param.first_page + 1); + for(int i = param.first_page; i <= param.last_page ; ++i) + { + if (param.tmp_file_size_limit != -1 && tmp_files.get_total_size() > param.tmp_file_size_limit * 1024) { + cerr << "Stop processing, reach max size\n"; + break; + } + + cerr << "Working: " << (i-param.first_page) << "/" << page_count << '\r' << flush; + + if(param.split_pages) + { + // copy the string out, since we will reuse the buffer soon + string filled_template_filename = (char*)str_fmt(param.page_filename.c_str(), i); + auto page_fn = str_fmt("%s/%s", param.dest_dir.c_str(), filled_template_filename.c_str()); + f_curpage = new ofstream((char*)page_fn, ofstream::binary); + if(!(*f_curpage)) + throw string("Cannot open ") + (char*)page_fn + " for writing"; + set_stream_flags((*f_curpage)); + + cur_page_filename = filled_template_filename; + } + + doc->displayPage(this, i, + text_zoom_factor() * DEFAULT_DPI, text_zoom_factor() * DEFAULT_DPI, + 0, + (!(param.use_cropbox)), + true, // crop + false, // printing + nullptr, nullptr, nullptr, nullptr); + + if(param.split_pages) + { + delete f_curpage; + f_curpage = nullptr; + } + } + if(page_count >= 0) + cerr << "Working: " << page_count << "/" << page_count; + cerr << endl; + + //////////////////////// + // Process Outline + if(param.process_outline) + process_outline(); + + post_process(); + + bg_renderer = nullptr; + fallback_bg_renderer = nullptr; + + cerr << endl; +} + +void HTMLRenderer::setDefaultCTM(double *ctm) +{ + memcpy(default_ctm, ctm, sizeof(default_ctm)); +} + +void HTMLRenderer::startPage(int pageNum, GfxState *state, XRef * xref) +{ + covered_text_detector.reset(); + tracer.reset(state); + + this->pageNum = pageNum; + + html_text_page.set_page_size(state->getPageWidth(), state->getPageHeight()); + + reset_state(); +} + +void HTMLRenderer::endPage() { + long long wid = all_manager.width.install(html_text_page.get_width()); + long long hid = all_manager.height.install(html_text_page.get_height()); + + (*f_curpage) 
+ << "<div id=\"" << CSS::PAGE_FRAME_CN << pageNum + << "\" class=\"" << CSS::PAGE_FRAME_CN + << " " << CSS::WIDTH_CN << wid + << " " << CSS::HEIGHT_CN << hid + << "\" data-page-no=\"" << pageNum << "\">" + << "<div class=\"" << CSS::PAGE_CONTENT_BOX_CN + << " " << CSS::PAGE_CONTENT_BOX_CN << pageNum + << " " << CSS::WIDTH_CN << wid + << " " << CSS::HEIGHT_CN << hid + << "\">"; + + /* + * When split_pages is on, f_curpage points to the current page file + * and we want to output empty frames in f_pages.fs + */ + if(param.split_pages) + { + f_pages.fs + << "<div id=\"" << CSS::PAGE_FRAME_CN << pageNum + << "\" class=\"" << CSS::PAGE_FRAME_CN + << " " << CSS::WIDTH_CN << wid + << " " << CSS::HEIGHT_CN << hid + << "\" data-page-no=\"" << pageNum + << "\" data-page-url=\""; + + writeAttribute(f_pages.fs, cur_page_filename); + f_pages.fs << "\">"; + } + + if(param.process_nontext) + { + if (bg_renderer->render_page(cur_doc, pageNum)) + { + bg_renderer->embed_image(pageNum); + } + else if (fallback_bg_renderer) + { + if (fallback_bg_renderer->render_page(cur_doc, pageNum)) + fallback_bg_renderer->embed_image(pageNum); + } + } + + // dump all text + html_text_page.dump_text(*f_curpage); + html_text_page.dump_css(f_css.fs); + html_text_page.clear(); + + // process form + if(param.process_form) + process_form(*f_curpage); + + // process links before the page is closed + cur_doc->processLinks(this, pageNum); + + // close box + (*f_curpage) << "</div>"; + + // dump info for js + // TODO: create a function for this + // BE CAREFUL WITH ESCAPES + { + (*f_curpage) << "<div class=\"" << CSS::PAGE_DATA_CN << "\" data-data='{"; + + //default CTM + (*f_curpage) << "\"ctm\":["; + for(int i = 0; i < 6; ++i) + { + if(i > 0) (*f_curpage) << ","; + (*f_curpage) << round(default_ctm[i]); + } + (*f_curpage) << "]"; + + (*f_curpage) << "}'></div>"; + } + + // close page + (*f_curpage) << "</div>" << endl; + + if(param.split_pages) + { + f_pages.fs << "</div>" << endl; + } +} + +void 
HTMLRenderer::pre_process(PDFDoc * doc) +{ + preprocessor.process(doc); + + /* + * determine scale factors + */ + { + vector<double> zoom_factors; + + if(is_positive(param.zoom)) + { + zoom_factors.push_back(param.zoom); + } + + if(is_positive(param.fit_width)) + { + zoom_factors.push_back((param.fit_width) / preprocessor.get_max_width()); + } + + if(is_positive(param.fit_height)) + { + zoom_factors.push_back((param.fit_height) / preprocessor.get_max_height()); + } + + double zoom = (zoom_factors.empty() ? 1.0 : (*min_element(zoom_factors.begin(), zoom_factors.end()))); + + text_scale_factor1 = max<double>(zoom, param.font_size_multiplier); + text_scale_factor2 = zoom / text_scale_factor1; + } + + // we may output utf8 characters, so always use binary + { + /* + * If embed-css + * we have to keep the generated css file into a temporary place + * and embed it into the main html later + * + * otherwise + * leave it in param.dest_dir + */ + + auto fn = (param.embed_css) + ? str_fmt("%s/__css", param.tmp_dir.c_str()) + : str_fmt("%s/%s", param.dest_dir.c_str(), param.css_filename.c_str()); + + if(param.embed_css) + tmp_files.add((char*)fn); + + f_css.path = (char*)fn; + f_css.fs.open(f_css.path, ofstream::binary); + if(!f_css.fs) + throw string("Cannot open ") + (char*)fn + " for writing"; + set_stream_flags(f_css.fs); + } + + if (param.process_outline) + { + /* + * The logic for outline is similar to css + */ + + auto fn = (param.embed_outline) + ? 
str_fmt("%s/__outline", param.tmp_dir.c_str()) + : str_fmt("%s/%s", param.dest_dir.c_str(), param.outline_filename.c_str()); + + if(param.embed_outline) + tmp_files.add((char*)fn); + + f_outline.path = (char*)fn; + f_outline.fs.open(f_outline.path, ofstream::binary); + if(!f_outline.fs) + throw string("Cannot open") + (char*)fn + " for writing"; + + // might not be necessary + set_stream_flags(f_outline.fs); + } + + { + /* + * we have to keep the html file for pages into a temporary place + * because we'll have to embed css before it + * + * Otherwise just generate it + */ + auto fn = str_fmt("%s/__pages", param.tmp_dir.c_str()); + tmp_files.add((char*)fn); + + f_pages.path = (char*)fn; + f_pages.fs.open(f_pages.path, ofstream::binary); + if(!f_pages.fs) + throw string("Cannot open ") + (char*)fn + " for writing"; + set_stream_flags(f_pages.fs); + } + + if(param.split_pages) + { + f_curpage = nullptr; + } + else + { + f_curpage = &f_pages.fs; + } +} + +void HTMLRenderer::post_process(void) +{ + dump_css(); + + // close files if they opened + if (param.process_outline) + { + f_outline.fs.close(); + } + f_pages.fs.close(); + f_css.fs.close(); + + // build the main HTML file + ofstream output; + { + auto fn = str_fmt("%s/%s", param.dest_dir.c_str(), param.output_filename.c_str()); + output.open((char*)fn, ofstream::binary); + if(!output) + throw string("Cannot open ") + (char*)fn + " for writing"; + set_stream_flags(output); + } + + // apply manifest + ifstream manifest_fin((char*)str_fmt("%s/%s", param.data_dir.c_str(), MANIFEST_FILENAME.c_str()), ifstream::binary); + if(!manifest_fin) + throw "Cannot open the manifest file"; + + bool embed_string = false; + string line; + long line_no = 0; + while(getline(manifest_fin, line)) + { + // trim space at both sides + { + static const char * whitespaces = " \t\n\v\f\r"; + auto idx1 = line.find_first_not_of(whitespaces); + if(idx1 == string::npos) + { + line.clear(); + } + else + { + auto idx2 = 
line.find_last_not_of(whitespaces); + assert(idx2 >= idx1); + line = line.substr(idx1, idx2 - idx1 + 1); + } + } + + ++line_no; + + if(line == "\"\"\"") + { + embed_string = !embed_string; + continue; + } + + if(embed_string) + { + output << line << endl; + continue; + } + + if(line.empty() || line[0] == '#') + continue; + + + if(line[0] == '@') + { + embed_file(output, param.data_dir + "/" + line.substr(1), "", true); + continue; + } + + if(line[0] == '$') + { + if(line == "$css") + { + embed_file(output, f_css.path, ".css", false); + } + else if (line == "$outline") + { + if (param.process_outline && param.embed_outline) + { + ifstream fin(f_outline.path, ifstream::binary); + if(!fin) + throw "Cannot open outline for reading"; + output << fin.rdbuf(); + output.clear(); // output will set fail big if fin is empty + } + } + else if (line == "$pages") + { + ifstream fin(f_pages.path, ifstream::binary); + if(!fin) + throw "Cannot open pages for reading"; + output << fin.rdbuf(); + output.clear(); // output will set fail bit if fin is empty + } + else + { + cerr << "Warning: manifest line " << line_no << ": Unknown content \"" << line << "\"" << endl; + } + continue; + } + + cerr << "Warning: unknown line in manifest: " << line << endl; + } +} + +void HTMLRenderer::set_stream_flags(std::ostream & out) +{ + // we output all ID's in hex + // browsers are not happy with scientific notations + out << hex << fixed; +} + +void HTMLRenderer::dump_css (void) +{ + all_manager.transform_matrix.dump_css(f_css.fs); + all_manager.vertical_align .dump_css(f_css.fs); + all_manager.letter_space .dump_css(f_css.fs); + all_manager.stroke_color .dump_css(f_css.fs); + all_manager.word_space .dump_css(f_css.fs); + all_manager.whitespace .dump_css(f_css.fs); + all_manager.fill_color .dump_css(f_css.fs); + all_manager.font_size .dump_css(f_css.fs); + all_manager.bottom .dump_css(f_css.fs); + all_manager.height .dump_css(f_css.fs); + all_manager.width .dump_css(f_css.fs); + all_manager.left 
.dump_css(f_css.fs); + all_manager.bgimage_size .dump_css(f_css.fs); + + // print css + if(param.printing) + { + double ps = print_scale(); + f_css.fs << CSS::PRINT_ONLY << "{" << endl; + all_manager.transform_matrix.dump_print_css(f_css.fs, ps); + all_manager.vertical_align .dump_print_css(f_css.fs, ps); + all_manager.letter_space .dump_print_css(f_css.fs, ps); + all_manager.stroke_color .dump_print_css(f_css.fs, ps); + all_manager.word_space .dump_print_css(f_css.fs, ps); + all_manager.whitespace .dump_print_css(f_css.fs, ps); + all_manager.fill_color .dump_print_css(f_css.fs, ps); + all_manager.font_size .dump_print_css(f_css.fs, ps); + all_manager.bottom .dump_print_css(f_css.fs, ps); + all_manager.height .dump_print_css(f_css.fs, ps); + all_manager.width .dump_print_css(f_css.fs, ps); + all_manager.left .dump_print_css(f_css.fs, ps); + all_manager.bgimage_size .dump_print_css(f_css.fs, ps); + f_css.fs << "}" << endl; + } +} + +void HTMLRenderer::embed_file(ostream & out, const string & path, const string & type, bool copy) +{ + string fn = get_filename(path); + string suffix = (type == "") ? 
get_suffix(fn) : type; + + auto iter = EMBED_STRING_MAP.find(suffix); + if(iter == EMBED_STRING_MAP.end()) + { + cerr << "Warning: unknown suffix: " << suffix << endl; + return; + } + + const auto & entry = iter->second; + + if(param.*(entry.embed_flag)) + { + ifstream fin(path, ifstream::binary); + if(!fin) + throw string("Cannot open file ") + path + " for embedding"; + out << entry.prefix_embed; + + if(entry.base64_encode) + { + out << Base64Stream(fin); + } + else + { + out << endl << fin.rdbuf(); + } + out.clear(); // out will set fail big if fin is empty + out << entry.suffix_embed << endl; + } + else + { + out << entry.prefix_external; + writeAttribute(out, fn); + out << entry.suffix_external << endl; + + if(copy) + { + ifstream fin(path, ifstream::binary); + if(!fin) + throw string("Cannot copy file: ") + path; + auto out_path = param.dest_dir + "/" + fn; + ofstream out(out_path, ofstream::binary); + if(!out) + throw string("Cannot open file ") + path + " for embedding"; + out << fin.rdbuf(); + out.clear(); // out will set fail big if fin is empty + } + } +} + +const std::string HTMLRenderer::MANIFEST_FILENAME = "manifest"; + +}// namespace pdf2htmlEX diff --git a/src/HTMLRenderer/image.cc b/src/HTMLRenderer/image.cc new file mode 100644 index 0000000..91ca767 --- /dev/null +++ b/src/HTMLRenderer/image.cc @@ -0,0 +1,83 @@ +/* + * image.cc + * + * Handling images + * + * by WangLu + * 2012.08.14 + */ + +#include "HTMLRenderer.h" +#include "util/namespace.h" + +namespace pdf2htmlEX { + +void HTMLRenderer::drawImage(GfxState * state, Object * ref, Stream * str, int width, int height, GfxImageColorMap * colorMap, GBool interpolate, int *maskColors, GBool inlineImg) +{ + tracer.draw_image(state); + + return OutputDev::drawImage(state,ref,str,width,height,colorMap,interpolate,maskColors,inlineImg); + +#if 0 + if(maskColors) + return; + + rgb8_image_t img(width, height); + auto imgview = view(img); + auto loc = imgview.xy_at(0,0); + + ImageStream * img_stream = 
new ImageStream(str, width, colorMap->getNumPixelComps(), colorMap->getBits()); + img_stream->reset(); + + for(int i = 0; i < height; ++i) + { + auto p = img_stream->getLine(); + for(int j = 0; j < width; ++j) + { + GfxRGB rgb; + colorMap->getRGB(p, &rgb); + + *loc = rgb8_pixel_t(colToByte(rgb.r), colToByte(rgb.g), colToByte(rgb.b)); + + p += colorMap->getNumPixelComps(); + + ++ loc.x(); + } + + loc = imgview.xy_at(0, i+1); + } + + png_write_view((format("i%|1$x|.png")%image_count).str(), imgview); + + img_stream->close(); + delete img_stream; + + close_line(); + + double ctm[6]; + memcpy(ctm, state->getCTM(), sizeof(ctm)); + ctm[4] = ctm[5] = 0.0; + html_fout << format("<img class=\"i t%2%\" style=\"left:%3%px;bottom:%4%px;width:%5%px;height:%6%px;\" src=\"i%|1$x|.png\" />") % image_count % install_transform_matrix(ctm) % state->getCurX() % state->getCurY() % width % height << endl; + + + ++ image_count; +#endif +} + +void HTMLRenderer::drawSoftMaskedImage(GfxState *state, Object *ref, Stream *str, + int width, int height, + GfxImageColorMap *colorMap, + GBool interpolate, + Stream *maskStr, + int maskWidth, int maskHeight, + GfxImageColorMap *maskColorMap, + GBool maskInterpolate) +{ + tracer.draw_image(state); + + return OutputDev::drawSoftMaskedImage(state,ref,str, // TODO really required? 
+ width,height,colorMap,interpolate, + maskStr, maskWidth, maskHeight, maskColorMap, maskInterpolate); +} + +} // namespace pdf2htmlEX diff --git a/src/HTMLRenderer/link.cc b/src/HTMLRenderer/link.cc new file mode 100644 index 0000000..3c90ab5 --- /dev/null +++ b/src/HTMLRenderer/link.cc @@ -0,0 +1,309 @@ +/* + * link.cc + * + * Handling links + * + * by WangLu + * 2012.09.25 + */ + +#include <iostream> +#include <sstream> +#include <algorithm> + +#include <Link.h> + +#include "HTMLRenderer.h" +#include "util/namespace.h" +#include "util/math.h" +#include "util/misc.h" +#include "util/encoding.h" +#include "util/css_const.h" + +namespace pdf2htmlEX { + +using std::ostringstream; +using std::min; +using std::max; +using std::cerr; +using std::endl; + +/* + * The detailed rectangle area of the link destination + * Will be parsed and performed by Javascript + * The string will be put into a HTML attribute, surrounded by single quotes + * So pay attention to the characters used here + */ +static string get_linkdest_detail_str(LinkDest * dest, Catalog * catalog, int & pageno) +{ + pageno = 0; + if(dest->isPageRef()) + { + auto pageref = dest->getPageRef(); + pageno = catalog->findPage(pageref.num, pageref.gen); + } + else + { + pageno = dest->getPageNum(); + } + + if(pageno <= 0) + { + return ""; + } + + ostringstream sout; + // dec + sout << "[" << pageno; + + if(dest) + { + switch(dest->getKind()) + { + case destXYZ: + { + sout << ",\"XYZ\","; + if(dest->getChangeLeft()) + sout << (dest->getLeft()); + else + sout << "null"; + sout << ","; + if(dest->getChangeTop()) + sout << (dest->getTop()); + else + sout << "null"; + sout << ","; + if(dest->getChangeZoom()) + sout << (dest->getZoom()); + else + sout << "null"; + } + break; + case destFit: + sout << ",\"Fit\""; + break; + case destFitH: + sout << ",\"FitH\","; + if(dest->getChangeTop()) + sout << (dest->getTop()); + else + sout << "null"; + break; + case destFitV: + sout << ",\"FitV\","; + if(dest->getChangeLeft()) + 
sout << (dest->getLeft()); + else + sout << "null"; + break; + case destFitR: + sout << ",\"FitR\"," + << (dest->getLeft()) << "," + << (dest->getBottom()) << "," + << (dest->getRight()) << "," + << (dest->getTop()); + break; + case destFitB: + sout << ",\"FitB\""; + break; + case destFitBH: + sout << ",\"FitBH\","; + if(dest->getChangeTop()) + sout << (dest->getTop()); + else + sout << "null"; + break; + case destFitBV: + sout << ",\"FitBV\","; + if(dest->getChangeLeft()) + sout << (dest->getLeft()); + else + sout << "null"; + break; + default: + break; + } + } + sout << "]"; + + return sout.str(); +} + +string HTMLRenderer::get_linkaction_str(LinkAction * action, string & detail) +{ + string dest_str; + detail = ""; + if(action) + { + auto kind = action->getKind(); + switch(kind) + { + case actionGoTo: + { + auto * real_action = dynamic_cast<LinkGoTo*>(action); + LinkDest * dest = nullptr; + if(auto _ = real_action->getDest()) + dest = _->copy(); + else if (auto _ = real_action->getNamedDest()) + dest = cur_catalog->findDest(_); + if(dest) + { + int pageno = 0; + detail = get_linkdest_detail_str(dest, cur_catalog, pageno); + if(pageno > 0) + { + dest_str = (char*)str_fmt("#%s%x", CSS::PAGE_FRAME_CN, pageno); + } + delete dest; + } + } + break; + case actionGoToR: + { + cerr << "TODO: actionGoToR is not implemented." << endl; + } + break; + case actionURI: + { + auto * real_action = dynamic_cast<LinkURI*>(action); + dest_str = real_action->getURI()->getCString(); + } + break; + case actionLaunch: + { + cerr << "TODO: actionLaunch is not implemented." 
<< endl; + } + break; + default: + cerr << "Warning: unknown annotation type: " << kind << endl; + break; + } + } + + return dest_str; +} + +/* + * Based on pdftohtml from poppler + * TODO: share rectangle draw with css-draw + */ +void HTMLRenderer::processLink(AnnotLink * al) +{ + string dest_detail_str; + string dest_str = get_linkaction_str(al->getAction(), dest_detail_str); + + if(!dest_str.empty()) + { + (*f_curpage) << "<a class=\"" << CSS::LINK_CN << "\" href=\""; + writeAttribute((*f_curpage), dest_str); + (*f_curpage) << "\""; + + if(!dest_detail_str.empty()) + (*f_curpage) << " data-dest-detail='" << dest_detail_str << "'"; + + (*f_curpage) << ">"; + } + + (*f_curpage) << "<div class=\"" << CSS::CSS_DRAW_CN << ' ' << CSS::TRANSFORM_MATRIX_CN + << all_manager.transform_matrix.install(default_ctm) + << "\" style=\""; + + double x,y,w,h; + double x1, y1, x2, y2; + al->getRect(&x1, &y1, &x2, &y2); + x = min<double>(x1, x2); + y = min<double>(y1, y2); + w = max<double>(x1, x2) - x; + h = max<double>(y1, y2) - y; + + double border_width = 0; + double border_top_bottom_width = 0; + double border_left_right_width = 0; + auto * border = al->getBorder(); + if(border) + { + border_width = border->getWidth(); + if(border_width > 0) + { + { + css_fix_rectangle_border_width(x1, y1, x2, y2, border_width, + x, y, w, h, + border_top_bottom_width, border_left_right_width); + + if(std::abs(border_top_bottom_width - border_left_right_width) < EPS) + (*f_curpage) << "border-width:" << round(border_top_bottom_width) << "px;"; + else + (*f_curpage) << "border-width:" << round(border_top_bottom_width) << "px " << round(border_left_right_width) << "px;"; + } + auto style = border->getStyle(); + switch(style) + { + case AnnotBorder::borderSolid: + (*f_curpage) << "border-style:solid;"; + break; + case AnnotBorder::borderDashed: + (*f_curpage) << "border-style:dashed;"; + break; + case AnnotBorder::borderBeveled: + (*f_curpage) << "border-style:outset;"; + break; + case 
AnnotBorder::borderInset: + (*f_curpage) << "border-style:inset;"; + break; + case AnnotBorder::borderUnderlined: + (*f_curpage) << "border-style:none;border-bottom-style:solid;"; + break; + default: + cerr << "Warning:Unknown annotation border style: " << style << endl; + (*f_curpage) << "border-style:solid;"; + } + + + auto color = al->getColor(); + double r,g,b; + if(color && (color->getSpace() == AnnotColor::colorRGB)) + { + const double * v = color->getValues(); + r = v[0]; + g = v[1]; + b = v[2]; + } + else + { + r = g = b = 0; + } + + (*f_curpage) << "border-color:rgb(" + << dec << (int)dblToByte(r) << "," << (int)dblToByte(g) << "," << (int)dblToByte(b) << hex + << ");"; + } + else + { + (*f_curpage) << "border-style:none;"; + } + } + else + { + (*f_curpage) << "border-style:none;"; + } + + tm_transform(default_ctm, x, y); + + (*f_curpage) << "position:absolute;" + << "left:" << round(x) << "px;" + << "bottom:" << round(y) << "px;" + << "width:" << round(w) << "px;" + << "height:" << round(h) << "px;"; + + // fix for IE + (*f_curpage) << "background-color:rgba(255,255,255,0.000001);"; + + (*f_curpage) << "\"></div>"; + + if(dest_str != "") + { + (*f_curpage) << "</a>"; + } +} + +}// namespace pdf2htmlEX diff --git a/src/HTMLRenderer/outline.cc b/src/HTMLRenderer/outline.cc new file mode 100644 index 0000000..12c3896 --- /dev/null +++ b/src/HTMLRenderer/outline.cc @@ -0,0 +1,74 @@ +/* + * outline.cc + * + * Handling Outline items + * + * by WangLu + * 2013.01.28 + */ + +#include <iostream> + +#include <Outline.h> +#include <goo/GooList.h> + +#include "HTMLRenderer.h" +#include "util/namespace.h" +#include "util/encoding.h" +#include "util/css_const.h" + +namespace pdf2htmlEX { + +using std::ostream; + +void HTMLRenderer::process_outline_items(GooList * items) +{ + if((!items) || (items->getLength() == 0)) + return; + + f_outline.fs << "<ul>"; + + for(int i = 0; i < items->getLength(); ++i) + { + OutlineItem * item = (OutlineItem*)(items->get(i)); + + string 
detail; + string dest = get_linkaction_str(item->getAction(), detail); + + // we don't care dest is empty or not. + f_outline.fs << "<li>" << "<a class=\"" << CSS::LINK_CN << "\" href=\""; + writeAttribute(f_outline.fs, dest); + f_outline.fs << "\""; + + if(!detail.empty()) + f_outline.fs << " data-dest-detail='" << detail << "'"; + + f_outline.fs << ">"; + + writeUnicodes(f_outline.fs, item->getTitle(), item->getTitleLength()); + + f_outline.fs << "</a>"; + + // check kids + item->open(); + if(item->hasKids()) + { + process_outline_items(item->getKids()); + } + item->close(); + f_outline.fs << "</li>"; + } + + f_outline.fs << "</ul>"; +} + +void HTMLRenderer::process_outline() +{ + Outline * outline = cur_doc->getOutline(); + if(!outline) + return; + + process_outline_items(outline->getItems()); +} + +}// namespace pdf2htmlEX diff --git a/src/HTMLRenderer/state.cc b/src/HTMLRenderer/state.cc new file mode 100644 index 0000000..f26b17f --- /dev/null +++ b/src/HTMLRenderer/state.cc @@ -0,0 +1,541 @@ +/* + * state.cc + * + * track PDF states + * + * Copyright (C) 2012,2013 Lu Wang <coolwanglu@gmail.com> + */ + +#include <cmath> +#include <algorithm> + +#include "HTMLRenderer.h" + +#include "util/namespace.h" +#include "util/math.h" + +namespace pdf2htmlEX { + +using std::max; +using std::abs; + +void HTMLRenderer::updateAll(GfxState * state) +{ + all_changed = true; + updateTextPos(state); +} +void HTMLRenderer::updateRise(GfxState * state) +{ + rise_changed = true; +} +void HTMLRenderer::updateTextPos(GfxState * state) +{ + text_pos_changed = true; + cur_tx = state->getLineX(); + cur_ty = state->getLineY(); +} +void HTMLRenderer::updateTextShift(GfxState * state, double shift) +{ + text_pos_changed = true; + cur_tx -= shift * 0.001 * state->getFontSize() * state->getHorizScaling(); +} +void HTMLRenderer::updateFont(GfxState * state) +{ + font_changed = true; +} +void HTMLRenderer::updateCTM(GfxState * state, double m11, double m12, double m21, double m22, double 
m31, double m32) +{ + ctm_changed = true; + tracer.update_ctm(state, m11, m12, m21, m22, m31, m32); +} +void HTMLRenderer::updateTextMat(GfxState * state) +{ + text_mat_changed = true; +} +void HTMLRenderer::updateHorizScaling(GfxState * state) +{ + hori_scale_changed = true; +} +void HTMLRenderer::updateCharSpace(GfxState * state) +{ + letter_space_changed = true; +} +void HTMLRenderer::updateWordSpace(GfxState * state) +{ + word_space_changed = true; +} +void HTMLRenderer::updateRender(GfxState * state) +{ + // currently Render is traced for color only + // might need something like render_changed later + fill_color_changed = true; + stroke_color_changed = true; +} +void HTMLRenderer::updateFillColorSpace(GfxState * state) +{ + fill_color_changed = true; +} +void HTMLRenderer::updateStrokeColorSpace(GfxState * state) +{ + stroke_color_changed = true; +} +void HTMLRenderer::updateFillColor(GfxState * state) +{ + fill_color_changed = true; +} +void HTMLRenderer::updateStrokeColor(GfxState * state) +{ + stroke_color_changed = true; +} +void HTMLRenderer::clip(GfxState * state) +{ + clip_changed = true; + tracer.clip(state); +} +void HTMLRenderer::eoClip(GfxState * state) +{ + clip_changed = true; + tracer.clip(state, true); +} +void HTMLRenderer::clipToStrokePath(GfxState * state) +{ + clip_changed = true; + tracer.clip_to_stroke_path(state); +} +void HTMLRenderer::reset_state() +{ + draw_text_scale = 1.0; + + cur_font_size = 0.0; + + memcpy(cur_text_tm, ID_MATRIX, sizeof(cur_text_tm)); + + // reset html_state + cur_text_state.font_info = install_font(nullptr); + cur_text_state.font_size = 0; + cur_text_state.fill_color.transparent = true; + cur_text_state.stroke_color.transparent = true; + cur_text_state.letter_space = 0; + cur_text_state.word_space = 0; + cur_text_state.vertical_align = 0; + + cur_line_state.x = 0; + cur_line_state.y = 0; + memcpy(cur_line_state.transform_matrix, ID_MATRIX, sizeof(cur_line_state.transform_matrix)); + + 
cur_line_state.is_char_covered = [this](int index) { return is_char_covered(index);}; + + cur_clip_state.xmin = 0; + cur_clip_state.xmax = 0; + cur_clip_state.ymin = 0; + cur_clip_state.ymax = 0; + + cur_tx = cur_ty = 0; + draw_tx = draw_ty = 0; + + reset_state_change(); + all_changed = true; +} +void HTMLRenderer::reset_state_change() +{ + all_changed = false; + + rise_changed = false; + text_pos_changed = false; + + font_changed = false; + ctm_changed = false; + text_mat_changed = false; + hori_scale_changed = false; + + letter_space_changed = false; + word_space_changed = false; + + fill_color_changed = false; + stroke_color_changed = false; + + clip_changed = false; +} + +template<class NewLineState> +void set_line_state(NewLineState & cur_ls, NewLineState new_ls) +{ + if(new_ls > cur_ls) + cur_ls = new_ls; +} + +void HTMLRenderer::check_state_change(GfxState * state) +{ + // DEPENDENCY WARNING + // don't adjust the order of state checking + + new_line_state = NLS_NONE; + + if(all_changed || clip_changed) + { + HTMLClipState new_clip_state; + state->getClipBBox(&new_clip_state.xmin, &new_clip_state.ymin, &new_clip_state.xmax, &new_clip_state.ymax); + if(!(equal(cur_clip_state.xmin, new_clip_state.xmin) + && equal(cur_clip_state.xmax, new_clip_state.xmax) + && equal(cur_clip_state.ymin, new_clip_state.ymin) + && equal(cur_clip_state.ymax, new_clip_state.ymax))) + { + cur_clip_state = new_clip_state; + set_line_state(new_line_state, NLS_NEWCLIP); + } + } + + bool need_recheck_position = false; + bool need_rescale_font = false; + bool draw_text_scale_changed = false; + + // save current info for later use + auto old_text_state = cur_text_state; + auto old_line_state = cur_line_state; + double old_tm[6]; + memcpy(old_tm, cur_text_tm, sizeof(old_tm)); + double old_draw_text_scale = draw_text_scale; + + // text position + // we've been tracking the text position positively in the update*** functions + if(all_changed || text_pos_changed) + { + need_recheck_position = 
true; + } + + // font name & size + if(all_changed || font_changed) + { + const FontInfo * new_font_info = install_font(state->getFont()); + + if(!(new_font_info->id == cur_text_state.font_info->id)) + { + // The width of the type 3 font text, if shown, is likely to be wrong + // So we will create separate (absolute positioned) blocks for them, such that it won't affect other text + if((new_font_info->is_type3 || cur_text_state.font_info->is_type3) && (!param.process_type3)) + { + set_line_state(new_line_state, NLS_NEWLINE); + } + else + { + set_line_state(new_line_state, NLS_NEWSTATE); + } + cur_text_state.font_info = new_font_info; + } + + /* + * For Type 3 fonts, we need to take type3_font_size_scale into consideration + */ + if((new_font_info->is_type3 || cur_text_state.font_info->is_type3) && param.process_type3) + need_rescale_font = true; + + double new_font_size = state->getFontSize(); + if(!equal(cur_font_size, new_font_size)) + { + need_rescale_font = true; + cur_font_size = new_font_size; + } + } + + // ctm & text ctm & hori scale & rise + if(all_changed || ctm_changed || text_mat_changed || hori_scale_changed || rise_changed) + { + double new_text_tm[6]; + + double m1[6]; + double m2[6]; + + //the matrix with horizontal_scale and rise + m1[0] = state->getHorizScaling(); + m1[3] = 1; + m1[5] = state->getRise(); + m1[1] = m1[2] = m1[4] = 0; + + tm_multiply(m2, state->getCTM(), state->getTextMat()); + tm_multiply(new_text_tm, m2, m1); + + if(!tm_equal(new_text_tm, cur_text_tm)) + { + need_recheck_position = true; + need_rescale_font = true; + memcpy(cur_text_tm, new_text_tm, sizeof(cur_text_tm)); + } + } + + // draw_text_tm, draw_text_scale + // depends: font size & ctm & text_ctm & hori scale & rise + if(need_rescale_font) + { + /* + * Rescale the font + * If the font-size is 1, and the matrix is [10,0,0,10,0,0], we would like to change it to + * font-size == 10 and matrix == [1,0,0,1,0,0], + * such that it will be easy and natural for web browsers + */ + 
double new_draw_text_tm[6]; + memcpy(new_draw_text_tm, cur_text_tm, sizeof(new_draw_text_tm)); + + // see how the tm (together with text_scale_factor2) would change the vector (0,1) + double new_draw_text_scale = 1.0/text_scale_factor2 * hypot(new_draw_text_tm[2], new_draw_text_tm[3]); + + double new_draw_font_size = cur_font_size; + + if(is_positive(new_draw_text_scale)) + { + // scale both font size and matrix + new_draw_font_size *= new_draw_text_scale; + for(int i = 0; i < 4; ++i) + new_draw_text_tm[i] /= new_draw_text_scale; + } + else + { + new_draw_text_scale = 1.0; + } + + if(is_positive(-new_draw_font_size)) + { + // CSS cannot handle flipped pages + new_draw_font_size *= -1; + + for(int i = 0; i < 4; ++i) + new_draw_text_tm[i] *= -1; + } + + if(!(equal(new_draw_text_scale, draw_text_scale))) + { + draw_text_scale_changed = true; + draw_text_scale = new_draw_text_scale; + } + + if(!equal(new_draw_font_size, cur_text_state.font_size)) + { + set_line_state(new_line_state, NLS_NEWSTATE); + cur_text_state.font_size = new_draw_font_size; + } + + if(!tm_equal(new_draw_text_tm, cur_line_state.transform_matrix, 4)) + { + set_line_state(new_line_state, NLS_NEWLINE); + memcpy(cur_line_state.transform_matrix, new_draw_text_tm, sizeof(cur_line_state.transform_matrix)); + } + } + + // see if the new line is compatible with the current line with proper position shift + // don't bother doing the heavy job when (new_line_state == NLS_NEWLINE) + // depends: text position & transformation + if(need_recheck_position && (new_line_state < NLS_NEWLINE)) + { + // TM[4] and/or TM[5] have been changed + // To find an offset (dx,dy), which would cancel the effect + /* + * CurTM * (cur_tx, cur_ty, 1)^T = OldTM * (draw_tx + dx, draw_ty + dy, 1)^T + * + * the first 4 elements of CurTM and OldTM should be the proportional + * otherwise the following text cannot be parallel + * + * NOTE: + * dx,dy are handled by the old state. 
so they should be multiplied by old_draw_text_scale + */ + + bool merged = false; + double dx = 0; + double dy = 0; + if(tm_equal(old_line_state.transform_matrix, cur_line_state.transform_matrix, 4)) + { + double det = old_tm[0] * old_tm[3] - old_tm[1] * old_tm[2]; + if(!equal(det, 0)) + { + double lhs1 = cur_text_tm[0] * cur_tx + cur_text_tm[2] * cur_ty + cur_text_tm[4] - old_tm[0] * draw_tx - old_tm[2] * draw_ty - old_tm[4]; + double lhs2 = cur_text_tm[1] * cur_tx + cur_text_tm[3] * cur_ty + cur_text_tm[5] - old_tm[1] * draw_tx - old_tm[3] * draw_ty - old_tm[5]; + /* + * Now the equation system becomes + * + * lhs1 = OldTM[0] * dx + OldTM[2] * dy + * lhs2 = OldTM[1] * dx + OldTM[3] * dy + */ + + double inverted[4]; + inverted[0] = old_tm[3] / det; + inverted[1] = -old_tm[1] / det; + inverted[2] = -old_tm[2] / det; + inverted[3] = old_tm[0] / det; + dx = inverted[0] * lhs1 + inverted[2] * lhs2; + dy = inverted[1] * lhs1 + inverted[3] * lhs2; + if(equal(dy, 0)) + { + // text on a same horizontal line, we can insert positive or negative x-offsets + merged = true; + } + else if(param.optimize_text) + { + // otherwise we merge the lines only when + // - text are not shifted to the left too much + // - text are not moved too high or too low + if((dx * old_draw_text_scale) >= -param.space_threshold * old_text_state.em_size() - EPS) + { + double oldymin = old_text_state.font_info->descent * old_text_state.font_size; + double oldymax = old_text_state.font_info->ascent * old_text_state.font_size; + double ymin = dy * old_draw_text_scale + cur_text_state.font_info->descent * cur_text_state.font_size; + double ymax = dy * old_draw_text_scale + cur_text_state.font_info->ascent * cur_text_state.font_size; + if((ymin <= oldymax + EPS) && (ymax >= oldymin - EPS)) + { + merged = true; + } + } + } + } + //else no solution + } + // else: different rotation: force new line + + if(merged && !equal(state->getHorizScaling(), 0)) + { + html_text_page.get_cur_line()->append_offset(dx * 
old_draw_text_scale / state->getHorizScaling()); + if(equal(dy, 0)) + { + cur_text_state.vertical_align = 0; + } + else + { + cur_text_state.vertical_align = (dy * old_draw_text_scale); + set_line_state(new_line_state, NLS_NEWSTATE); + } + draw_tx = cur_tx; + draw_ty = cur_ty; + } + else + { + set_line_state(new_line_state, NLS_NEWLINE); + } + } + else + { + // no vertical shift if no need to check position + cur_text_state.vertical_align = 0; + } + + // letter space + // depends: draw_text_scale + if(all_changed || letter_space_changed || draw_text_scale_changed) + { + double new_letter_space = state->getCharSpace() * draw_text_scale; + if(!equal(new_letter_space, cur_text_state.letter_space)) + { + cur_text_state.letter_space = new_letter_space; + set_line_state(new_line_state, NLS_NEWSTATE); + } + } + + // word space + // depends draw_text_scale + if(all_changed || word_space_changed || draw_text_scale_changed) + { + double new_word_space = state->getWordSpace() * draw_text_scale; + if(!equal(new_word_space, cur_text_state.word_space)) + { + cur_text_state.word_space = new_word_space; + set_line_state(new_line_state, NLS_NEWSTATE); + } + } + + // fill color + if((!(param.fallback)) && (all_changed || fill_color_changed)) + { + // * PDF Spec. Table 106 –Text rendering modes + static const char FILL[8] = { true, false, true, false, true, false, true, false }; + + int idx = state->getRender(); + assert((idx >= 0) && (idx < 8)); + Color new_fill_color; + if(FILL[idx]) + { + new_fill_color.transparent = false; + state->getFillRGB(&new_fill_color.rgb); + } + else + { + new_fill_color.transparent = true; + } + if(!(new_fill_color == cur_text_state.fill_color)) + { + cur_text_state.fill_color = new_fill_color; + set_line_state(new_line_state, NLS_NEWSTATE); + } + } + + // stroke color + if((!(param.fallback)) && (all_changed || stroke_color_changed)) + { + // * PDF Spec. 
Table 106 – Text rendering modes + static const char STROKE[8] = { false, true, true, false, false, true, true, false }; + + int idx = state->getRender(); + assert((idx >= 0) && (idx < 8)); + Color new_stroke_color; + // stroke + if(STROKE[idx]) + { + new_stroke_color.transparent = false; + state->getStrokeRGB(&new_stroke_color.rgb); + } + else + { + new_stroke_color.transparent = true; + } + if(!(new_stroke_color == cur_text_state.stroke_color)) + { + cur_text_state.stroke_color = new_stroke_color; + set_line_state(new_line_state, NLS_NEWSTATE); + } + } + + reset_state_change(); +} + +void HTMLRenderer::prepare_text_line(GfxState * state) +{ + if(!(html_text_page.get_cur_line())) + new_line_state = NLS_NEWCLIP; + + if(new_line_state >= NLS_NEWCLIP) + { + html_text_page.clip(cur_clip_state); + } + + if(new_line_state >= NLS_NEWLINE) + { + // update position such that they will be recorded by text_line_buf + double rise_x, rise_y; + state->textTransformDelta(0, state->getRise(), &rise_x, &rise_y); + state->transform(state->getCurX() + rise_x, state->getCurY() + rise_y, &cur_line_state.x, &cur_line_state.y); + + if (param.correct_text_visibility) + cur_line_state.first_char_index = get_char_count(); + + html_text_page.open_new_line(cur_line_state); + + cur_text_state.vertical_align = 0; + + //resync position + draw_ty = cur_ty; + draw_tx = cur_tx; + } + else + { + // align horizontal position + // try to merge with the last line if possible + double target = (cur_tx - draw_tx) * draw_text_scale; + if(!equal(target, 0)) + { + html_text_page.get_cur_line()->append_offset(target); + draw_tx += target / draw_text_scale; + } + } + + if(new_line_state != NLS_NONE) + { + html_text_page.get_cur_line()->append_state(cur_text_state); + } +} + +} //namespace pdf2htmlEX diff --git a/src/HTMLRenderer/text.cc b/src/HTMLRenderer/text.cc new file mode 100644 index 0000000..e58a17a --- /dev/null +++ b/src/HTMLRenderer/text.cc @@ -0,0 +1,166 @@ +/* + * text.cc + * + * Handling text & 
font, and relative stuffs + * + * Copyright (C) 2012 Lu Wang <coolwanglu@gmail.com> + */ + + +#include <algorithm> + +#include "HTMLRenderer.h" + +#include "util/namespace.h" +#include "util/unicode.h" + +//#define HR_DEBUG(x) (x) +#define HR_DEBUG(x) + +namespace pdf2htmlEX { + +using std::none_of; +using std::cerr; +using std::endl; + +void HTMLRenderer::drawString(GfxState * state, GooString * s) +{ + if(s->getLength() == 0) + return; + + auto font = state->getFont(); + double cur_letter_space = state->getCharSpace(); + double cur_word_space = state->getWordSpace(); + double cur_horiz_scaling = state->getHorizScaling(); + + + // Writing mode fonts and Type 3 fonts are rendered as images + // I don't find a way to display writing mode fonts in HTML except for one div for each character, which is too costly + // For type 3 fonts, due to the font matrix, still it's hard to show it on HTML + if( (font == nullptr) + || (font->getWMode()) + || ((font->getType() == fontType3) && (!param.process_type3)) + ) + { + return; + } + + // see if the line has to be closed due to state change + check_state_change(state); + prepare_text_line(state); + + // Now ready to output + // get the unicodes + char *p = s->getCString(); + int len = s->getLength(); + + //accumulated displacement of chars in this string, in text object space + double dx = 0; + double dy = 0; + //displacement of current char, in text object space, including letter space but not word space. 
+ double ddx, ddy; + //advance of current char, in glyph space + double ax, ay; + //origin of current char, in glyph space + double ox, oy; + + int uLen; + + CharCode code; + Unicode *u = nullptr; + + HR_DEBUG(printf("HTMLRenderer::drawString:len=%d\n", len)); + + while (len > 0) + { + auto n = font->getNextChar(p, len, &code, &u, &uLen, &ax, &ay, &ox, &oy); + HR_DEBUG(printf("HTMLRenderer::drawString:unicode=%lc(%d)\n", (wchar_t)u[0], u[0])); + + if(!(equal(ox, 0) && equal(oy, 0))) + { + cerr << "TODO: non-zero origins" << endl; + } + ddx = ax * cur_font_size + cur_letter_space; + ddy = ay * cur_font_size; + tracer.draw_char(state, dx, dy, ax, ay); + + bool is_space = false; + if (n == 1 && *p == ' ') + { + /* + * This is by standard + * however some PDF will use ' ' as a normal encoding slot + * such that it will be mapped to other unicodes + * In that case, when space_as_offset is on, we will simply ignore that character... + * + * Checking mapped unicode may or may not work + * There are always ugly PDF files with no useful info at all. + */ + is_space = true; + } + + if(is_space && (param.space_as_offset)) + { + html_text_page.get_cur_line()->append_padding_char(); + // ignore horiz_scaling, as it has been merged into CTM + html_text_page.get_cur_line()->append_offset((ax * cur_font_size + cur_letter_space + cur_word_space) * draw_text_scale); + } + else + { + if((param.decompose_ligature) && (uLen > 1) && none_of(u, u+uLen, is_illegal_unicode)) + { + html_text_page.get_cur_line()->append_unicodes(u, uLen, ddx); + } + else + { + Unicode uu; + if(cur_text_state.font_info->use_tounicode) + { + uu = check_unicode(u, uLen, code, font); + } + else + { + uu = unicode_from_font(code, font); + } + html_text_page.get_cur_line()->append_unicodes(&uu, 1, ddx); + /* + * In PDF, word_space is appended if (n == 1 and *p = ' ') + * but in HTML, word_space is appended if (uu == ' ') + */ + int space_count = (is_space ? 1 : 0) - ((uu == ' ') ? 
1 : 0); + if(space_count != 0) + { + html_text_page.get_cur_line()->append_offset(cur_word_space * draw_text_scale * space_count); + } + } + } + + dx += ddx * cur_horiz_scaling; + dy += ddy; + if (is_space) + dx += cur_word_space * cur_horiz_scaling; + + p += n; + len -= n; + } + + cur_tx += dx; + cur_ty += dy; + + draw_tx += dx; + draw_ty += dy; +} + +bool HTMLRenderer::is_char_covered(int index) +{ + auto covered = covered_text_detector.get_chars_covered(); + if (index < 0 || index >= (int)covered.size()) + { + std::cerr << "Warning: HTMLRenderer::is_char_covered: index out of bound: " + << index << ", size: " << covered.size() <<endl; + return false; + } + return covered[index]; +} + +} // namespace pdf2htmlEX diff --git a/src/HTMLState.h b/src/HTMLState.h new file mode 100644 index 0000000..ef7e29f --- /dev/null +++ b/src/HTMLState.h @@ -0,0 +1,82 @@ +/* + * Header file for HTMLState + * Copyright (C) 2013 Lu Wang <coolwanglu@gmail.com> + */ +#ifndef HTMLSTATE_H__ +#define HTMLSTATE_H__ + +#include <functional> + +#include "Color.h" + +namespace pdf2htmlEX { + +struct FontInfo +{ + long long id; + bool use_tounicode; + int em_size; + double space_width; + double ascent, descent; + bool is_type3; + /* + * As Type 3 fonts have a font matrix + * a glyph of 1pt can be very large or very small + * however it might not be true for other font formats such as ttf + * + * Therefore when we save a Type 3 font into ttf, + * we have to scale the font to about 1, + * then apply the scaling when using the font + * + * The scaling factor is stored as font_size_scale + * + * The value is 1 for other fonts + */ + double font_size_scale; +}; + +struct HTMLTextState +{ + const FontInfo * font_info; + double font_size; + Color fill_color; + Color stroke_color; + double letter_space; + double word_space; + + // relative to the previous state + double vertical_align; + + // the offset cause by a single ' ' char + double single_space_offset(void) const { + double offset = word_space 
+ letter_space; + if(font_info->em_size != 0) + offset += font_info->space_width * font_size; + return offset; + } + // calculate em_size of this state + double em_size(void) const { + return font_size * (font_info->ascent - font_info->descent); + } +}; + +struct HTMLLineState +{ + double x,y; + double transform_matrix[4]; + // The page-cope char index(in drawing order) of the first char in this line. + int first_char_index; + // A function to determine whether a char is covered at a given index. + std::function<bool(int)> is_char_covered; + + HTMLLineState(): first_char_index(-1) { } +}; + +struct HTMLClipState +{ + double xmin, xmax, ymin, ymax; +}; + +} // namespace pdf2htmlEX + +#endif //HTMLSTATE_H__ diff --git a/src/HTMLTextLine.cc b/src/HTMLTextLine.cc new file mode 100644 index 0000000..a0be286 --- /dev/null +++ b/src/HTMLTextLine.cc @@ -0,0 +1,734 @@ +/* + * HTMLTextLine.cc + * + * Generate and optimized HTML for one line + * + * Copyright (C) 2012,2013 Lu Wang <coolwanglu@gmail.com> + */ + +#include <cmath> +#include <algorithm> + +#include "HTMLTextLine.h" + +#include "util/encoding.h" +#include "util/css_const.h" + +namespace pdf2htmlEX { + +using std::min; +using std::max; +using std::vector; +using std::ostream; +using std::cerr; +using std::endl; +using std::find; +using std::abs; + +HTMLTextLine::HTMLTextLine (const HTMLLineState & line_state, const Param & param, AllStateManager & all_manager) + :param(param) + ,all_manager(all_manager) + ,line_state(line_state) + ,clip_x1(0) + ,clip_y1(0) + ,width(0) +{ } + +void HTMLTextLine::append_unicodes(const Unicode * u, int l, double width) +{ + if (l == 1) + text.push_back(min(u[0], (unsigned)INT_MAX)); + else if (l > 1) + { + text.push_back(- decomposed_text.size() - 1); + decomposed_text.emplace_back(); + decomposed_text.back().assign(u, u + l); + } + this->width += width; +} + +void HTMLTextLine::append_offset(double width) +{ + /* + * If the last offset is very thin, we can ignore it and directly use 
it + * But this should not happen often, and we will also filter near-zero offsets when outputting them + * So don't check it. + * + * Offset must be appended immediately after the last real (non-padding) char, or the text optimizing + * algorithm may be confused: it may wrongly convert offsets at the beginning of a line to word-space. + */ + + auto offset_idx = text.size(); + while (offset_idx > 0 && text[offset_idx - 1] == 0) + --offset_idx; + if((!offsets.empty()) && (offsets.back().start_idx == offset_idx)) + offsets.back().width += width; + else + offsets.emplace_back(offset_idx, width); + this->width += width; +} + +void HTMLTextLine::append_state(const HTMLTextState & text_state) +{ + if(states.empty() || (states.back().start_idx != text.size())) + { + states.emplace_back(); + states.back().start_idx = text.size(); + states.back().hash_umask = 0; + } + + HTMLTextState & last_state = states.back(); + last_state = text_state; + //apply font scale + last_state.font_size *= last_state.font_info->font_size_scale; +} + +void HTMLTextLine::dump_char(std::ostream & out, int pos) +{ + int c = text[pos]; + if (c > 0) + { + Unicode u = c; + writeUnicodes(out, &u, 1); + } + else if (c < 0) + { + auto dt = decomposed_text[- c - 1]; + writeUnicodes(out, &dt.front(), dt.size()); + } +} + +void HTMLTextLine::dump_chars(ostream & out, int begin, int len) +{ + static const Color transparent(0, 0, 0, true); + + if (line_state.first_char_index < 0) + { + for (int i = 0; i < len; i++) + dump_char(out, begin + i); + return; + } + + bool invisible_group_open = false; + for(int i = 0; i < len; i++) + { + if (!line_state.is_char_covered(line_state.first_char_index + begin + i)) //visible + { + if (invisible_group_open) + { + invisible_group_open = false; + out << "</span>"; + } + dump_char(out, begin + i); + } + else + { + if (!invisible_group_open) + { + out << "<span class=\"" << all_manager.fill_color.get_css_class_name() + << all_manager.fill_color.install(transparent) << " " << 
all_manager.stroke_color.get_css_class_name() + << all_manager.stroke_color.install(transparent) << "\">"; + invisible_group_open = true; + } + dump_char(out, begin + i); + } + } + if (invisible_group_open) + out << "</span>"; +} + +void HTMLTextLine::dump_text(ostream & out) +{ + /* + * Each Line is an independent absolute positioned block + * so even we have a few states or offsets, we may omit them + */ + if(text.empty()) + return; + + if(states.empty() || (states[0].start_idx != 0)) + { + cerr << "Warning: text without a style! Must be a bug in pdf2htmlEX" << endl; + return; + } + + // Start Output + { + // open <div> for the current text line + out << "<div class=\"" << CSS::LINE_CN + << " " << CSS::TRANSFORM_MATRIX_CN << all_manager.transform_matrix.install(line_state.transform_matrix) + << " " << CSS::LEFT_CN << all_manager.left.install(line_state.x - clip_x1) + << " " << CSS::HEIGHT_CN << all_manager.height.install(ascent) + << " " << CSS::BOTTOM_CN << all_manager.bottom.install(line_state.y - clip_y1) + ; + // it will be closed by the first state + } + + std::vector<State*> stack; + // a special safeguard in the bottom + stack.push_back(nullptr); + + //accumulated horizontal offset; + double dx = 0; + + // whenever a negative offset appears, we should not pop out that <span> + // otherwise the effect of negative margin-left would disappear + size_t last_text_pos_with_negative_offset = 0; + size_t cur_text_idx = 0; + + auto cur_offset_iter = offsets.begin(); + for(auto state_iter2 = states.begin(), state_iter1 = state_iter2++; + state_iter1 != states.end(); + ++state_iter1, ++state_iter2) + { + // export current state, find a closest parent + { + // greedy + double vertical_align = state_iter1->vertical_align; + int best_cost = State::HASH_ID_COUNT + 1; + // we have a nullptr at the beginning, so no need to check for rend + for(auto iter = stack.rbegin(); *iter; ++iter) + { + int cost = state_iter1->diff(**iter); + if(!equal(vertical_align,0)) + ++cost; + + 
if(cost < best_cost) + { + while(stack.back() != *iter) + { + stack.back()->end(out); + stack.pop_back(); + } + best_cost = cost; + state_iter1->vertical_align = vertical_align; + + if(best_cost == 0) + break; + } + + // cannot go further + if((*iter)->start_idx <= last_text_pos_with_negative_offset) + break; + + vertical_align += (*iter)->vertical_align; + } + // + state_iter1->ids[State::VERTICAL_ALIGN_ID] = all_manager.vertical_align.install(state_iter1->vertical_align); + // export the diff between *state_iter1 and stack.back() + state_iter1->begin(out, stack.back()); + stack.push_back(&*state_iter1); + } + + // [state_iter1->start_idx, text_idx2) are covered by the current state + size_t text_idx2 = (state_iter2 == states.end()) ? text.size() : state_iter2->start_idx; + + // dump all text and offsets before next state + while(true) + { + if((cur_offset_iter != offsets.end()) + && (cur_offset_iter->start_idx <= cur_text_idx)) + { + if(cur_offset_iter->start_idx > text_idx2) + break; + // next is offset + double target = cur_offset_iter->width + dx; + double actual_offset = 0; + + //ignore near-zero offsets + if(std::abs(target) <= param.h_eps) + { + actual_offset = 0; + } + else + { + bool done = false; + // check if the offset is equivalent to a single ' ' + if(!(state_iter1->hash_umask & State::umask_by_id(State::WORD_SPACE_ID))) + { + double space_off = state_iter1->single_space_offset(); + if(std::abs(target - space_off) <= param.h_eps) + { + Unicode u = ' '; + writeUnicodes(out, &u, 1); + actual_offset = space_off; + done = true; + } + } + + // finally, just dump it + if(!done) + { + long long wid = all_manager.whitespace.install(target, &actual_offset); + + if(!equal(actual_offset, 0)) + { + if(is_positive(-actual_offset)) + last_text_pos_with_negative_offset = cur_text_idx; + + double threshold = state_iter1->em_size() * (param.space_threshold); + + out << "<span class=\"" << CSS::WHITESPACE_CN + << ' ' << CSS::WHITESPACE_CN << wid << "\">" << (target > 
(threshold - EPS) ? " " : "") << "</span>"; + } + } + } + dx = target - actual_offset; + ++ cur_offset_iter; + } + else + { + if(cur_text_idx >= text_idx2) + break; + // next is text + size_t next_text_idx = text_idx2; + if((cur_offset_iter != offsets.end()) && (cur_offset_iter->start_idx) < next_text_idx) + next_text_idx = cur_offset_iter->start_idx; + dump_chars(out, cur_text_idx, next_text_idx - cur_text_idx); + cur_text_idx = next_text_idx; + } + } + } + + // we have a nullptr in the bottom + while(stack.back()) + { + stack.back()->end(out); + stack.pop_back(); + } + + out << "</div>"; +} + +void HTMLTextLine::clear(void) +{ + states.clear(); + offsets.clear(); + text.clear(); +} + +void HTMLTextLine::clip(const HTMLClipState & clip_state) +{ + clip_x1 = clip_state.xmin; + clip_y1 = clip_state.ymin; +} + +void HTMLTextLine::prepare(void) +{ + // max_ascent determines the height of the div + double accum_vertical_align = 0; // accumulated + ascent = 0; + descent = 0; + // note that vertical_align cannot be calculated here + for(auto iter = states.begin(); iter != states.end(); ++iter) + { + auto font_info = iter->font_info; + iter->ids[State::FONT_ID] = font_info->id; + iter->ids[State::FONT_SIZE_ID] = all_manager.font_size.install(iter->font_size); + iter->ids[State::FILL_COLOR_ID] = all_manager.fill_color.install(iter->fill_color); + iter->ids[State::STROKE_COLOR_ID] = all_manager.stroke_color.install(iter->stroke_color); + iter->ids[State::LETTER_SPACE_ID] = all_manager.letter_space.install(iter->letter_space); + iter->ids[State::WORD_SPACE_ID] = all_manager.word_space.install(iter->word_space); + iter->hash(); + + accum_vertical_align += iter->vertical_align; + double cur_ascent = accum_vertical_align + font_info->ascent * iter->font_size; + if(cur_ascent > ascent) + ascent = cur_ascent; + double cur_descent = accum_vertical_align + font_info->descent * iter->font_size; + if(cur_descent < descent) + descent = cur_descent; + } +} + + +void 
HTMLTextLine::optimize(std::vector<HTMLTextLine*> & lines) +{ + if(param.optimize_text == 3) + { + optimize_aggressive(lines); + } + else + { + optimize_normal(lines); + } +} +/* + * Adjust letter space and word space in order to reduce the number of HTML elements + * May also unmask word space + */ +void HTMLTextLine::optimize_normal(std::vector<HTMLTextLine*> & lines) +{ + // remove useless states in the end + while((!states.empty()) && (states.back().start_idx >= text.size())) + states.pop_back(); + + assert(!states.empty()); + + const long long word_space_umask = State::umask_by_id(State::WORD_SPACE_ID); + + // for optimization, we need accurate values + auto & ls_manager = all_manager.letter_space; + auto & ws_manager = all_manager.word_space; + + // statistics of widths + std::map<double, size_t> width_map; + // store optimized offsets + std::vector<Offset> new_offsets; + new_offsets.reserve(offsets.size()); + + auto offset_iter1 = offsets.begin(); + for(auto state_iter2 = states.begin(), state_iter1 = state_iter2++; + state_iter1 != states.end(); + ++state_iter1, ++state_iter2) + { + const size_t text_idx1 = state_iter1->start_idx; + const size_t text_idx2 = (state_iter2 == states.end()) ? 
text.size() : state_iter2->start_idx; + size_t text_count = text_idx2 - text_idx1; + + // there might be some offsets before the first state + while((offset_iter1 != offsets.end()) + && (offset_iter1->start_idx <= text_idx1)) + { + new_offsets.push_back(*(offset_iter1++)); + } + + // find the last offset covered by the current state + auto offset_iter2 = offset_iter1; + for(; (offset_iter2 != offsets.end()) && (offset_iter2->start_idx <= text_idx2); ++offset_iter2) { } + + // There are `offset_count` <span>'s, the target is to reduce this number + size_t offset_count = offset_iter2 - offset_iter1; + assert(text_count >= offset_count); + + // Optimize letter space + // how much letter_space is changed + // will be later used for optimizing word space + double letter_space_diff = 0; + width_map.clear(); + + // In some PDF files all letter spaces are implemented as position shifts between each letter + // try to simplify it with a proper letter space + if(offset_count > 0) + { + // mark the current letter_space + if(text_count > offset_count) + width_map.insert(std::make_pair(0, text_count - offset_count)); + + for(auto off_iter = offset_iter1; off_iter != offset_iter2; ++off_iter) + { + const double target = off_iter->width; + auto iter = width_map.lower_bound(target-EPS); + if((iter != width_map.end()) && (std::abs(iter->first - target) <= EPS)) + { + ++ iter->second; + } + else + { + width_map.insert(iter, std::make_pair(target, 1)); + } + } + + // TODO snapping the widths may result a better result + // e.g. 
for (-0.7 0.6 -0.2 0.3 10 10), 0 is better than 10 + double most_used_width = 0; + size_t max_count = 0; + for(auto iter = width_map.begin(); iter != width_map.end(); ++iter) + { + if(iter->second > max_count) + { + most_used_width = iter->first; + max_count = iter->second; + } + } + + // negative letter space may cause problems + if((max_count <= text_count / 2) || (!is_positive(state_iter1->letter_space + most_used_width))) + { + // the old value is the best + // just copy old offsets + new_offsets.insert(new_offsets.end(), offset_iter1, offset_iter2); + } + else + { + // now we would like to adjust letter space to most_used width + + // install new letter space + const double old_ls = state_iter1->letter_space; + state_iter1->ids[State::LETTER_SPACE_ID] = ls_manager.install(old_ls + most_used_width, &(state_iter1->letter_space)); + letter_space_diff = old_ls - state_iter1->letter_space; + // update offsets + auto off_iter = offset_iter1; + // re-count number of offsets + offset_count = 0; + for(size_t cur_text_idx = text_idx1; cur_text_idx < text_idx2; ++cur_text_idx) + { + double cur_width = 0; + if((off_iter != offset_iter2) && (off_iter->start_idx == cur_text_idx + 1)) + { + cur_width = off_iter->width + letter_space_diff; + ++off_iter; + } + else + { + cur_width = letter_space_diff ; + } + if(!equal(cur_width, 0)) + { + new_offsets.emplace_back(cur_text_idx+1, cur_width); + ++ offset_count; + } + } + } + } + + // Optimize word space + + // In some PDF files all spaces are converted into positioning shift + // We may try to change (some of) them to ' ' by adjusting word_space + // for now, we consider only the no-space scenario + // which also includes the case when param.space_as_offset is set + + // get the text segment covered by current state (*state_iter1) + const auto text_iter1 = text.begin() + text_idx1; + const auto text_iter2 = text.begin() + text_idx2; + if(find(text_iter1, text_iter2, ' ') == text_iter2) + { + // if there is not any space, we may 
change the value of word_space arbitrarily + // note that we may only change word space, no offset will be affected + // The actual effect will emerge during flushing, where it could be detected that an offset can be optimized as a single space character + + if(offset_count > 0) + { + double threshold = (state_iter1->em_size()) * (param.space_threshold); + // set word_space for the most frequently used offset + double most_used_width = 0; + size_t max_count = 0; + + // if offset_count > 0, we must have updated width_map in the previous step + // find the most frequent width, with new letter space applied + for(auto iter = width_map.begin(); iter != width_map.end(); ++iter) + { + double fixed_width = iter->first + letter_space_diff; // this is the actual offset in HTML + // we don't want to add spaces for tiny gaps, or even negative shifts + if((fixed_width >= threshold - EPS) && (iter->second > max_count)) + { + max_count = iter->second; + most_used_width = fixed_width; + } + } + + state_iter1->word_space = 0; // clear word_space for single_space_offset + double new_word_space = most_used_width - state_iter1->single_space_offset(); + state_iter1->ids[State::WORD_SPACE_ID] = ws_manager.install(new_word_space, &(state_iter1->word_space)); // install new word_space + state_iter1->hash_umask &= (~word_space_umask); // mark that the word_space is not free + } + else // there is no offset at all + { + state_iter1->hash_umask |= word_space_umask; // we just free word_space + } + } + offset_iter1 = offset_iter2; + } + + // apply optimization + std::swap(offsets, new_offsets); + + lines.push_back(this); +} + +// for optimize-text == 3 +void HTMLTextLine::optimize_aggressive(std::vector<HTMLTextLine*> & lines) +{ + /* + HTMLLineState original_line_state = line_state; + // break the line if there are a large (positive or negative) shift + // letter space / word space are not taken into consideration (yet) + while(true) + { + } + + // aggressive optimization + if(target > 
state_iter1->em_size() * (param.space_threshold) - EPS) + out << ' '; + dx = 0; + lines.push_back(this); + */ +} + +// this state will be converted to a child node of the node of prev_state +// dump the difference between previous state +// also clone corresponding states +void HTMLTextLine::State::begin (ostream & out, const State * prev_state) +{ + if(prev_state) + { + long long cur_mask = 0xff; + bool first = true; + for(int i = 0; i < HASH_ID_COUNT; ++i, cur_mask<<=8) + { + if(hash_umask & cur_mask) // we don't care about this ID + { + if (prev_state->hash_umask & cur_mask) // if prev_state do not care about it either + continue; + + // otherwise + // we have to inherit it + ids[i] = prev_state->ids[i]; + hash_umask &= (~cur_mask); + //copy the corresponding value + //TODO: this is so ugly + switch(i) + { + case FONT_SIZE_ID: + font_size = prev_state->font_size; + break; + case LETTER_SPACE_ID: + letter_space = prev_state->letter_space; + break; + case WORD_SPACE_ID: + word_space = prev_state->word_space; + break; + default: + cerr << "unexpected state mask" << endl; + break; + } + } + + // now we care about the ID + + // if the value from prev_state is the same, we don't need to dump it + if((!(prev_state->hash_umask & cur_mask)) && (prev_state->ids[i] == ids[i])) + continue; + + // so we have to dump it + if(first) + { + out << "<span class=\""; + first = false; + } + else + { + out << ' '; + } + + // out should have hex set + out << css_class_names[i]; + if (ids[i] == -1) + out << CSS::INVALID_ID; + else + out << ids[i]; + } + // vertical align + if(!equal(vertical_align, 0)) + { + // so we have to dump it + if(first) + { + out << "<span class=\""; + first = false; + } + else + { + out << ' '; + } + + // out should have hex set + out << CSS::VERTICAL_ALIGN_CN; + auto id = ids[VERTICAL_ALIGN_ID]; + if (id == -1) + out << CSS::INVALID_ID; + else + out << id; + } + + if(first) // we actually just inherit the whole prev_state + { + need_close = false; + } + else 
+ { + out << "\">"; + need_close = true; + } + } + else + { + // prev_state == nullptr + // which means this is the first state of the line + // there should be a open pending <div> left there + // it is not necessary to output vertical align + long long cur_mask = 0xff; + for(int i = 0; i < HASH_ID_COUNT; ++i, cur_mask<<=8) + { + if(hash_umask & cur_mask) // we don't care about this ID + continue; + + // now we care about the ID + out << ' '; + // out should have hex set + out << css_class_names[i]; + if (ids[i] == -1) + out << CSS::INVALID_ID; + else + out << ids[i]; + } + + out << "\">"; + need_close = false; + } +} + +void HTMLTextLine::State::end(ostream & out) const +{ + if(need_close) + out << "</span>"; +} + +void HTMLTextLine::State::hash(void) +{ + hash_value = 0; + for(int i = 0; i < ID_COUNT; ++i) + { + hash_value = (hash_value << 8) | (ids[i] & 0xff); + } +} + +int HTMLTextLine::State::diff(const State & s) const +{ + /* + * A quick check based on hash_value + * it could be wrong when there are more then 256 classes, + * in which case the output may not be optimal, but still 'correct' in terms of HTML + */ + long long common_mask = ~(hash_umask | s.hash_umask); + if((hash_value & common_mask) == (s.hash_value & common_mask)) return 0; + + long long cur_mask = 0xff; + int d = 0; + for(int i = 0; i < ID_COUNT; ++i) + { + if((common_mask & cur_mask) && (ids[i] != s.ids[i])) + ++ d; + cur_mask <<= 8; + } + return d; +} + +long long HTMLTextLine::State::umask_by_id(int id) +{ + return (((long long)0xff) << (8*id)); +} + +// the order should be the same as in the enum +const char * const HTMLTextLine::State::css_class_names [] = { + CSS::FONT_FAMILY_CN, + CSS::FONT_SIZE_CN, + CSS::FILL_COLOR_CN, + CSS::STROKE_COLOR_CN, + CSS::LETTER_SPACE_CN, + CSS::WORD_SPACE_CN, + CSS::VERTICAL_ALIGN_CN, +}; + +} //namespace pdf2htmlEX diff --git a/src/HTMLTextLine.h b/src/HTMLTextLine.h new file mode 100644 index 0000000..fcce811 --- /dev/null +++ b/src/HTMLTextLine.h @@ 
-0,0 +1,134 @@ +/* + * Header file for HTMLTextLine + * Copyright (C) 2013 Lu Wang <coolwanglu@gmail.com> + */ +#ifndef HTMLTEXTLINE_H__ +#define HTMLTEXTLINE_H__ + +#include <ostream> +#include <vector> + +#include <CharTypes.h> + +#include "Param.h" +#include "StateManager.h" +#include "HTMLState.h" + +namespace pdf2htmlEX { + +/* + * Store and optimize a line of text in HTML + * + * contains a series of + * - Text + * - Shift + * - State change + */ +class HTMLTextLine +{ +public: + HTMLTextLine (const HTMLLineState & line_state, const Param & param, AllStateManager & all_manager); + + struct State : public HTMLTextState { + // before output + void begin(std::ostream & out, const State * prev_state); + // after output + void end(std::ostream & out) const; + // calculate the hash code + void hash(void); + // calculate the difference between another State + int diff(const State & s) const; + + enum { + FONT_ID, + FONT_SIZE_ID, + FILL_COLOR_ID, + STROKE_COLOR_ID, + LETTER_SPACE_ID, + WORD_SPACE_ID, + HASH_ID_COUNT, + + VERTICAL_ALIGN_ID = HASH_ID_COUNT, + ID_COUNT + }; + + static long long umask_by_id(int id); + + long long ids[ID_COUNT]; + + size_t start_idx; // index of the first Text using this state + // for optimization + long long hash_value; + long long hash_umask; // some states may not be actually used + bool need_close; + + static const char * const css_class_names []; // class names for each id + }; + + struct Offset { + Offset(size_t size_idx, double width) + :start_idx(size_idx),width(width) + { } + size_t start_idx; // should put this Offset right before text[start_idx]; + double width; + }; + + /** + * Append a drawn char (glyph)'s unicode. l > 1 mean this glyph correspond to + * multiple code points. + */ + void append_unicodes(const Unicode * u, int l, double width); + /** + * Append a special padding char with 0 width, in order to keep char index consistent. + * The padding char is ignored during output. 
+ */ + void append_padding_char() { text.push_back(0); } + void append_offset(double width); + void append_state(const HTMLTextState & text_state); + void dump_text(std::ostream & out); + + bool text_empty(void) const { return text.empty(); } + void clear(void); + + void clip(const HTMLClipState &); + + /* + * Optimize and calculate necessary values + */ + void prepare(void); + void optimize(std::vector<HTMLTextLine*> &); +private: + void optimize_normal(std::vector<HTMLTextLine*> &); + void optimize_aggressive(std::vector<HTMLTextLine*> &); + + /** + * Dump chars' unicode to output stream. + * begin/pos is the index in 'text'. + */ + void dump_chars(std::ostream & out, int begin, int len); + void dump_char(std::ostream & out, int pos); + + const Param & param; + AllStateManager & all_manager; + + HTMLLineState line_state; + double ascent, descent; + double clip_x1, clip_y1; + double width; + + std::vector<State> states; + std::vector<Offset> offsets; + + /** + * Drawn chars (glyph) in this line are stored in 'text'. For each element c in 'text': + * - If c > 0, it is the unicode code point corresponds to the glyph; + * - If c == 0, it is a padding char, and ignored during output (TODO some bad PDFs utilize 0?); + * - If c < -1, this glyph corresponds to more than one unicode code points, + * which are stored in 'decomposed_text', and (-c-1) is the index in 'decomposed_text'. 
+ */ + std::vector<int> text; + std::vector<std::vector<Unicode> > decomposed_text; +}; + +} // namespace pdf2htmlEX +#endif //HTMLTEXTLINE_H__ diff --git a/src/HTMLTextPage.cc b/src/HTMLTextPage.cc new file mode 100644 index 0000000..a8e2ab8 --- /dev/null +++ b/src/HTMLTextPage.cc @@ -0,0 +1,147 @@ +/* + * HTMLTextPage.cc + * + * Generate and optimized HTML for one Page + * + * Copyright (C) 2013 Lu Wang <coolwanglu@gmail.com> + */ + +#include "HTMLTextPage.h" +#include "util/css_const.h" + +namespace pdf2htmlEX { + +using std::ostream; + +HTMLTextPage::HTMLTextPage(const Param & param, AllStateManager & all_manager) + : param(param) + , all_manager(all_manager) + , cur_line(nullptr) + , page_width(0) + , page_height(0) +{ } + +HTMLTextPage::~HTMLTextPage() +{ + for(auto p : text_lines) + delete p; +} + +void HTMLTextPage::dump_text(ostream & out) +{ + if(param.optimize_text) + { + // text lines may be split during optimization, collect them + std::vector<HTMLTextLine*> new_text_lines; + for(auto p : text_lines) + p->optimize(new_text_lines); + std::swap(text_lines, new_text_lines); + } + for(auto p : text_lines) + p->prepare(); + if(param.optimize_text) + optimize(); + + HTMLClipState page_box; + page_box.xmin = page_box.ymin = 0; + page_box.xmax = page_width; + page_box.ymax = page_height; + + //push a dummy entry for convenience + clips.emplace_back(page_box, text_lines.size()); + + Clip cur_clip(page_box, 0); + bool has_clip = false; + + auto text_line_iter = text_lines.begin(); + for(auto clip_iter = clips.begin(); clip_iter != clips.end(); ++clip_iter) + { + auto next_text_line_iter = text_lines.begin() + clip_iter->start_idx; + if(text_line_iter != next_text_line_iter) + { + const auto & cs = cur_clip.clip_state; + if(has_clip) + { + out << "<div class=\"" << CSS::CLIP_CN + << " " << CSS::LEFT_CN << all_manager.left.install(cs.xmin) + << " " << CSS::BOTTOM_CN << all_manager.bottom.install(cs.ymin) + << " " << CSS::WIDTH_CN << 
all_manager.width.install(cs.xmax - cs.xmin) + << " " << CSS::HEIGHT_CN << all_manager.height.install(cs.ymax - cs.ymin) + << "\">"; + } + + while(text_line_iter != next_text_line_iter) + { + if(has_clip) + { + (*text_line_iter)->clip(cs); + } + (*text_line_iter)->dump_text(out); + ++text_line_iter; + } + if(has_clip) + { + out << "</div>"; + } + } + + { + cur_clip = *clip_iter; + const auto & cs = cur_clip.clip_state; + has_clip = !(equal(0, cs.xmin) && equal(0, cs.ymin) + && equal(page_width, cs.xmax) && equal(page_height, cs.ymax)); + } + } +} + +void HTMLTextPage::dump_css(ostream & out) +{ + //TODO +} + +void HTMLTextPage::clear(void) +{ + text_lines.clear(); + clips.clear(); + cur_line = nullptr; +} + +void HTMLTextPage::open_new_line(const HTMLLineState & line_state) +{ + // do not reused the last text_line even if it's empty + // because the clip states may point to the next index + text_lines.emplace_back(new HTMLTextLine(line_state, param, all_manager)); + cur_line = text_lines.back(); +} + +void HTMLTextPage::set_page_size(double width, double height) +{ + page_width = width; + page_height = height; +} + +void HTMLTextPage::clip(const HTMLClipState & clip_state) +{ + if(!clips.empty()) + { + auto & clip = clips.back(); + if(clip.start_idx == text_lines.size()) + { + /* + * Previous ClipBox is not used + */ + clip.clip_state = clip_state; + return; + } + } + clips.emplace_back(clip_state, text_lines.size()); +} + +void HTMLTextPage::optimize(void) +{ + //TODO + //group lines with same x-axis + //collect common states +} + +} // namespace pdf2htmlEX diff --git a/src/HTMLTextPage.h b/src/HTMLTextPage.h new file mode 100644 index 0000000..ccaa564 --- /dev/null +++ b/src/HTMLTextPage.h @@ -0,0 +1,66 @@ +/* + * Header file for HTMLTextPage + * Copyright (C) 2013 Lu Wang <coolwanglu@gmail.com> + */ + +#ifndef HTMLTEXTPAGE_H__ +#define HTMLTEXTPAGE_H__ + +#include <vector> +#include <ostream> + +#include "Param.h" +#include "StateManager.h" +#include 
"HTMLTextLine.h" +#include "HTMLState.h" + +namespace pdf2htmlEX { + +/* + * Store and optimize a page of text in HTML + * + * contains a series of HTMLTextLine + */ +class HTMLTextPage +{ +public: + HTMLTextPage (const Param & param, AllStateManager & all_manager); + ~HTMLTextPage(); + + HTMLTextLine * get_cur_line(void) const { return cur_line; } + + void dump_text(std::ostream & out); + void dump_css(std::ostream & out); + void clear(void); + + void open_new_line(const HTMLLineState & line_state); + + /* for clipping */ + void set_page_size(double width, double height); + void clip(const HTMLClipState & clip_state); + + double get_width() { return page_width; } + double get_height() { return page_height; } + +private: + void optimize(void); + + const Param & param; + AllStateManager & all_manager; + HTMLTextLine * cur_line; + double page_width, page_height; + + std::vector<HTMLTextLine*> text_lines; + + struct Clip { + HTMLClipState clip_state; + size_t start_idx; + Clip(const HTMLClipState & clip_state, size_t start_idx) + :clip_state(clip_state),start_idx(start_idx) + { } + }; + std::vector<Clip> clips; +}; + +} //namespace pdf2htmlEX +#endif //HTMLTEXTPAGE_H__ diff --git a/src/Param.h b/src/Param.h new file mode 100644 index 0000000..84fa426 --- /dev/null +++ b/src/Param.h @@ -0,0 +1,87 @@ +/* + * Parameters + * + * Wang Lu + * 2012.08.03 + */ + + +#ifndef PARAM_H__ +#define PARAM_H__ + +#include <string> + +namespace pdf2htmlEX { + +struct Param +{ + // pages + int first_page, last_page; + + // dimensions + double zoom; + double fit_width, fit_height; + int use_cropbox; + double h_dpi, v_dpi; + + // output + int embed_css; + int embed_font; + int embed_image; + int embed_javascript; + int embed_outline; + int split_pages; + std::string dest_dir; + std::string css_filename; + std::string page_filename; + std::string outline_filename; + int process_nontext; + int process_outline; + int process_annotation; + int process_form; + int correct_text_visibility; + 
int printing; + int fallback; + int tmp_file_size_limit; + + // fonts + int embed_external_font; + std::string font_format; + int decompose_ligature; + int auto_hint; + std::string external_hint_tool; + int stretch_narrow_glyph; + int squeeze_wide_glyph; + int override_fstype; + int process_type3; + + // text + double h_eps, v_eps; + double space_threshold; + double font_size_multiplier; + int space_as_offset; + int tounicode; + int optimize_text; + + // background image + std::string bg_format; + int svg_node_count_limit; + int svg_embed_bitmap; + + // encryption + std::string owner_password, user_password; + int no_drm; + + // misc. + int clean_tmp; + std::string data_dir; + std::string tmp_dir; + int debug; + int proof; + + std::string input_filename, output_filename; +}; + +} // namespace pdf2htmlEX + +#endif //PARAM_h__ diff --git a/src/Preprocessor.cc b/src/Preprocessor.cc new file mode 100644 index 0000000..a8859ad --- /dev/null +++ b/src/Preprocessor.cc @@ -0,0 +1,107 @@ +/* + * Preprocessor.cc + * + * Check used codes for each font + * + * by WangLu + * 2012.09.07 + */ + +#include <cstring> +#include <iostream> +#include <algorithm> + +#include <GfxState.h> +#include <GfxFont.h> + +#include "Preprocessor.h" +#include "util/misc.h" +#include "util/const.h" + +namespace pdf2htmlEX { + +using std::cerr; +using std::endl; +using std::flush; +using std::max; + +Preprocessor::Preprocessor(const Param & param) + : OutputDev() + , param(param) + , max_width(0) + , max_height(0) + , cur_font_id(0) + , cur_code_map(nullptr) +{ } + +Preprocessor::~Preprocessor(void) +{ + for(auto & p : code_maps) + delete [] p.second; +} + +void Preprocessor::process(PDFDoc * doc) +{ + int page_count = (param.last_page - param.first_page + 1); + for(int i = param.first_page; i <= param.last_page ; ++i) + { + cerr << "Preprocessing: " << (i-param.first_page) << "/" << page_count << '\r' << flush; + + doc->displayPage(this, i, DEFAULT_DPI, DEFAULT_DPI, + 0, + (!(param.use_cropbox)), + 
true, // crop + false, // printing + nullptr, nullptr, nullptr, nullptr); + } + if(page_count >= 0) + cerr << "Preprocessing: " << page_count << "/" << page_count; + cerr << endl; +} + +void Preprocessor::drawChar(GfxState *state, double x, double y, + double dx, double dy, + double originX, double originY, + CharCode code, int nBytes, Unicode *u, int uLen) +{ + GfxFont * font = state->getFont(); + if(!font) return; + + long long fn_id = hash_ref(font->getID()); + + if(fn_id != cur_font_id) + { + cur_font_id = fn_id; + auto p = code_maps.insert(std::make_pair(cur_font_id, (char*)nullptr)); + if(p.second) + { + // this is a new font + int len = font->isCIDFont() ? 0x10000 : 0x100; + p.first->second = new char [len]; + memset(p.first->second, 0, len * sizeof(char)); + } + + cur_code_map = p.first->second; + } + + cur_code_map[code] = 1; +} + +void Preprocessor::startPage(int pageNum, GfxState *state) +{ + startPage(pageNum, state, nullptr); +} + +void Preprocessor::startPage(int pageNum, GfxState *state, XRef * xref) +{ + max_width = max<double>(max_width, state->getPageWidth()); + max_height = max<double>(max_height, state->getPageHeight()); +} + +const char * Preprocessor::get_code_map (long long font_id) const +{ + auto iter = code_maps.find(font_id); + return (iter == code_maps.end()) ? 
nullptr : (iter->second); +} + +} // namespace pdf2htmlEX diff --git a/src/Preprocessor.h b/src/Preprocessor.h new file mode 100644 index 0000000..5b48e4f --- /dev/null +++ b/src/Preprocessor.h @@ -0,0 +1,66 @@ +/* + * Preprocessor.h + * + * PDF is so complicated that we have to scan twice + * + * Check used codes for each font + * Collect all used link destinations + * + * by WangLu + * 2012.09.07 + */ + + +#ifndef PREPROCESSOR_H__ +#define PREPROCESSOR_H__ + +#include <unordered_map> + +#include <OutputDev.h> +#include <PDFDoc.h> +#include <Annot.h> +#include "Param.h" + +namespace pdf2htmlEX { + +class Preprocessor : public OutputDev { +public: + Preprocessor(const Param & param); + virtual ~Preprocessor(void); + + void process(PDFDoc * doc); + + virtual GBool upsideDown() { return gFalse; } + virtual GBool useDrawChar() { return gTrue; } + virtual GBool interpretType3Chars() { return gFalse; } + virtual GBool needNonText() { return gFalse; } + virtual GBool needClipToCropBox() { return gTrue; } + + virtual void drawChar(GfxState *state, double x, double y, + double dx, double dy, + double originX, double originY, + CharCode code, int nBytes, Unicode *u, int uLen); + + // Start a page. 
+ // UGLY: These 2 versions are for different versions of poppler + virtual void startPage(int pageNum, GfxState *state); + virtual void startPage(int pageNum, GfxState *state, XRef * xref); + + const char * get_code_map (long long font_id) const; + double get_max_width (void) const { return max_width; } + double get_max_height (void) const { return max_height; } + +protected: + const Param & param; + + double max_width, max_height; + + long long cur_font_id; + char * cur_code_map; + + std::unordered_map<long long, char*> code_maps; +}; + +} // namespace pdf2htmlEX + +#endif //PREPROCESSOR_H__ diff --git a/src/StateManager.h b/src/StateManager.h new file mode 100644 index 0000000..0a19df0 --- /dev/null +++ b/src/StateManager.h @@ -0,0 +1,430 @@ +/* + * StateManager.h + * + * manage reusable CSS classes + * + * Copyright (C) 2013 Lu Wang <coolwanglu@gmail.com> + */ + +#ifndef STATEMANAGER_H__ +#define STATEMANAGER_H__ + +#include <iostream> +#include <map> +#include <unordered_map> + +#include "Color.h" + +#include "util/math.h" +#include "util/css_const.h" + +namespace pdf2htmlEX { + +template<class ValueType, class Imp> class StateManager {}; + +template<class Imp> +class StateManager<double, Imp> +{ +public: + StateManager() + : eps(0) + , imp(static_cast<Imp*>(this)) + { } + + // values no farther than eps are treated as equal + void set_eps (double eps) { + this->eps = eps; + } + + double get_eps (void) const { + return eps; + } + + // install new_value into the map + // return the corresponding id + long long install(double new_value, double * actual_value_ptr = nullptr) { + auto iter = value_map.lower_bound(new_value - eps); + if((iter != value_map.end()) && (std::abs(iter->first - new_value) <= eps)) + { + if(actual_value_ptr != nullptr) + *actual_value_ptr = iter->first; + return iter->second; + } + + long long id = value_map.size(); + double v = value_map.insert(iter, std::make_pair(new_value, id))->first; + if(actual_value_ptr != nullptr) + 
*actual_value_ptr = v; + return id; + } + + void dump_css(std::ostream & out) { + for(auto & p : value_map) + { + out << "." << imp->get_css_class_name() << p.second << "{"; + imp->dump_value(out, p.first); + out << "}" << std::endl; + } + } + + void dump_print_css(std::ostream & out, double scale) { + for(auto & p : value_map) + { + out << "." << imp->get_css_class_name() << p.second << "{"; + imp->dump_print_value(out, p.first, scale); + out << "}" << std::endl; + } + } + +protected: + double eps; + Imp * imp; + std::map<double, long long> value_map; +}; + +// Be careful about the mixed usage of Matrix and const double * +// the input is usually double *, which might be changed, so we have to copy the content out +// in the map we use Matrix instead of double * such that the array may be automatically release when deconstructing +template <class Imp> +class StateManager<Matrix, Imp> +{ +public: + StateManager() + : imp(static_cast<Imp*>(this)) + { } + + // return id + long long install(const double * new_value) { + Matrix m; + memcpy(m.m, new_value, sizeof(m.m)); + auto iter = value_map.lower_bound(m); + if((iter != value_map.end()) && (tm_equal(m.m, iter->first.m, 4))) + { + return iter->second; + } + + long long id = value_map.size(); + value_map.insert(iter, std::make_pair(m, id)); + return id; + } + + void dump_css(std::ostream & out) { + for(auto & p : value_map) + { + out << "." 
<< imp->get_css_class_name() << p.second << "{"; + imp->dump_value(out, p.first); + out << "}" << std::endl; + } + } + + void dump_print_css(std::ostream & out, double scale) {} + +protected: + Imp * imp; + + struct Matrix_less + { + bool operator () (const Matrix & m1, const Matrix & m2) const + { + // Note that we only care about the first 4 elements + for(int i = 0; i < 4; ++i) + { + if(m1.m[i] < m2.m[i]) + return true; + if(m1.m[i] > m2.m[i]) + return false; + } + return false; + } + }; + + std::map<Matrix, long long, Matrix_less> value_map; +}; + +template <class Imp> +class StateManager<Color, Imp> +{ +public: + StateManager() + : imp(static_cast<Imp*>(this)) + { } + + long long install(const Color & new_value) { + auto iter = value_map.find(new_value); + if(iter != value_map.end()) + { + return iter->second; + } + + long long id = value_map.size(); + value_map.insert(std::make_pair(new_value, id)); + return id; + } + + void dump_css(std::ostream & out) { + out << "." << imp->get_css_class_name() << CSS::INVALID_ID << "{"; + imp->dump_transparent(out); + out << "}" << std::endl; + + for(auto & p : value_map) + { + out << "." 
<< imp->get_css_class_name() << p.second << "{"; + imp->dump_value(out, p.first); + out << "}" << std::endl; + } + } + + void dump_print_css(std::ostream & out, double scale) {} + +protected: + Imp * imp; + + struct Color_hash + { + size_t operator () (const Color & color) const + { + if(color.transparent) + { + return (~((size_t)0)); + } + else + { + return ( ((((size_t)colToByte(color.rgb.r)) & 0xff) << 16) + | ((((size_t)colToByte(color.rgb.g)) & 0xff) << 8) + | (((size_t)colToByte(color.rgb.b)) & 0xff) + ); + } + } + }; + + std::unordered_map<Color, long long, Color_hash> value_map; +}; + +///////////////////////////////////// +// Specific state managers + +class FontSizeManager : public StateManager<double, FontSizeManager> +{ +public: + static const char * get_css_class_name (void) { return CSS::FONT_SIZE_CN; } + double default_value(void) { return 0; } + void dump_value(std::ostream & out, double value) { out << "font-size:" << round(value) << "px;"; } + void dump_print_value(std::ostream & out, double value, double scale) { out << "font-size:" << round(value*scale) << "pt;"; } +}; + +class LetterSpaceManager : public StateManager<double, LetterSpaceManager> +{ +public: + static const char * get_css_class_name (void) { return CSS::LETTER_SPACE_CN; } + double default_value(void) { return 0; } + void dump_value(std::ostream & out, double value) { out << "letter-spacing:" << round(value) << "px;"; } + void dump_print_value(std::ostream & out, double value, double scale) { out << "letter-spacing:" << round(value*scale) << "pt;"; } +}; + +class WordSpaceManager : public StateManager<double, WordSpaceManager> +{ +public: + static const char * get_css_class_name (void) { return CSS::WORD_SPACE_CN;} + double default_value(void) { return 0; } + void dump_value(std::ostream & out, double value) { out << "word-spacing:" << round(value) << "px;"; } + void dump_print_value(std::ostream & out, double value, double scale) { out << "word-spacing:" << round(value*scale) << 
"pt;"; } +}; + +class VerticalAlignManager : public StateManager<double, VerticalAlignManager> +{ +public: + static const char * get_css_class_name (void) { return CSS::VERTICAL_ALIGN_CN; } + double default_value(void) { return 0; } + void dump_value(std::ostream & out, double value) { out << "vertical-align:" << round(value) << "px;"; } + void dump_print_value(std::ostream & out, double value, double scale) { out << "vertical-align:" << round(value*scale) << "pt;"; } +}; + +class WhitespaceManager : public StateManager<double, WhitespaceManager> +{ +public: + static const char * get_css_class_name (void) { return CSS::WHITESPACE_CN; } + double default_value(void) { return 0; } + void dump_value(std::ostream & out, double value) { + out << ((value > 0) ? "width:" + : "margin-left:") + << round(value) << "px;"; + } + void dump_print_value(std::ostream & out, double value, double scale) + { + value *= scale; + out << ((value > 0) ? "width:" + : "margin-left:") + << round(value) << "pt;"; + } +}; + +class WidthManager : public StateManager<double, WidthManager> +{ +public: + static const char * get_css_class_name (void) { return CSS::WIDTH_CN; } + double default_value(void) { return 0; } + void dump_value(std::ostream & out, double value) { out << "width:" << round(value) << "px;"; } + void dump_print_value(std::ostream & out, double value, double scale) { out << "width:" << round(value*scale) << "pt;"; } +}; + +class BottomManager : public StateManager<double, BottomManager> +{ +public: + static const char * get_css_class_name (void) { return CSS::BOTTOM_CN; } + double default_value(void) { return 0; } + void dump_value(std::ostream & out, double value) { out << "bottom:" << round(value) << "px;"; } + void dump_print_value(std::ostream & out, double value, double scale) { out << "bottom:" << round(value*scale) << "pt;"; } +}; + +class HeightManager : public StateManager<double, HeightManager> +{ +public: + static const char * get_css_class_name (void) { return 
CSS::HEIGHT_CN; } + double default_value(void) { return 0; } + void dump_value(std::ostream & out, double value) { out << "height:" << round(value) << "px;"; } + void dump_print_value(std::ostream & out, double value, double scale) { out << "height:" << round(value*scale) << "pt;"; } +}; + +class LeftManager : public StateManager<double, LeftManager> +{ +public: + static const char * get_css_class_name (void) { return CSS::LEFT_CN; } + double default_value(void) { return 0; } + void dump_value(std::ostream & out, double value) { out << "left:" << round(value) << "px;"; } + void dump_print_value(std::ostream & out, double value, double scale) { out << "left:" << round(value*scale) << "pt;"; } +}; + +class TransformMatrixManager : public StateManager<Matrix, TransformMatrixManager> +{ +public: + static const char * get_css_class_name (void) { return CSS::TRANSFORM_MATRIX_CN; } + const double * default_value(void) { return ID_MATRIX; } + void dump_value(std::ostream & out, const Matrix & matrix) { + // always ignore tm[4] and tm[5] because + // we have already shifted the origin + // TODO: recognize common matrices + const auto & m = matrix.m; + auto prefixes = {"", "-ms-", "-webkit-"}; + if(tm_equal(m, ID_MATRIX, 4)) + { + for(auto & s : prefixes) + out << s << "transform:none;"; + } + else + { + for(auto & s : prefixes) + { + // PDF use a different coordinate system from Web + out << s << "transform:matrix(" + << round(m[0]) << ',' + << round(-m[1]) << ',' + << round(-m[2]) << ',' + << round(m[3]) << ','; + out << "0,0);"; + } + } + } +}; + +class FillColorManager : public StateManager<Color, FillColorManager> +{ +public: + static const char * get_css_class_name (void) { return CSS::FILL_COLOR_CN; } + /* override base's method, as we need some workaround in CSS */ + void dump_css(std::ostream & out) { + for(auto & p : value_map) + { + out << "." 
<< get_css_class_name() << p.second + << "{color:" << p.first << ";}" << std::endl; + } + } +}; + +class StrokeColorManager : public StateManager<Color, StrokeColorManager> +{ +public: + static const char * get_css_class_name (void) { return CSS::STROKE_COLOR_CN; } + /* override base's method, as we need some workaround in CSS */ + void dump_css(std::ostream & out) { + // normal CSS + out << "." << get_css_class_name() << CSS::INVALID_ID << "{text-shadow:none;}" << std::endl; + for(auto & p : value_map) + { + // TODO: take the stroke width from the graphics state, + // currently using 0.015em as a good default + out << "." << get_css_class_name() << p.second << "{text-shadow:" + << "-0.015em 0 " << p.first << "," + << "0 0.015em " << p.first << "," + << "0.015em 0 " << p.first << "," + << "0 -0.015em " << p.first << ";" + << "}" << std::endl; + } + // webkit + out << CSS::WEBKIT_ONLY << "{" << std::endl; + out << "." << get_css_class_name() << CSS::INVALID_ID << "{-webkit-text-stroke:0px transparent;}" << std::endl; + for(auto & p : value_map) + { + out << "." << get_css_class_name() << p.second + << "{-webkit-text-stroke:0.015em " << p.first << ";text-shadow:none;}" << std::endl; + } + out << "}" << std::endl; + } +}; + +///////////////////////////////////// +/* + * Manage the background image sizes + * + * We don't merge similar values, since they are bound with PAGE_CONTENT_BOX_number + */ +class BGImageSizeManager +{ +public: + void install(int page_no, double width, double height){ + value_map.insert(std::make_pair(page_no, std::make_pair(width, height))); + } + + void dump_css(std::ostream & out) { + for(auto & p : value_map) + { + const auto & s = p.second; + out << "." 
<< CSS::PAGE_CONTENT_BOX_CN << p.first << "{"; + out << "background-size:" << round(s.first) << "px " << round(s.second) << "px;"; + out << "}" << std::endl; + } + } + + void dump_print_css(std::ostream & out, double scale) { + for(auto & p : value_map) + { + const auto & s = p.second; + out << "." << CSS::PAGE_CONTENT_BOX_CN << p.first << "{"; + out << "background-size:" << round(s.first * scale) << "pt " << round(s.second * scale) << "pt;"; + out << "}" << std::endl; + } + } + +private: + std::unordered_map<int, std::pair<double,double>> value_map; +}; + +struct AllStateManager +{ + TransformMatrixManager transform_matrix; + VerticalAlignManager vertical_align; + StrokeColorManager stroke_color; + LetterSpaceManager letter_space; + WhitespaceManager whitespace; + WordSpaceManager word_space; + FillColorManager fill_color; + FontSizeManager font_size; + BottomManager bottom; + HeightManager height; + WidthManager width; + LeftManager left; + BGImageSizeManager bgimage_size; +}; + +} // namespace pdf2htmlEX + +#endif //STATEMANAGER_H__ diff --git a/src/StringFormatter.cc b/src/StringFormatter.cc new file mode 100644 index 0000000..b361c2d --- /dev/null +++ b/src/StringFormatter.cc @@ -0,0 +1,30 @@ +#include <cstdarg> +#include <algorithm> +#include <cassert> + +#include "StringFormatter.h" + +namespace pdf2htmlEX { + +StringFormatter::GuardedPointer StringFormatter::operator () (const char * format, ...) 
+{ + assert((buf_cnt == 0) && "StringFormatter: buffer is reused!"); + + va_list vlist; + va_start(vlist, format); + int l = vsnprintf(&buf.front(), buf.capacity(), format, vlist); + va_end(vlist); + if(l >= (int)buf.capacity()) + { + buf.reserve(std::max<long>((long)(l+1), (long)buf.capacity() * 2)); + va_start(vlist, format); + l = vsnprintf(&buf.front(), buf.capacity(), format, vlist); + va_end(vlist); + } + assert(l >= 0); // we should fail when vsnprintf fail + assert(l < (int)buf.capacity()); + return GuardedPointer(this); +} + +} //namespace pdf2htmlEX + diff --git a/src/StringFormatter.h b/src/StringFormatter.h new file mode 100644 index 0000000..dd3f3c1 --- /dev/null +++ b/src/StringFormatter.h @@ -0,0 +1,43 @@ +/* + * Buffer reusing string formatter + * + * by WangLu + * 2012.11.29 + */ + +#ifndef STRINGFORMATTER_H__ +#define STRINGFORMATTER_H__ + +#include <vector> +#include <cstdio> + +namespace pdf2htmlEX { + +class StringFormatter +{ +public: + struct GuardedPointer + { + GuardedPointer(StringFormatter * sf) : sf(sf) { ++(sf->buf_cnt); } + GuardedPointer(const GuardedPointer & gp) : sf(gp.sf) { ++(sf->buf_cnt); } + ~GuardedPointer(void) { --(sf->buf_cnt); } + operator char* () const { return &(sf->buf.front()); } + private: + StringFormatter * sf; + }; + + StringFormatter() : buf_cnt(0) { buf.reserve(L_tmpnam); } + /* + * Important: + * there is only one buffer, so new strings will replace old ones + */ + GuardedPointer operator () (const char * format, ...); + +private: + friend class GuardedPointer; + std::vector<char> buf; + int buf_cnt; +}; + +} //namespace pdf2htmlEX +#endif //STRINGFORMATTER_H__ diff --git a/src/TmpFiles.cc b/src/TmpFiles.cc new file mode 100644 index 0000000..1184548 --- /dev/null +++ b/src/TmpFiles.cc @@ -0,0 +1,77 @@ +/* + * TmpFiles.cc + * + * Collect and clean-up temporary files + * + * implemented by WangLu + * split off by Filodej <philodej@gmail.com> + */ + +#include <iostream> +#include <cstdio> +#include <sys/stat.h> 
+#include <unistd.h> + +#include "TmpFiles.h" +#include "Param.h" + +#ifdef __MINGW32__ +#include "util/mingw.h" +#endif + +using namespace std; + +namespace pdf2htmlEX { + +TmpFiles::TmpFiles( const Param& param ) + : param( param ) +{ } + +TmpFiles::~TmpFiles() +{ + clean(); +} + +void TmpFiles::add( const string & fn) +{ + if(!param.clean_tmp) + return; + + if(tmp_files.insert(fn).second && param.debug) + cerr << "Add new temporary file: " << fn << endl; +} + +// Return the total size of the temporary files in bytes +double TmpFiles::get_total_size() const +{ + double total_size = 0; + struct stat st; + for(auto & fn : tmp_files) + { + stat(fn.c_str(), &st); + total_size += st.st_size; + } + + return total_size; +} + + +void TmpFiles::clean() +{ + if(!param.clean_tmp) + return; + + for(auto & fn : tmp_files) + { + remove(fn.c_str()); + if(param.debug) + cerr << "Remove temporary file: " << fn << endl; + } + + rmdir(param.tmp_dir.c_str()); + if(param.debug) + cerr << "Remove temporary directory: " << param.tmp_dir << endl; +} + +} // namespace pdf2htmlEX + diff --git a/src/TmpFiles.h b/src/TmpFiles.h new file mode 100644 index 0000000..277281d --- /dev/null +++ b/src/TmpFiles.h @@ -0,0 +1,28 @@ +#ifndef TMPFILES_H__ +#define TMPFILES_H__ + +#include <string> +#include <set> +#include "Param.h" + +namespace pdf2htmlEX { + +class TmpFiles +{ +public: + explicit TmpFiles( const Param& param ); + ~TmpFiles(); + + void add( const std::string& fn); + double get_total_size() const; + +private: + void clean(); + + const Param& param; + std::set<std::string> tmp_files; +}; + +} // namespace pdf2htmlEX + +#endif //TMPFILES_H__ diff --git a/src/css_class_names.cmakelists.txt b/src/css_class_names.cmakelists.txt new file mode 100644 index 0000000..067d95a --- /dev/null +++ b/src/css_class_names.cmakelists.txt @@ -0,0 +1,39 @@ +# vim: filetype=cmake : +# CSS class names + +# Note +# don't use: (otherwise conflicted with others when there is an ID suffix) +# p f s + 
+set(CSS_INVALID_ID "_") + +set(CSS_LINE_CN "t") # Text +set(CSS_TRANSFORM_MATRIX_CN "m") # Matrix +set(CSS_CLIP_CN "c") # Clip + +set(CSS_PAGE_FRAME_CN "pf") # Page Frame +set(CSS_PAGE_CONTENT_BOX_CN "pc") # Page Content +set(CSS_PAGE_DATA_CN "pi") # Page Info + +set(CSS_BACKGROUND_IMAGE_CN "bi") # Background Image +set(CSS_FULL_BACKGROUND_IMAGE_CN "bf") # Background image (Full) + +set(CSS_FONT_FAMILY_CN "ff") # Font Family +set(CSS_FONT_SIZE_CN "fs") # Font Size + +set(CSS_FILL_COLOR_CN "fc") # Fill Color +set(CSS_STROKE_COLOR_CN "sc") # Stroke Color + +set(CSS_LETTER_SPACE_CN "ls") # Letter Space +set(CSS_WORD_SPACE_CN "ws") # Word Space +set(CSS_VERTICAL_ALIGN_CN "v") # Vertical align +set(CSS_WHITESPACE_CN "_") # whitespace +set(CSS_LEFT_CN "x") # X +set(CSS_HEIGHT_CN "h") # Height +set(CSS_WIDTH_CN "w") # Width +set(CSS_BOTTTOM_CN "y") # Y +set(CSS_CSS_DRAW_CN "d") # Draw +set(CSS_LINK_CN "l") # Link +set(CSS_INPUT_TEXT_CN "it") # Text input +set(CSS_INPUT_RADIO_CN "ir") # Radio button +set(CSS_RADIO_CHECKED_CN "checked") # Show picture of checked out radio button diff --git a/src/pdf2htmlEX-config.h.in b/src/pdf2htmlEX-config.h.in new file mode 100644 index 0000000..7c9b510 --- /dev/null +++ b/src/pdf2htmlEX-config.h.in @@ -0,0 +1,24 @@ +/* + * config.h + * Compile time constants + * + * Copyright (C) 2012-2014 Lu Wang <coolwanglu@gmail.com> + */ + + +#ifndef PDF2HTMLEX_CONFIG_H__ +#define PDF2HTMLEX_CONFIG_H__ + +#include <string> + +#define ENABLE_SVG @ENABLE_SVG@ + +namespace pdf2htmlEX { + +static const std::string PDF2HTMLEX_VERSION = "@PDF2HTMLEX_VERSION@"; +static const std::string PDF2HTMLEX_PREFIX = "@CMAKE_INSTALL_PREFIX@"; +static const std::string PDF2HTMLEX_DATA_PATH = "@CMAKE_INSTALL_PREFIX@""/share/pdf2htmlEX"; + +} // namespace pdf2htmlEX + +#endif //PDF2HTMLEX_CONFIG_H__ diff --git a/src/pdf2htmlEX.cc b/src/pdf2htmlEX.cc new file mode 100644 index 0000000..b56e8e9 --- /dev/null +++ b/src/pdf2htmlEX.cc @@ -0,0 +1,445 @@ +// pdf2htmlEX.cc +// 
+// Copyright (C) 2012-2015 Lu Wang <coolwanglu@gmail.com> + +#include <cstdio> +#include <cstdlib> +#include <cstddef> +#include <cstring> +#include <ctime> +#include <string> +#include <limits> +#include <iostream> +#include <memory> +#include <errno.h> + +#include <getopt.h> + +#include <poppler-config.h> +#include <goo/GooString.h> + +#include <Object.h> +#include <PDFDoc.h> +#include <PDFDocFactory.h> +#include <GlobalParams.h> + +#include "pdf2htmlEX-config.h" + +#if ENABLE_SVG +#include <cairo.h> +#endif + +#include "ArgParser.h" +#include "Param.h" +#include "HTMLRenderer/HTMLRenderer.h" + +#include "util/path.h" +#include "util/ffw.h" + +#ifdef __MINGW32__ +#include "util/mingw.h" +#endif + +using namespace std; +using namespace pdf2htmlEX; + +Param param; +ArgParser argparser; + +void show_usage_and_exit(const char * dummy = nullptr) +{ + cerr << "Usage: pdf2htmlEX [options] <input.pdf> [<output.html>]" << endl; + argparser.show_usage(cerr); + exit(EXIT_FAILURE); +} + +void show_version_and_exit(const char * dummy = nullptr) +{ + cerr << "pdf2htmlEX version " << PDF2HTMLEX_VERSION << endl; + cerr << "Copyright 2012-2015 Lu Wang <coolwanglu@gmail.com> and other contributors" << endl; + cerr << "Libraries: " << endl; + cerr << " poppler " << POPPLER_VERSION << endl; + cerr << " libfontforge " << ffw_get_version() << endl; +#if ENABLE_SVG + cerr << " cairo " << cairo_version_string() << endl; +#endif + cerr << "Default data-dir: " << param.data_dir << endl; + cerr << "Supported image format:"; +#ifdef ENABLE_LIBPNG + cerr << " png"; +#endif +#ifdef ENABLE_LIBJPEG + cerr << " jpg"; +#endif +#if ENABLE_SVG + cerr << " svg"; +#endif + cerr << endl; + + cerr << endl; + exit(EXIT_SUCCESS); +} + +void embed_parser (const char * str) +{ + while(true) + { + switch(*str) + { + case '\0': return; break; + case 'c': param.embed_css = 0; break; + case 'C': param.embed_css = 1; break; + case 'f': param.embed_font = 0; break; + case 'F': param.embed_font = 1; break; + 
case 'i': param.embed_image = 0; break; + case 'I': param.embed_image = 1; break; + case 'j': param.embed_javascript = 0; break; + case 'J': param.embed_javascript = 1; break; + case 'o': param.embed_outline = 0; break; + case 'O': param.embed_outline = 1; break; + default: + cerr << "Unknown character `" << (*str) << "` for --embed" << endl; + break; + } + ++ str; + } +} + +void prepare_directories() +{ + std::string tmp_dir = param.tmp_dir + "/pdf2htmlEX-XXXXXX"; + + errno = 0; + + unique_ptr<char> pBuf(new char[tmp_dir.size() + 1]); + strcpy(pBuf.get(), tmp_dir.c_str()); + auto p = mkdtemp(pBuf.get()); + if(p == nullptr) + { + const char * errmsg = strerror(errno); + if(!errmsg) + { + errmsg = "unknown error"; + } + cerr << "Cannot create temp directory: " << errmsg << endl; + exit(EXIT_FAILURE); + } + param.tmp_dir = pBuf.get(); +} + +void parse_options (int argc, char **argv) +{ + argparser + // pages + .add("first-page,f", ¶m.first_page, 1, "first page to convert") + .add("last-page,l", ¶m.last_page, numeric_limits<int>::max(), "last page to convert") + + // dimensions + .add("zoom", ¶m.zoom, 0, "zoom ratio", true) + .add("fit-width", ¶m.fit_width, 0, "fit width to <fp> pixels", true) + .add("fit-height", ¶m.fit_height, 0, "fit height to <fp> pixels", true) + .add("use-cropbox", ¶m.use_cropbox, 1, "use CropBox instead of MediaBox") + .add("hdpi", ¶m.h_dpi, 144.0, "horizontal resolution for graphics in DPI") + .add("vdpi", ¶m.v_dpi, 144.0, "vertical resolution for graphics in DPI") + + // output files + .add("embed", "specify which elements should be embedded into output", embed_parser, true) + .add("embed-css", ¶m.embed_css, 1, "embed CSS files into output") + .add("embed-font", ¶m.embed_font, 1, "embed font files into output") + .add("embed-image", ¶m.embed_image, 1, "embed image files into output") + .add("embed-javascript", ¶m.embed_javascript, 1, "embed JavaScript files into output") + .add("embed-outline", ¶m.embed_outline, 1, "embed outlines into 
output") + .add("split-pages", ¶m.split_pages, 0, "split pages into separate files") + .add("dest-dir", ¶m.dest_dir, ".", "specify destination directory") + .add("css-filename", ¶m.css_filename, "", "filename of the generated css file") + .add("page-filename", ¶m.page_filename, "", "filename template for split pages ") + .add("outline-filename", ¶m.outline_filename, "", "filename of the generated outline file") + .add("process-nontext", ¶m.process_nontext, 1, "render graphics in addition to text") + .add("process-outline", ¶m.process_outline, 1, "show outline in HTML") + .add("process-annotation", ¶m.process_annotation, 0, "show annotation in HTML") + .add("process-form", ¶m.process_form, 0, "include text fields and radio buttons") + .add("printing", ¶m.printing, 1, "enable printing support") + .add("fallback", ¶m.fallback, 0, "output in fallback mode") + .add("tmp-file-size-limit", ¶m.tmp_file_size_limit, -1, "Maximum size (in KB) used by temporary files, -1 for no limit.") + + // fonts + .add("embed-external-font", ¶m.embed_external_font, 1, "embed local match for external fonts") + .add("font-format", ¶m.font_format, "woff", "suffix for embedded font files (ttf,otf,woff,svg)") + .add("decompose-ligature", ¶m.decompose_ligature, 0, "decompose ligatures, such as \uFB01 -> fi") + .add("auto-hint", ¶m.auto_hint, 0, "use fontforge autohint on fonts without hints") + .add("external-hint-tool", ¶m.external_hint_tool, "", "external tool for hinting fonts (overrides --auto-hint)") + .add("stretch-narrow-glyph", ¶m.stretch_narrow_glyph, 0, "stretch narrow glyphs instead of padding them") + .add("squeeze-wide-glyph", ¶m.squeeze_wide_glyph, 1, "shrink wide glyphs instead of truncating them") + .add("override-fstype", ¶m.override_fstype, 0, "clear the fstype bits in TTF/OTF fonts") + .add("process-type3", ¶m.process_type3, 0, "convert Type 3 fonts for web (experimental)") + + // text + .add("heps", ¶m.h_eps, 1.0, "horizontal threshold for merging text, in pixels") + 
.add("veps", ¶m.v_eps, 1.0, "vertical threshold for merging text, in pixels") + .add("space-threshold", ¶m.space_threshold, (1.0/8), "word break threshold (threshold * em)") + .add("font-size-multiplier", ¶m.font_size_multiplier, 4.0, "a value greater than 1 increases the rendering accuracy") + .add("space-as-offset", ¶m.space_as_offset, 0, "treat space characters as offsets") + .add("tounicode", ¶m.tounicode, 0, "how to handle ToUnicode CMaps (0=auto, 1=force, -1=ignore)") + .add("optimize-text", ¶m.optimize_text, 0, "try to reduce the number of HTML elements used for text") + .add("correct-text-visibility", ¶m.correct_text_visibility, 0, "try to detect texts covered by other graphics and properly arrange them") + + // background image + .add("bg-format", ¶m.bg_format, "png", "specify background image format") + .add("svg-node-count-limit", ¶m.svg_node_count_limit, -1, "if node count in a svg background image exceeds this limit," + " fall back this page to bitmap background; negative value means no limit.") + .add("svg-embed-bitmap", ¶m.svg_embed_bitmap, 1, "1: embed bitmaps in svg background; 0: dump bitmaps to external files if possible.") + + // encryption + .add("owner-password,o", ¶m.owner_password, "", "owner password (for encrypted files)", true) + .add("user-password,u", ¶m.user_password, "", "user password (for encrypted files)", true) + .add("no-drm", ¶m.no_drm, 0, "override document DRM settings") + + // misc. 
+ .add("clean-tmp", ¶m.clean_tmp, 1, "remove temporary files after conversion") + .add("tmp-dir", ¶m.tmp_dir, param.tmp_dir, "specify the location of temporary directory.") + .add("data-dir", ¶m.data_dir, param.data_dir, "specify data directory") + .add("debug", ¶m.debug, 0, "print debugging information") + .add("proof", ¶m.proof, 0, "texts are drawn on both text layer and background for proof.") + + // meta + .add("version,v", "print copyright and version info", &show_version_and_exit) + .add("help,h", "print usage information", &show_usage_and_exit) + + .add("", ¶m.input_filename, "", "") + .add("", ¶m.output_filename, "", "") + ; + + try + { + argparser.parse(argc, argv); + } + catch(const char * s) + { + // if s == "", getopt_long would have printed the error message + if(s && s[0]) + { + cerr << "Error when parsing the arguments:" << endl; + cerr << s << endl; + } + exit(EXIT_FAILURE); + } + catch(const std::string & s) + { + // if s == "", getopt_long would have printed the error message + if(s != "") + { + cerr << "Error when parsing the arguments:" << endl; + cerr << s << endl; + } + exit(EXIT_FAILURE); + } +} + +void check_param() +{ + if (param.input_filename == "") + { + show_usage_and_exit(); + } + + if(param.output_filename.empty()) + { + const string s = get_filename(param.input_filename); + if(get_suffix(param.input_filename) == ".pdf") + { + param.output_filename = s.substr(0, s.size() - 4) + ".html"; + } + else + { + param.output_filename = s + ".html"; + } + } + + if(param.page_filename.empty()) + { + const string s = get_filename(param.input_filename); + if(get_suffix(param.input_filename) == ".pdf") + { + param.page_filename = s.substr(0, s.size() - 4) + "%d.page"; + } + else + { + param.page_filename = s + "%d.page"; + } + sanitize_filename(param.page_filename); + } + + else + { + // Need to make sure we have a page number placeholder in the filename + if(!sanitize_filename(param.page_filename)) + { + // Inject the placeholder just before the 
file extension + const string suffix = get_suffix(param.page_filename); + param.page_filename = param.page_filename.substr(0, param.page_filename.size() - suffix.size()) + "%d" + suffix; + sanitize_filename(param.page_filename); + } + } + if(param.css_filename.empty()) + { + const string s = get_filename(param.input_filename); + + if(get_suffix(param.input_filename) == ".pdf") + { + param.css_filename = s.substr(0, s.size() - 4) + ".css"; + } + else + { + param.css_filename = s + ".css"; + } + } + if(param.outline_filename.empty()) + { + const string s = get_filename(param.input_filename); + + if(get_suffix(param.input_filename) == ".pdf") + { + param.outline_filename = s.substr(0, s.size() - 4) + ".outline"; + } + else + { + if(!param.split_pages) + param.outline_filename = s + ".outline"; + } + } + + if(false) { } +#ifdef ENABLE_LIBPNG + else if (param.bg_format == "png") { } +#endif +#ifdef ENABLE_LIBJPEG + else if (param.bg_format == "jpg") { } +#endif +#if ENABLE_SVG + else if(param.bg_format == "svg") { } +#endif + else + { + cerr << "Image format not supported: " << param.bg_format << endl; + exit(EXIT_FAILURE); + } + +#if not ENABLE_SVG + if(param.process_type3) + { + cerr << "process-type3 is enabled, however SVG support is not built in this version of pdf2htmlEX." << endl; + exit(EXIT_FAILURE); + } +#endif + + if((param.font_format == "ttf") && (param.external_hint_tool == "")) + { + cerr << "Warning: No hint tool is specified for truetype fonts, the result may be rendered poorly in some circumstances." << endl; + } + + if (param.embed_image && (param.bg_format == "svg") && !param.svg_embed_bitmap) + { + cerr << "Warning: --svg-embed-bitmap is forced on because --embed-image is on, or the dumped bitmaps can't be loaded." << endl; + param.svg_embed_bitmap = 1; + } +} + +int main(int argc, char **argv) +{ + // We need to adjust these directories before parsing the options. 
+#if defined(__MINGW32__) + param.data_dir = get_exec_dir(argv[0]); + param.tmp_dir = get_tmp_dir(); +#else + char const* tmp = getenv("TMPDIR"); +#ifdef P_tmpdir + if (!tmp) + tmp = P_tmpdir; +#endif +#ifdef _PATH_TMP + if (!tmp) + tmp = _PATH_TMP; +#endif + if (!tmp) + tmp = "/tmp"; + param.tmp_dir = string(tmp); + param.data_dir = PDF2HTMLEX_DATA_PATH; +#endif + + parse_options(argc, argv); + check_param(); + + //prepare the directories + prepare_directories(); + + if(param.debug) + cerr << "temporary dir: " << (param.tmp_dir) << endl; + + try + { + create_directories(param.dest_dir); + } + catch (const string & s) + { + cerr << s << endl; + exit(EXIT_FAILURE); + } + + bool finished = false; + // read config file + globalParams = new GlobalParams(); + // open PDF file + PDFDoc * doc = nullptr; + try + { + { + GooString * ownerPW = (param.owner_password == "") ? (nullptr) : (new GooString(param.owner_password.c_str())); + GooString * userPW = (param.user_password == "") ? (nullptr) : (new GooString(param.user_password.c_str())); + GooString fileName(param.input_filename.c_str()); + + doc = PDFDocFactory().createPDFDoc(fileName, ownerPW, userPW); + + delete userPW; + delete ownerPW; + } + + if (!doc->isOk()) + throw "Cannot read the file"; + + // check for copy permission + if (!doc->okToCopy()) + { + if (param.no_drm == 0) + throw "Copying of text from this document is not allowed."; + cerr << "Document has copy-protection bit set." 
<< endl; + } + + param.first_page = min<int>(max<int>(param.first_page, 1), doc->getNumPages()); + param.last_page = min<int>(max<int>(param.last_page, param.first_page), doc->getNumPages()); + + + unique_ptr<HTMLRenderer>(new HTMLRenderer(param))->process(doc); + + finished = true; + } + catch (const char * s) + { + cerr << "Error: " << s << endl; + } + catch (const string & s) + { + cerr << "Error: " << s << endl; + } + + // clean up + delete doc; + delete globalParams; + + // check for memory leaks + Object::memCheck(stderr); + gMemReport(stderr); + + exit(finished ? (EXIT_SUCCESS) : (EXIT_FAILURE)); + + return 0; +} diff --git a/src/util/const.cc b/src/util/const.cc new file mode 100644 index 0000000..c85e0d5 --- /dev/null +++ b/src/util/const.cc @@ -0,0 +1,53 @@ +/* + * Constants + * + * by WangLu + * 2012.11.29 + */ + +#include "const.h" + +namespace pdf2htmlEX { + +using std::map; +using std::string; + +const double ID_MATRIX[6] = {1.0, 0.0, 0.0, 1.0, 0.0, 0.0}; + +const map<string, string> GB_ENCODED_FONT_NAME_MAP({ + {"\xCB\xCE\xCC\xE5", "SimSun"}, + {"\xBA\xDA\xCC\xE5", "SimHei"}, + {"\xBF\xAC\xCC\xE5_GB2312", "SimKai"}, + {"\xB7\xC2\xCB\xCE_GB2312", "SimFang"}, + {"\xC1\xA5\xCA\xE9", "SimLi"}, +}); + +const std::map<std::string, EmbedStringEntry> EMBED_STRING_MAP({ + {".css", {&Param::embed_css, + "<style type=\"text/css\">", + "</style>", false, + "<link rel=\"stylesheet\" href=\"", + "\"/>" }}, + {".js", {&Param::embed_javascript, + "<script>", + "</script>", false, + "<script src=\"", + "\"></script>" }}, + {".png", {&Param::embed_image, + "<img alt=\"\" src=\"data:image/png;base64,", + "\"/>", true, + "<img alt=\"\" src=\"", + "\"/>" }} +}); + +const std::map<std::string, std::string> FORMAT_MIME_TYPE_MAP({ + {"eot", "application/vnd.ms-fontobject"}, + {"jpg", "image/jpeg"}, + {"otf", "application/x-font-otf"}, + {"png", "image/png"}, + {"svg", "image/svg+xml"}, + {"ttf", "application/x-font-ttf"}, + {"woff", "application/font-woff"}, +}); + +} 
//namespace pdf2htmlEX diff --git a/src/util/const.h b/src/util/const.h new file mode 100644 index 0000000..db29a5c --- /dev/null +++ b/src/util/const.h @@ -0,0 +1,46 @@ +/* + * Constants + * + * by WangLu + * 2012.11.29 + */ + +#ifndef CONST_H__ +#define CONST_H__ + +#include <map> +#include <string> + +#include "Param.h" + +namespace pdf2htmlEX { + +#ifndef nullptr +#define nullptr (NULL) +#endif + +static const double EPS = 1e-6; +static const double DEFAULT_DPI = 72.0; +extern const double ID_MATRIX[6]; + +// For GB encoded font names +extern const std::map<std::string, std::string> GB_ENCODED_FONT_NAME_MAP; +// map to embed files into html +struct EmbedStringEntry +{ + int Param::*embed_flag; + // used when *embed_flag == true + std::string prefix_embed; + std::string suffix_embed; + bool base64_encode; + // used when *embed_flag == false + std::string prefix_external; + std::string suffix_external; +}; +extern const std::map<std::string, EmbedStringEntry> EMBED_STRING_MAP; + +extern const std::map<std::string, std::string> FORMAT_MIME_TYPE_MAP; + +} // namespace pdf2htmlEX + +#endif //CONST_H__ diff --git a/src/util/css_const.h.in b/src/util/css_const.h.in new file mode 100644 index 0000000..08c23fc --- /dev/null +++ b/src/util/css_const.h.in @@ -0,0 +1,67 @@ +/* vim: set filetype=cpp : */ +/* + * css_const.h + * + * Copyright (C) 2013 Lu Wang <coolwanglu@gmail.com> + */ + +#ifndef CSS_CONST_H__ +#define CSS_CONST_H__ + + +/* + * should be consistent with base.css and pdf2htmlEX.js + */ + +namespace pdf2htmlEX { +namespace CSS { + +// work around strings +const char * const WEBKIT_ONLY = "@media screen and (-webkit-min-device-pixel-ratio:0)"; +const char * const PRINT_ONLY = "@media print"; + +// usually the class name is XXX_CN or XXX_CN<hex id> +// sometimes we need a special one, e.g. 
transparent color, where the id is -1 +const char * const INVALID_ID = "@CSS_INVALID_ID@"; + +const char * const LINE_CN = "@CSS_LINE_CN@"; +const char * const TRANSFORM_MATRIX_CN = "@CSS_TRANSFORM_MATRIX_CN@"; +const char * const CLIP_CN = "@CSS_CLIP_CN@"; + +// page_decoration is for shadow etc +// page_frame cannot have margin or border-width, pdf2htmlEX.js will use it to determine the coordinates +// page_content holds everything inside the page, could be hidden to speed up rendering +// page_data holds data for pdf2htmlEX.js +const char * const PAGE_DECORATION_CN = "@CSS_PAGE_DECORATION_CN@"; +const char * const PAGE_FRAME_CN = "@CSS_PAGE_FRAME_CN@"; +const char * const PAGE_CONTENT_BOX_CN = "@CSS_PAGE_CONTENT_BOX_CN@"; +const char * const PAGE_DATA_CN = "@CSS_PAGE_DATA_CN@"; + +const char * const BACKGROUND_IMAGE_CN = "@CSS_BACKGROUND_IMAGE_CN@"; +const char * const FULL_BACKGROUND_IMAGE_CN = "@CSS_FULL_BACKGROUND_IMAGE_CN@"; + +const char * const FONT_FAMILY_CN = "@CSS_FONT_FAMILY_CN@"; +const char * const FONT_SIZE_CN = "@CSS_FONT_SIZE_CN@"; +const char * const FILL_COLOR_CN = "@CSS_FILL_COLOR_CN@"; +const char * const STROKE_COLOR_CN = "@CSS_STROKE_COLOR_CN@"; +const char * const LETTER_SPACE_CN = "@CSS_LETTER_SPACE_CN@"; +const char * const WORD_SPACE_CN = "@CSS_WORD_SPACE_CN@"; +const char * const VERTICAL_ALIGN_CN = "@CSS_VERTICAL_ALIGN_CN@"; +const char * const WHITESPACE_CN = "@CSS_WHITESPACE_CN@"; +const char * const LEFT_CN = "@CSS_LEFT_CN@"; +const char * const HEIGHT_CN = "@CSS_HEIGHT_CN@"; +const char * const WIDTH_CN = "@CSS_WIDTH_CN@"; +const char * const BOTTOM_CN = "@CSS_BOTTTOM_CN@"; + +const char * const CSS_DRAW_CN = "@CSS_CSS_DRAW_CN@"; +const char * const LINK_CN = "@CSS_LINK_CN@"; + +const char * const INPUT_TEXT_CN = "@CSS_INPUT_TEXT_CN@"; +const char * const INPUT_RADIO_CN = "@CSS_INPUT_RADIO_CN@"; +const char * const RADIO_CHECKED_CN = "@CSS_RADIO_CHECKED_CN@"; + +} +} + + +#endif //CSS_CONST_H__ diff --git a/src/util/encoding.cc 
b/src/util/encoding.cc new file mode 100644 index 0000000..6b600bc --- /dev/null +++ b/src/util/encoding.cc @@ -0,0 +1,182 @@ +/* + * Encodings used in HTML + * + * by WangLu + * 2013.02.15 + */ + +#include <cstring> + +#include "encoding.h" +#include "const.h" // for nullptr + +namespace pdf2htmlEX { + +using std::ostream; +using std::string; + +/* + * Copied from UTF.h / UTF8.h in poppler + */ +static int mapUTF8(Unicode u, char *buf, int bufSize) +{ + if (u <= 0x0000007f) { + if (bufSize < 1) { + return 0; + } + buf[0] = (char)u; + return 1; + } else if (u <= 0x000007ff) { + if (bufSize < 2) { + return 0; + } + buf[0] = (char)(0xc0 + (u >> 6)); + buf[1] = (char)(0x80 + (u & 0x3f)); + return 2; + } else if (u <= 0x0000ffff) { + if (bufSize < 3) { + return 0; + } + buf[0] = (char)(0xe0 + (u >> 12)); + buf[1] = (char)(0x80 + ((u >> 6) & 0x3f)); + buf[2] = (char)(0x80 + (u & 0x3f)); + return 3; + } else if (u <= 0x0010ffff) { + if (bufSize < 4) { + return 0; + } + buf[0] = (char)(0xf0 + (u >> 18)); + buf[1] = (char)(0x80 + ((u >> 12) & 0x3f)); + buf[2] = (char)(0x80 + ((u >> 6) & 0x3f)); + buf[3] = (char)(0x80 + (u & 0x3f)); + return 4; + } else { + return 0; + } +} + +void writeUnicodes(ostream & out, const Unicode * u, int uLen) +{ + for(int i = 0; i < uLen; ++i) + { + switch(u[i]) + { + case '&': + out << "&"; + break; + case '\"': + out << """; + break; + case '\'': + out << "'"; + break; + case '<': + out << "<"; + break; + case '>': + out << ">"; + break; + default: + { + char buf[4]; + auto n = mapUTF8(u[i], buf, 4); + out.write(buf, n); + } + } + } +} + +/* +static void writeHEX(ostream & out, char c) +{ + static const char * hexchars = "0123456789abcdef"; + out << hexchars[(c>>4)&0xf] << hexchars[c&0xf]; +} + +void writeURL(ostream & out, const string & s) +{ + static char * dont_escape = nullptr; + if(!dont_escape) + { + dont_escape = new char [256]; + memset(dont_escape, 0, 256 * sizeof(char)); + / * + * http://tools.ietf.org/html/rfc3986#section-2 + * + 
/*
 * Write `s` to `out` escaped for use inside a JSON string literal
 * (the surrounding quotes are not written).
 *
 * - '\\', '"', '/' and the short control escapes are backslash-escaped.
 * - '\'' is written as \u0027: JSON has no \' escape, so the previous
 *   output was rejected by strict parsers; \u0027 decodes to the same
 *   character.
 * - any other control character below 0x20 is written as \u00XX, since
 *   JSON does not allow raw control characters in strings.
 * - all other bytes (including UTF-8 sequences) are copied unchanged.
 */
void writeJSON(std::ostream & out, const std::string & s)
{
    static const char * const hexchars = "0123456789abcdef";
    for(auto c : s)
    {
        switch (c)
        {
            case '\\': out << "\\\\"; break;
            case '"':  out << "\\\""; break;
            case '\'': out << "\\u0027"; break;
            case '/':  out << "\\/"; break;
            case '\b': out << "\\b"; break;
            case '\f': out << "\\f"; break;
            case '\n': out << "\\n"; break;
            case '\r': out << "\\r"; break;
            case '\t': out << "\\t"; break;
            default:
                if(static_cast<unsigned char>(c) < 0x20)
                    out << "\\u00" << hexchars[(c >> 4) & 0xf] << hexchars[c & 0xf];
                else
                    out << c;
                break;
        }
    }
}
std::string & s); + +/* + * HTML tag attribute escaping + */ +void writeAttribute(std::ostream & out, const std::string & s); + +} // namespace pdf2htmlEX + +#endif //ENCODING_H__ diff --git a/src/util/ffw.c b/src/util/ffw.c new file mode 100644 index 0000000..b88efce --- /dev/null +++ b/src/util/ffw.c @@ -0,0 +1,485 @@ +/* + * ffw.c: Fontforge wrapper + * + * Processing fonts using Fontforge + * + * Copyright (C) 2012-2014 Lu Wang <coolwanglu@gmail.com> + */ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <stdarg.h> +#include <assert.h> +#include <math.h> + +#include <fontforge.h> +#include <baseviews.h> + +#include "ffw.h" + +static real EPS=1e-6; + +static inline int min(int a, int b) +{ + return (a<b)?a:b; +} + +static FontViewBase * cur_fv = NULL; +static Encoding * original_enc = NULL; +static Encoding * unicodefull_enc = NULL; +static Encoding * enc_head = NULL; + +static void err(const char * format, ...) +{ + va_list al; + va_start(al, format); + vfprintf(stderr, format, al); + va_end(al); + exit(-1); +} +static char * strcopy(const char * str) +{ + if(str == NULL) return NULL; + + char * _ = strdup(str); + if(!_) + err("Not enough memory"); + return _; +} + +static void dumb_logwarning(const char * format, ...) { } + +static void dumb_post_error(const char * title, const char * error, ...) 
{ } + +void ffw_init(int debug) +{ + InitSimpleStuff(); + if ( default_encoding==NULL ) + default_encoding=FindOrMakeEncoding("ISO8859-1"); + if ( default_encoding==NULL ) + default_encoding=&custom; /* In case iconv is broken */ + + if(!debug) + { + //disable error output of Fontforge + ui_interface->logwarning = &dumb_logwarning; + ui_interface->post_error = &dumb_post_error; + } + + original_enc = FindOrMakeEncoding("original"); + unicodefull_enc = FindOrMakeEncoding("UnicodeFull"); + + { + Val v; + v.type = v_int; + v.u.ival = 1; + SetPrefs("DetectDiagonalStems", &v, NULL); + } +} + +void ffw_finalize(void) +{ + while(enc_head) + { + Encoding * next = enc_head->next; + free((void*)enc_head->enc_name); + free(enc_head->unicode); + if(enc_head->psnames) + { + int i; + for(i = 0; i < enc_head->char_cnt; ++i) + free((void*)enc_head->psnames[i]); + free(enc_head->psnames); + } + free(enc_head); + enc_head = next; + } +} + +long ffw_get_version(void) +{ + return FONTFORGE_VERSIONDATE_RAW; +} + +void ffw_new_font() +{ + assert((cur_fv == NULL) && "Previous font is not destroyed"); + cur_fv = FVAppend(_FontViewCreate(SplineFontNew())); +} + +void ffw_load_font(const char * filename) +{ + assert((cur_fv == NULL) && "Previous font is not destroyed"); + + char * _filename = strcopy(filename); + SplineFont * font = LoadSplineFont(_filename, 1); + + free(_filename); + + if(!font) + err("Cannot load font %s\n", filename); + + if(!font->fv) + FVAppend(_FontViewCreate(font)); + + assert(font->fv); + + cur_fv = font->fv; +} + +/* + * Fight again dirty stuffs + */ +void ffw_prepare_font(void) +{ + memset(cur_fv->selected, 1, cur_fv->map->enccount); + // remove kern + FVRemoveKerns(cur_fv); + FVRemoveVKerns(cur_fv); + + /* + * Remove Alternate Unicodes + * We never use them because we will do a force encoding + */ + int i; + SplineFont * sf = cur_fv->sf; + for(i = 0; i < sf->glyphcnt; ++i) + { + SplineChar * sc = sf->glyphs[i]; + if(sc) + { + struct altuni * p = sc->altuni; + 
if(p) + { + AltUniFree(p); + sc->altuni = NULL; + } + } + } + + /* + * Wipe out font name + * browsers may rejects fonts with malformed font names + */ + free(sf->fontname); + sf->fontname = strcopy(""); +} + +void ffw_save(const char * filename) +{ + char * _filename = strcopy(filename); + char * _ = strcopy(""); + + int r = GenerateScript(cur_fv->sf, _filename + , _, -1, -1, NULL, NULL, cur_fv->map, NULL, ly_fore); + + free(_); + free(_filename); + + if(!r) + err("Cannot save font to %s\n", filename); +} +void ffw_close(void) +{ + FontViewClose(cur_fv); + cur_fv = NULL; +} + +static void ffw_do_reencode(Encoding * encoding, int force) +{ + assert(encoding); + + if(force) + { + SFForceEncoding(cur_fv->sf, cur_fv->map, encoding); + } + else + { + EncMapFree(cur_fv->map); + cur_fv->map = EncMapFromEncoding(cur_fv->sf, encoding); + } + if(cur_fv->normal) + { + EncMapFree(cur_fv->normal); + cur_fv->normal = NULL; + } + + SFReplaceEncodingBDFProps(cur_fv->sf, cur_fv->map); + + free(cur_fv->selected); + cur_fv->selected = calloc(cur_fv->map->enccount, sizeof(char)); +} + +void ffw_reencode_glyph_order(void) +{ + ffw_do_reencode(original_enc, 0); +} + +void ffw_reencode_unicode_full(void) +{ + ffw_do_reencode(unicodefull_enc, 0); +} + +void ffw_reencode(const char * encname, int force) +{ + Encoding * enc = FindOrMakeEncoding(encname); + if(!enc) + err("Unknown encoding %s\n", encname); + + ffw_do_reencode(enc, force); +} + +void ffw_reencode_raw(int32 * mapping, int mapping_len, int force) +{ + Encoding * enc = calloc(1, sizeof(Encoding)); + enc->only_1byte = enc->has_1byte = true; + + int len = (mapping_len < 256) ? 
256 : mapping_len; + enc->char_cnt = len; + enc->unicode = (int32_t*)malloc(len * sizeof(int32_t)); + memcpy(enc->unicode, mapping, mapping_len * sizeof(int32_t)); + if(mapping_len < 256) + { + int i; + for(i = mapping_len; i < 256; ++i) + enc->unicode[i] = -1; + } + + enc->enc_name = strcopy(""); + + enc->next = enc_head; + enc_head = enc; + + ffw_do_reencode(enc, force); +} + +void ffw_reencode_raw2(char ** mapping, int mapping_len, int force) +{ + Encoding * enc = calloc(1, sizeof(Encoding)); + enc->enc_name = strcopy(""); + enc->char_cnt = mapping_len; + enc->unicode = (int32_t*)malloc(mapping_len * sizeof(int32_t)); + enc->psnames = (char**)calloc(mapping_len, sizeof(char*)); + int i; + for(i = 0; i < mapping_len; ++i) + { + if(mapping[i]) + { + enc->unicode[i] = UniFromName(mapping[i], ui_none, &custom); + enc->psnames[i] = strcopy(mapping[i]); + } + else + { + enc->unicode[i] = -1; + } + } + + enc->next = enc_head; + enc_head = enc; + + ffw_do_reencode(enc, force); +} + +void ffw_cidflatten(void) +{ + if(!cur_fv->sf->cidmaster) + { + fprintf(stderr, "Cannot flatten a non-CID font\n"); + return; + } + SFFlatten(cur_fv->sf->cidmaster); +} + +/* + * There is no check if a glyph with the same unicode exists! + * TODO: let FontForge fill in the standard glyph name <- or maybe this might cause collision? 
+ */ +void ffw_add_empty_char(int32_t unicode, int width) +{ + SplineChar * sc = SFMakeChar(cur_fv->sf, cur_fv->map, cur_fv->map->enccount); + char buffer[400]; + SCSetMetaData(sc, + strcopy(StdGlyphName(buffer, unicode, + cur_fv->sf->uni_interp, cur_fv->sf->for_new_glyphs)), + unicode, sc->comment); + SCSynchronizeWidth(sc, width, sc->width, cur_fv); +} + +int ffw_get_em_size(void) +{ + return cur_fv->sf->ascent + cur_fv->sf->descent; +} + +void ffw_fix_metric() +{ + double ascent, descent; + ffw_get_metric(&ascent, &descent); + ffw_set_metric(ascent, descent); +} + +void ffw_get_metric(double * ascent, double * descent) +{ + SplineFont * sf = cur_fv->sf; + + DBounds bb; + SplineFontFindBounds(sf, &bb); + + int em = sf->ascent + sf->descent; + + if (em > 0) + { + *ascent = ((double)bb.maxy) / em; + *descent = ((double)bb.miny) / em; + } + else + { + *ascent = *descent = 0; + } +} + +void ffw_set_metric(double ascent, double descent) +{ + SplineFont * sf = cur_fv->sf; + struct pfminfo * info = &sf->pfminfo; + + SFDefaultOS2Info(info, sf, sf->fontname); + info->pfmset = 1; + sf->changed = 1; + + int em = sf->ascent + sf->descent; + int a = floor(ascent * em + 0.5); + int d = floor(descent * em + 0.5); + + if(a < 0) a = 0; + if(d > 0) d = 0; + + /* + sf->ascent = min(a, em); + sf->descent = em - bb.maxy; + */ + + /* + * The embedded fonts are likely to have inconsistent values for the 3 sets of ascent/descent + * PDF viewers don't care, since they don't even use these values + * But have to unify them, for different browsers on different platforms + * Things may become easier when there are CSS rules for baseline-based positioning. 
+ */ + info->os2_winascent = a; + info->os2_typoascent = a; + info->hhead_ascent = a; + info->winascent_add = 0; + info->typoascent_add = 0; + info->hheadascent_add = 0; + + info->os2_windescent = -d; + info->os2_typodescent = d; + info->hhead_descent = d; + info->windescent_add = 0; + info->typodescent_add = 0; + info->hheaddescent_add = 0; + + info->os2_typolinegap = 0; + info->linegap = 0; +} + +/* + * TODO:bitmap, reference have not been considered in this function + */ +void ffw_set_widths(int * width_list, int mapping_len, + int stretch_narrow, int squeeze_wide) +{ + SplineFont * sf = cur_fv->sf; + + if(sf->onlybitmaps + && cur_fv->active_bitmap != NULL + && sf->bitmaps != NULL) + { + printf("TODO: width vs bitmap\n"); + } + + EncMap * map = cur_fv->map; + int i; + int imax = min(mapping_len, map->enccount); + for(i = 0; i < imax; ++i) + { + /* + * Don't mess with it if the glyphs is not used. + */ + if(width_list[i] == -1) + { + continue; + } + + int j = map->map[i]; + if(j == -1) continue; + + SplineChar * sc = sf->glyphs[j]; + if(sc == NULL) + { + sc = SFMakeChar(cur_fv->sf, cur_fv->map, j); + } + else if(((sc->width > EPS) + && (((sc->width > width_list[i] + EPS) && (squeeze_wide)) + || ((sc->width < width_list[i] - EPS) && (stretch_narrow))))) + { + real transform[6]; + transform[0] = ((double)width_list[i]) / (sc->width); + transform[3] = 1.0; + transform[1] = transform[2] = transform[4] = transform[5] = 0; + FVTrans(cur_fv, sc, transform, NULL, fvt_alllayers | fvt_dontmovewidth); + } + + SCSynchronizeWidth(sc, width_list[i], sc->width, cur_fv); + } +} + +void ffw_import_svg_glyph(int code, const char * filename, double ox, double oy, double width) +{ + int enc = SFFindSlot(cur_fv->sf, cur_fv->map, code, ""); + if(enc == -1) + return; + + SplineChar * sc = SFMakeChar(cur_fv->sf, cur_fv->map, enc); + + memset(cur_fv->selected, 0, cur_fv->map->enccount); + cur_fv->selected[enc] = 1; + int ok = FVImportImages(cur_fv, (char*)filename, fv_svg, 0, -1); + 
if(!ok) + err("Import SVG glyph failed"); + + // correct origin and width + { + int a = cur_fv->sf->ascent; + int d = cur_fv->sf->descent; + real transform[6]; + transform[0] = 1.0; + transform[3] = 1.0; + transform[1] = transform[2] = 0.0; + transform[4] = -ox * (a+d); + transform[5] = -oy * (a+d) + d; + FVTrans(cur_fv, sc, transform, NULL, fvt_alllayers | fvt_dontmovewidth); + + SCSynchronizeWidth(sc, floor(width * (a+d) + 0.5), sc->width, cur_fv); + } +} + +void ffw_auto_hint(void) +{ + // convert to quadratic + if(!(cur_fv->sf->layers[ly_fore].order2)) + { + SFCloseAllInstrs(cur_fv->sf); + SFConvertToOrder2(cur_fv->sf); + } + memset(cur_fv->selected, 1, cur_fv->map->enccount); + FVAutoHint(cur_fv); + FVAutoInstr(cur_fv); +} + +void ffw_override_fstype(void) +{ + *(int16 *)(&cur_fv->sf->pfminfo.fstype) = 0; + cur_fv->sf->pfminfo.pfmset = true; + cur_fv->sf->changed = true; +} diff --git a/src/util/ffw.h b/src/util/ffw.h new file mode 100644 index 0000000..a01ed79 --- /dev/null +++ b/src/util/ffw.h @@ -0,0 +1,74 @@ +/* + * ffw.h : Fontforge Wrapper + * + * Processing fonts using Fontforge + * + * fontforge.h cannot be included in C++ + * So this wrapper in C publishes several functions we need + * + * by WangLu + * 2012.09.03 + */ + + +#ifdef __cplusplus +#include <cstdint> +namespace pdf2htmlEX { +extern "C" { +#else +#include <stdint.h> +#endif + +//////////////////////// +// global +void ffw_init(int debug); +void ffw_finalize(void); +long ffw_get_version(void); + +//////////////////////// +// load & save +void ffw_new_font(); +void ffw_load_font(const char * filename); +void ffw_prepare_font(void); + +void ffw_save(const char * filename); +void ffw_close(void); + +//////////////////////// +// encoding +void ffw_reencode_glyph_order(void); +void ffw_reencode_unicode_full(void); +void ffw_reencode_raw(int32_t * mapping, int mapping_len, int force); +void ffw_reencode_raw2(char ** mapping, int mapping_len, int force); + +void ffw_cidflatten(void); +// add a new 
empty char into the font +void ffw_add_empty_char(int32_t unicode, int width); + +//////////////////////// +// metrics +int ffw_get_em_size(void); +// manipulate ascent and descent +// ascent is between 0 and 1 +// descent is between -1 and 0 +void ffw_fix_metric(); +// get ascent/descent based on the shape +void ffw_get_metric(double * ascent, double * descent); +// set corresponding fields +void ffw_set_metric(double ascent, double descent); + +void ffw_set_widths(int * width_list, int mapping_len, + int stretch_narrow, int squeeze_wide); + +//////////////////////// +// others +// (ox,oy) is the position of the true origin, fractions related to em_size +// also true for glyph_width +void ffw_import_svg_glyph(int code, const char * filename, double ox, double oy, double glyph_width); +void ffw_auto_hint(void); +void ffw_override_fstype(void); + +#ifdef __cplusplus +} +} +#endif diff --git a/src/util/math.cc b/src/util/math.cc new file mode 100644 index 0000000..1ddabce --- /dev/null +++ b/src/util/math.cc @@ -0,0 +1,90 @@ +#include <cstring> +#include <limits> +#include <algorithm> + +#include "math.h" + +using std::min; +using std::max; + +namespace pdf2htmlEX { + +void tm_transform(const double * tm, double & x, double & y, bool is_delta) +{ + double xx = x, yy = y; + x = tm[0] * xx + tm[2] * yy; + y = tm[1] * xx + tm[3] * yy; + if(!is_delta) + { + x += tm[4]; + y += tm[5]; + } +} + +void tm_multiply(double * tm_left, const double * tm_right) +{ + double old[4]; + memcpy(old, tm_left, sizeof(old)); + + tm_left[0] = old[0] * tm_right[0] + old[2] * tm_right[1]; + tm_left[1] = old[1] * tm_right[0] + old[3] * tm_right[1]; + tm_left[2] = old[0] * tm_right[2] + old[2] * tm_right[3]; + tm_left[3] = old[1] * tm_right[2] + old[3] * tm_right[3]; + tm_left[4] += old[0] * tm_right[4] + old[2] * tm_right[5]; + tm_left[5] += old[1] * tm_right[4] + old[3] * tm_right[5]; +} + +void tm_transform_bbox(const double * tm, double * bbox) +{ + double & x1 = bbox[0]; + double & y1 = 
bbox[1]; + double & x2 = bbox[2]; + double & y2 = bbox[3]; + double _[4][2]; + _[0][0] = _[1][0] = x1; + _[0][1] = _[2][1] = y1; + _[2][0] = _[3][0] = x2; + _[1][1] = _[3][1] = y2; + + x1 = y1 = std::numeric_limits<double>::max(); + x2 = y2 = std::numeric_limits<double>::lowest(); // lowest(), not min(): min() is the smallest positive double, so it cannot seed a running maximum + for(int i = 0; i < 4; ++i) + { + auto & x = _[i][0]; + auto & y = _[i][1]; + tm_transform(tm, x, y); + if(x < x1) x1 = x; + if(x > x2) x2 = x; + if(y < y1) y1 = y; + if(y > y2) y2 = y; + } +} + +bool bbox_intersect(const double * bbox1, const double * bbox2, double * result) +{ + double x0, y0, x1, y1; + + x0 = max(min(bbox1[0], bbox1[2]), min(bbox2[0], bbox2[2])); + x1 = min(max(bbox1[0], bbox1[2]), max(bbox2[0], bbox2[2])); + + if (x0 >= x1) + return false; + + y0 = max(min(bbox1[1], bbox1[3]), min(bbox2[1], bbox2[3])); + y1 = min(max(bbox1[1], bbox1[3]), max(bbox2[1], bbox2[3])); + + if (y0 >= y1) + return false; + + if (result) + { + result[0] = x0; + result[1] = y0; + result[2] = x1; + result[3] = y1; + } + return true; +} + +} //namespace pdf2htmlEX + diff --git a/src/util/math.h b/src/util/math.h new file mode 100644 index 0000000..8302a93 --- /dev/null +++ b/src/util/math.h @@ -0,0 +1,59 @@ +/* + * Math functions + * + * by WangLu + * 2012.11.29 + */ + +#ifndef MATH_H__ +#define MATH_H__ + +#include <cmath> + +#include "const.h" + +namespace pdf2htmlEX { + +static inline double round(double x) { return (std::abs(x) > EPS) ? 
x : 0.0; } +static inline bool equal(double x, double y) { return std::abs(x-y) <= EPS; } +static inline bool is_positive(double x) { return x > EPS; } +static inline bool tm_equal(const double * tm1, const double * tm2, int size = 6) +{ + for(int i = 0; i < size; ++i) + if(!equal(tm1[i], tm2[i])) + return false; + return true; +} + +static inline void tm_init(double * tm) +{ + tm[0] = tm[3] = 1; + tm[1] = tm[2] = tm[4] = tm[5] = 0; +} + +static inline void tm_multiply(double * result, const double * m1, const double * m2) +{ + result[0] = m1[0] * m2[0] + m1[2] * m2[1]; + result[1] = m1[1] * m2[0] + m1[3] * m2[1]; + result[2] = m1[0] * m2[2] + m1[2] * m2[3]; + result[3] = m1[1] * m2[2] + m1[3] * m2[3]; + result[4] = m1[0] * m2[4] + m1[2] * m2[5] + m1[4]; + result[5] = m1[1] * m2[4] + m1[3] * m2[5] + m1[5]; +} + +static inline double hypot(double x, double y) { return std::sqrt(x*x+y*y); } + +void tm_transform(const double * tm, double & x, double & y, bool is_delta = false); +void tm_multiply(double * tm_left, const double * tm_right); +void tm_transform_bbox(const double * tm, double * bbox); +/** + * Calculate the intersection of 2 boxes. + * If they are intersecting, store the result to result (if not null) and return true. + * Otherwise return false, and result is not touched. + * Param result can be same as one of bbox1 and bbox2. + * Data in boxes are expected in the order of (x0, y0, x1, y1). 
+ */ +bool bbox_intersect(const double * bbox1, const double * bbox2, double * result = nullptr); + +} //namespace pdf2htmlEX +#endif //MATH_H__ diff --git a/src/util/mingw.cc b/src/util/mingw.cc new file mode 100644 index 0000000..5d75be0 --- /dev/null +++ b/src/util/mingw.cc @@ -0,0 +1,64 @@ +/* + * Win32 specific functions + * + * by MarcSanfacon + * 2014.01.13 + */ + +#ifdef __MINGW32__ + +#include <string> +#include <iostream> +#include <cstdio> +#include <cstdlib> +#include <limits.h> +#include <libgen.h> + +#include "mingw.h" + +using namespace std; + +char* mkdtemp(char* temp) +{ + char *filename = nullptr; + if (temp != nullptr) { + filename = mktemp(temp); + if (filename != nullptr) { + if (_mkdir(temp) != 0) { + filename = nullptr; + } + } + } + + return filename; +} + +namespace pdf2htmlEX { +string get_exec_dir(char *dir) +{ + // Under Windows, the default data_dir is under /data in the pdf2htmlEX directory + string s = dirname(dir); + if (s == ".") { + char* wd(getcwd(nullptr, PATH_MAX)); + if (wd != nullptr) s = wd; // getcwd() returns a null pointer on failure; keep "." instead of building a string from it + free(wd); + } + s += "/data"; + return s; +} + +string get_tmp_dir() +{ + // Under Windows, the temp path is not under /tmp, find it. + char *tmp = getenv("TMP"); + if (tmp == nullptr) { + tmp = getenv("TEMP"); + } + + return tmp != nullptr ? 
string(tmp) + "/" : "/"; +} + +} // namespace pdf2htmlEX; + +#endif //__MINGW32__ + diff --git a/src/util/mingw.h b/src/util/mingw.h new file mode 100644 index 0000000..89abf8a --- /dev/null +++ b/src/util/mingw.h @@ -0,0 +1,29 @@ +/* + * Win32 specific functions + * + * by MarcSanfacon + * 2014.01.13 + */ + +#ifndef MINGW_H__ +#define MINGW_H__ + +#ifdef __MINGW32__ + +#include <io.h> + +char *mkdtemp(char *temp); + +#include <direct.h> +#define mkdir(A, B) _mkdir(A) +#define stat _stat + +namespace pdf2htmlEX { + std::string get_exec_dir(char *dir); + std::string get_tmp_dir(); +} // namespace pdf2htmlEX + +#endif //__MINGW32__ + +#endif //MINGW_H__ + diff --git a/src/util/misc.cc b/src/util/misc.cc new file mode 100644 index 0000000..e2572c0 --- /dev/null +++ b/src/util/misc.cc @@ -0,0 +1,66 @@ +/* + * Misc functions + * + * + * by WangLu + * 2012.08.10 + */ + +#include <map> + +#include "misc.h" + +using std::cerr; +using std::endl; +using std::string; +using std::map; +using std::ostream; + +namespace pdf2htmlEX { + +void css_fix_rectangle_border_width(double x1, double y1, + double x2, double y2, + double border_width, + double & x, double & y, double & w, double & h, + double & border_top_bottom_width, + double & border_left_right_width) +{ + w = x2 - x1; + if(w > border_width) + { + w -= border_width; + border_left_right_width = border_width; + } + else + { + border_left_right_width = border_width + w/2; + w = 0; + } + x = x1 - border_width / 2; + + h = y2 - y1; + if(h > border_width) + { + h -= border_width; + border_top_bottom_width = border_width; + } + else + { + border_top_bottom_width = border_width + h/2; + h = 0; + } + y = y1 - border_width / 2; +} + +ostream & operator << (ostream & out, const GfxRGB & rgb) +{ + auto flags= out.flags(); + out << std::dec << "rgb(" + << (int)colToByte(rgb.r) << "," + << (int)colToByte(rgb.g) << "," + << (int)colToByte(rgb.b) << ")"; + out.flags(flags); + return out; +} + +} // namespace pdf2htmlEX diff --git 
a/src/util/misc.h b/src/util/misc.h new file mode 100644 index 0000000..9032e4e --- /dev/null +++ b/src/util/misc.h @@ -0,0 +1,39 @@ +/* + * Help classes and Functions + * + * by WangLu + * 2012.08.10 + */ + + +#ifndef UTIL_H__ +#define UTIL_H__ + +#include <iostream> + +#include <GfxState.h> + +#include "util/const.h" + +namespace pdf2htmlEX { + +static inline long long hash_ref(const Ref * id) +{ + return (((long long)(id->num)) << (sizeof(id->gen)*8)) | (id->gen); +} + +/* + * In PDF, edges of the rectangle are in the middle of the borders + * In HTML, edges are completely outside the rectangle + */ +void css_fix_rectangle_border_width(double x1, double y1, double x2, double y2, + double border_width, + double & x, double & y, double & w, double & h, + double & border_top_bottom_width, + double & border_left_right_width); + +std::ostream & operator << (std::ostream & out, const GfxRGB & rgb); + +} // namespace pdf2htmlEX + +#endif //UTIL_H__ diff --git a/src/util/namespace.h b/src/util/namespace.h new file mode 100644 index 0000000..46dcd0f --- /dev/null +++ b/src/util/namespace.h @@ -0,0 +1,21 @@ +/* + * namespace.h + * + * specifying common used namespace + * + * by WangLu + */ + +#ifndef NAMESPACE_H__ +#define NAMESPACE_H__ + +using std::hex; +using std::dec; +using std::string; +using std::endl; +using std::make_pair; +using std::ifstream; +using std::ofstream; + +#endif // NAMESPACE_H__ + diff --git a/src/util/path.cc b/src/util/path.cc new file mode 100644 index 0000000..5abc7a5 --- /dev/null +++ b/src/util/path.cc @@ -0,0 +1,141 @@ +/* + * Functions manipulating filenames and paths + * + * by WangLu + * 2012.11.29 + */ + +#include <errno.h> +#include <sys/stat.h> +#include <sys/types.h> +#include <cstring> + +#include "path.h" + +#ifdef __MINGW32__ +#include "util/mingw.h" +#endif + +using std::string; + +namespace pdf2htmlEX { + +void create_directories(const string & path) +{ + if(path.empty()) return; + + size_t idx = path.rfind('/'); + if(idx != 
string::npos) + { + create_directories(path.substr(0, idx)); + } + + int r = mkdir(path.c_str(), S_IRWXU); + if(r != 0) + { + if(errno == EEXIST) + { + struct stat stat_buf; + if((stat(path.c_str(), &stat_buf) == 0) && S_ISDIR(stat_buf.st_mode)) + return; + } + + throw string("Cannot create directory: ") + path; + } +} + +bool sanitize_filename(string & filename) +{ + string sanitized; + bool format_specifier_found = false; + + for(size_t i = 0; i < filename.size(); i++) + { + if('%' == filename[i]) + { + if(format_specifier_found) + { + sanitized.push_back('%'); + sanitized.push_back('%'); + } + else + { + // We haven't found the format specifier yet, so see if we can use this one as a valid formatter + size_t original_i = i; + string tmp; + tmp.push_back('%'); + while(++i < filename.size()) + { + tmp.push_back(filename[i]); + + // If we aren't still in option specifiers, stop looking + if(!strchr("0123456789", filename[i])) + { + break; + } + } + + // Check to see if we yielded a valid format specifier + if('d' == tmp[tmp.size()-1]) + { + // Found a valid integer format + sanitized.append(tmp); + format_specifier_found = true; + } + else + { + // Not a valid format specifier. Just append the protected % + // and keep looking from where we left of in the search + sanitized.push_back('%'); + sanitized.push_back('%'); + i = original_i; + } + } + } + else + { + sanitized.push_back(filename[i]); + } + } + + // Only sanitize if it is a valid format. 
+ if(format_specifier_found) + { + filename.assign(sanitized); + } + + return format_specifier_found; +} + +bool is_truetype_suffix(const string & suffix) +{ + return (suffix == ".ttf") || (suffix == ".ttc") || (suffix == ".otf"); +} + +string get_filename (const string & path) +{ + size_t idx = path.rfind('/'); + if(idx == string::npos) + return path; + else if (idx == path.size() - 1) + return ""; + return path.substr(idx + 1); +} + +string get_suffix(const string & path) +{ + string fn = get_filename(path); + size_t idx = fn.rfind('.'); + if(idx == string::npos) + return ""; + else + { + string s = fn.substr(idx); + for(auto & c : s) + c = tolower(c); + return s; + } +} + + +} //namespace pdf2htmlEX diff --git a/src/util/path.h b/src/util/path.h new file mode 100644 index 0000000..2a2a685 --- /dev/null +++ b/src/util/path.h @@ -0,0 +1,33 @@ +/* + * Function handling filenames and paths + * + * by WangLu + * 2012.11.29 + */ + +#ifndef PATH_H__ +#define PATH_H__ + +#include <string> + +namespace pdf2htmlEX { + +void create_directories(const std::string & path); + +bool is_truetype_suffix(const std::string & suffix); + +std::string get_filename(const std::string & path); +std::string get_suffix(const std::string & path); + +/** + * Sanitize all occurrences of '%' except for the first valid format specifier. Filename + * is only sanitized if a formatter is found, and the function returns true. + * + * @param filename the filename to be sanitized. Value will be modified. + * + * @return true if a format specifier was found, false otherwise. 
+ */ +bool sanitize_filename(std::string & filename); + +} //namespace pdf2htmlEX +#endif //PATH_H__ diff --git a/src/util/unicode.cc b/src/util/unicode.cc new file mode 100644 index 0000000..4a2a034 --- /dev/null +++ b/src/util/unicode.cc @@ -0,0 +1,70 @@ +/* + * Unicode manipulation functions + * + * Copyright (C) 2012-2014 Lu Wang <coolwanglu@gmail.com> + */ + +#include <iostream> + +#include <GlobalParams.h> + +#include "pdf2htmlEX-config.h" + +#include "unicode.h" + +namespace pdf2htmlEX { + +using std::cerr; +using std::endl; +using std::ostream; + +Unicode map_to_private(CharCode code) +{ + Unicode private_mapping = (Unicode)(code + 0xE000); + if(private_mapping > 0xF8FF) + { + private_mapping = (Unicode)((private_mapping - 0xF8FF) + 0xF0000); + if(private_mapping > 0xFFFFD) + { + private_mapping = (Unicode)((private_mapping - 0xFFFFD) + 0x100000); + if(private_mapping > 0x10FFFD) + { + cerr << "Warning: all private use unicode are used" << endl; + } + } + } + return private_mapping; +} + +Unicode unicode_from_font (CharCode code, GfxFont * font) +{ + if(!font->isCIDFont()) + { + char * cname = dynamic_cast<Gfx8BitFont*>(font)->getCharName(code); + // may be untranslated ligature + if(cname) + { + Unicode ou = globalParams->mapNameToUnicodeText(cname); + if(!is_illegal_unicode(ou)) + return ou; + } + } + + return map_to_private(code); +} + +Unicode check_unicode(Unicode * u, int len, CharCode code, GfxFont * font) +{ + if(len == 0) + return map_to_private(code); + + if(len == 1) + { + if(!is_illegal_unicode(*u)) + return *u; + } + + return unicode_from_font(code, font); +} + +} //namespace pdf2htmlEX diff --git a/src/util/unicode.h b/src/util/unicode.h new file mode 100644 index 0000000..2100695 --- /dev/null +++ b/src/util/unicode.h @@ -0,0 +1,84 @@ +/* + * Unicode manipulation functions + * + * by WangLu + * 2012.11.29 + */ + +#ifndef UNICODE_H__ +#define UNICODE_H__ + +#include <GfxFont.h> +#include <CharTypes.h> + +namespace pdf2htmlEX { + +/** + * Check 
whether a unicode character is illegal for the output HTML. + * Unlike PDF readers, browsers have special treatment for such characters (normally treated as + * zero-width space), regardless of metrics and glyphs provided by fonts. So these characters + * should be mapped to the unicode private area to "cheat" browsers, at the cost of losing actual + * unicode values in the HTML. + * + * The following chart shows illegal characters in HTML by webkit, mozilla, and pdf2htmlEX (p2h). + * pdf2htmlEX's illegal character set is the union of webkit's and mozilla's, plus illegal unicode + * characters. "[" and ")" surrounding ranges denote "inclusive" and "exclusive", respectively. + * + * 00(NUL)--09(\t)--0A(\n)--0D(\r)--20(SP)--7F(DEL)--9F(APC)--A0(NBSP)--AD(SHY)--061C(ALM)--1361(Ethiopic word space) + * webkit: [--------------------------------) [------------------) [-] + * moz: [--------------------------------) [---------] [-] + * p2h: [--------------------------------) [------------------] [-] [-] [-] + * + * 200B(ZWSP)--200C(ZWNJ)--200D(ZWJ)--200E(LRM)--200F(RLM)--2028(LSEP)--2029(PSEP)--202A(LRE)--202E(RLO)--2066(LRI)--2069(PDI) + * webkit: [-----------------------------------------------] [----------] + * moz: [-] [----------] [-] [-] [----------] [------------] + * p2h: [-----------------------------------------------] [-] [-] [----------] [------------] + * + * D800(surrogate)--DFFF(surrogate)--FEFF(ZWNBSP)--FFFC(ORC)--FFFE(non-char)--FFFF(non-char) + * webkit: [-] [-] + * moz: + * p2h: [------------------] [-] [-] [-----------------] + * + * Note: 0xA0 (no-break space) affects word-spacing; and if "white-space:pre" is specified, + * \n and \r can break lines, \t can shift text, so they are considered illegal. 
+ * + * Resources (retrieved at 2015-03-16) + * * webkit + * * Avoid querying the font cache for the zero-width space glyph ( https://bugs.webkit.org/show_bug.cgi?id=90673 ) + * * treatAsZeroWidthSpace( https://github.com/WebKit/webkit/blob/17bbff7400393e9389b40cc84ce005f7cc954680/Source/WebCore/platform/graphics/FontCascade.h#L272 ) + * * mozilla + * * IsInvalidChar( http://mxr.mozilla.org/mozilla-central/source/gfx/thebes/gfxTextRun.cpp#1973 ) + * * IsBidiControl( http://mxr.mozilla.org/mozilla-central/source/intl/unicharutil/util/nsBidiUtils.h#114 ) + * * Character encodings in HTML ( http://en.wikipedia.org/wiki/Character_encodings_in_HTML#HTML_character_references ) + * * CSS Text Spec ( http://dev.w3.org/csswg/css-text/ ) + * * unicode table ( http://unicode-table.com ) + * + * TODO Web specs? IE? + * + */ +inline bool is_illegal_unicode(Unicode c) +{ + return (c < 0x20) || (c >= 0x7F && c <= 0xA0) || (c == 0xAD) + || (c == 0x061C) || (c == 0x1361) + || (c >= 0x200B && c <= 0x200F) || (c == 0x2028) || (c == 0x2029) + || (c >= 0x202A && c <= 0x202E) || (c >= 0x2066 && c <= 0x2069) + || (c >= 0xD800 && c <= 0xDFFF) || (c == 0xFEFF) || (c == 0xFFFC) + || (c == 0xFFFE) || (c == 0xFFFF); +} + +Unicode map_to_private(CharCode code); + +/* * Try to determine the Unicode value directly from the information in the font */ +Unicode unicode_from_font (CharCode code, GfxFont * font); + +/* + * We have to use a single Unicode value to reencode fonts + * if we got multi-unicode values, it might be expanded ligature, try to restore it + * if we cannot figure it out at the end, use a private mapping + */ +Unicode check_unicode(Unicode * u, int len, CharCode code, GfxFont * font); + + +} // namespace pdf2htmlEX + +#endif //UNICODE_H__ |