summaryrefslogtreecommitdiff
path: root/html2markdown
diff options
context:
space:
mode:
author fiddlosopher <fiddlosopher@788f1e2b-df1e-0410-8736-df70ead52e1b> 2006-10-17 14:22:29 +0000
committer fiddlosopher <fiddlosopher@788f1e2b-df1e-0410-8736-df70ead52e1b> 2006-10-17 14:22:29 +0000
commit df7b68225101966051f8b592a27127bf789eb81e (patch)
tree a063e97ed58d0bdb2cbb5a95c3e8c1bcce54aa00 /html2markdown
parent e7dbfef4d8aa528d9245424e9c372e900a774c90 (diff)
initial import
git-svn-id: https://pandoc.googlecode.com/svn/trunk@2 788f1e2b-df1e-0410-8736-df70ead52e1b
Diffstat (limited to 'html2markdown')
-rw-r--r-- html2markdown 39
1 files changed, 39 insertions, 0 deletions
diff --git a/html2markdown b/html2markdown
new file mode 100644
index 000000000..3f9a4857e
--- /dev/null
+++ b/html2markdown
@@ -0,0 +1,39 @@
+#!/bin/sh -e
+# converts html to markdown
+# uses an available program to fetch URL and tidy to normalize it first
+
+[ -n "$(which pandoc)" ] || {
+ echo >&2 "You need 'pandoc' to use this program!"
+ exit 1
+}
+[ -n "$(which tidy)" ] || {
+ echo >&2 "You need 'tidy' to use this program!"
+ exit 1
+}
+
+if [ -z "$1" ] || [ -f $1 ]; then
+ tidy -utf8 $1 2>/dev/null | pandoc -r html -w markdown -s
+else
+ # Treat given argument as an URL. Locate a
+ # sensible text based browser (note the order).
+ for p in wget lynx w3m curl links w3c; do
+ if which $p >/dev/null; then
+ DUMPER=$p
+ break
+ fi
+ done
+ # Setup proper options.
+ case "$DUMPER" in
+ wget) OPT="-O-" ;;
+ lynx) OPT="-source" ;;
+ w3m) OPT="-dump_source" ;;
+ curl) OPT="" ;;
+ links) OPT="-source" ;;
+ w3c) OPT="-n -get" ;;
+ "") echo -n >&2 "Needs a program to fetch the URL "
+ echo -n >&2 "(e.g. wget, w3m, lynx, w3m or curl)."
+ exit 1 ;;
+ esac
+ # Fetch and feed to pandoc.
+ $DUMPER $OPT $1 2>/dev/null | tidy -utf8 2>/dev/null | pandoc -r html -w markdown -s
+fi