#!/bin/sh -e
# Converts HTML to markdown.
# Uses an available program to fetch a URL, and tidy to normalize the HTML
# first.
#
# Usage: [-e encoding] [-g grabber_command] [-n] [-h] [input_file|url]
#
# NOTE(review): the '### <name>.sh' lines look like build-time include
# markers; they are kept verbatim, each on a line of its own, at column 0.
# The helpers err, errn, pathfind, usage, runpandoc and from_utf8, and the
# variables THIS and THIS_TEMPDIR, are not defined anywhere in this file and
# presumably come from those includes -- confirm against the build.

REQUIRED=tidy

### common.sh

#######################################
# Fetch a URL with a web grabber program.
# Arguments:
#   $1    - URL to fetch (required; the shell aborts with an error if it is
#           missing or empty).
#   $2... - optional grabber command line (program name followed by its
#           options). When empty, the first program found among wget, lynx,
#           w3m, curl, links and w3c is used.
# Globals:
#   THIS (read). POSIX sh has no 'local', so url, cmdline, prog, prog_opts,
#   base_opts and p are assigned as globals, and the positional parameters
#   of the function are overwritten.
# Outputs:
#   Whatever the grabber writes; progress and errors are reported via
#   err/errn.
# Returns:
#   1 when no usable grabber is found, otherwise the grabber's exit status.
#######################################
grab_url_with () {
    url="${1:?internal error: grab_url_with: url required}"

    shift
    cmdline="$*"

    prog=
    prog_opts=
    if [ -n "$cmdline" ]; then
        # NOTE(review): eval re-parses the user-supplied grabber command so
        # that quoted options survive; it also executes any shell syntax
        # embedded in it. Only pass trusted values for -g.
        eval "set -- $cmdline"
        prog=$1
        shift
        prog_opts="$*"
    fi

    if [ -z "$prog" ]; then
        # Locate a sensible web grabber (note the order).
        for p in wget lynx w3m curl links w3c; do
            if pathfind "$p"; then
                prog=$p
                break
            fi
        done

        [ -n "$prog" ] || {
            errn "$THIS: Couldn't find a program to fetch the file from URL "
            err "(e.g. wget, w3m, lynx, w3c, or curl)."
            return 1
        }
    else
        pathfind "$prog" || {
            err "$THIS: No such web grabber '$prog' found; aborting."
            return 1
        }
    fi

    # Setup proper base options for known grabbers.
    base_opts=
    case "$prog" in
    wget)  base_opts="-O-" ;;
    lynx)  base_opts="-source" ;;
    w3m)   base_opts="-dump_source" ;;
    curl)  base_opts="" ;;
    links) base_opts="-source" ;;
    w3c)   base_opts="-n -get" ;;
    *)     err "$THIS: unhandled web grabber '$prog'; hope it succeeds." ;;
    esac

    err "$THIS: invoking '$prog $base_opts $prog_opts $url'..."
    # Re-split the option strings into separate words before the call.
    eval "set -- $base_opts $prog_opts"
    "$prog" "$@" "$url"
}

encoding=
grabber=
nograb=
while getopts e:g:nh opt; do
    case $opt in
    e) encoding="$OPTARG" ;;
    g) grabber="$OPTARG" ;;
    n) nograb=1 ;;
    h|\?)
        usage "[-e encoding] [-g grabber_command] [-n] [-h] [input_file|url]"
        exit 2 ;;
    esac
done

shift $((OPTIND - 1))

### postopts.sh

### singlearg.sh

inurl=
if [ -n "$1" ] && ! [ -f "$1" ]; then
    if [ -n "$nograb" ]; then
        err "'$1' not found; refusing to treat input as URL."
        exit 1
    fi
    # Treat given argument as an URL.
    inurl="$1"
fi

if [ -n "$inurl" ]; then
    err "Attempting to fetch file from '$inurl'..."

### tempdir.sh

    grabber_out=$THIS_TEMPDIR/grabber.out
    grabber_log=$THIS_TEMPDIR/grabber.log
    if ! grab_url_with "$inurl" "$grabber" 1>"$grabber_out" \
            2>"$grabber_log"; then
        errn "grab_url_with failed"
        if [ -f "$grabber_log" ]; then
            err " with the following error log."
            err
            cat "$grabber_log" >&2
        else
            err .
        fi
        exit 1
    fi

    # From here on the fetched copy is the input file.
    set -- "$grabber_out"
fi

if [ -z "$encoding" ] && [ "x$*" != "x" ]; then
    # Try to determine character encoding unless not specified
    # and input is STDIN.
    #
    # NOTE(review): the sed program below was stripped from the file (only
    # "sed -ne '/" survived). It is reconstructed to print the charset named
    # in the first <meta ... charset=...> tag among the lines head(1) emits;
    # the input is already lowercased by tr. Confirm against the pristine
    # source.
    encoding=$(
        head "$@" |
        LC_ALL=C tr 'A-Z' 'a-z' |
        sed -ne '/<meta .*charset=/ {
            s/.*charset=["'\'']*\([-a-z0-9_]*\).*/\1/p
            q
        }'
    )
fi

# NOTE(review): the start of this pipeline was stripped as well; only
# "/dev/null | runpandoc -r html -w markdown -s | from_utf8" survived.
# The UTF-8 conversion and the "tidy -utf8 2>" stage are reconstructed from
# the header comment, REQUIRED=tidy and the -e option. Confirm against the
# pristine source.
if [ -n "$encoding" ] && pathfind iconv; then
    iconv -f "$encoding" -t utf-8 "$@"
else
    # Assume the input already is UTF-8.
    cat "$@"
fi | tidy -utf8 2>/dev/null | runpandoc -r html -w markdown -s | from_utf8