#!/bin/sh

# Builds the extra "theme" indices (THME and CONV fields) for a local archive.
#
# Usage:
#   <script> [-path] [MASTER_DIR] [-temp | -work | -working] [WORKING_DIR]
#
# MASTER_DIR  falls back to $EDIRECT_PUBMED_MASTER when not given.
# WORKING_DIR falls back to $EDIRECT_PUBMED_WORKING, then to MASTER_DIR.
#
# Steps, each timed and summarized at the end:
#   DWN  download GeneRIFs, MeSH tree and theme data into WORKING/Data
#   CLR  remove previous .e2x / .inv / .mrg files
#   IDX  index themes into WORKING/Indexed
#   INV  invert each index file into WORKING/Inverted
#   MRG  merge inverted files into WORKING/Merged
#   PST  promote merged files into postings
#
# Requires download-ncbi-data, rchive, curl, gunzip and gzip on PATH.

# Deletes, anywhere below directory $1, every file whose name matches one of
# the remaining arguments (find -name patterns).
purge() {
  pg_dir=$1
  shift
  for pg_pat in "$@"; do
    find "$pg_dir" -name "$pg_pat" -delete
  done
}

# Prints directory $1 without a trailing slash; under Cygwin/MinGW (when
# /bin/cygpath is executable) the path is first converted with cygpath -w.
native_path() {
  np_path=$1
  np_os=$(uname -s | sed -e 's/_NT-.*$/_NT/; s/^MINGW[0-9]*/CYGWIN/')
  if [ "$np_os" = "CYGWIN_NT" ] && [ -x /bin/cygpath ]; then
    np_path=$(cygpath -w "$np_path")
  fi
  printf '%s\n' "${np_path%/}"
}

# Ensures file $1 exists in the current directory, downloading and unpacking
# "$1.gz" when needed. $2 is the progress label printed before a download.
# Returns non-zero when the download fails; the partial .gz is removed so a
# later run retries instead of trusting a broken file.
fetch_theme_file() {
  ft_name=$1
  ft_label=$2
  if [ ! -f "$ft_name" ]; then
    if [ ! -f "$ft_name.gz" ]; then
      echo "$ft_label"
      # -f: treat HTTP errors as failures rather than saving the error page.
      # -L: follow redirects in case the host has relocated the record.
      if ! curl -f -s -L \
        "https://zenodo.org/record/3459420/files/$ft_name.gz?download=1" \
        > "$ft_name.gz"; then
        echo "$0: download of $ft_name.gz failed" >&2
        rm -f "$ft_name.gz"
        return 1
      fi
    fi
    gunzip -q "$ft_name.gz"
  fi
}

# --- Resolve the master archive path --------------------------------------

# Skip an optional -path flag; the path itself is the next argument.
while [ $# -gt 0 ]; do
  case "$1" in
    -path )
      shift
      ;;
    -* )
      echo "$0: Unrecognized option $1" >&2
      exit 1
      ;;
    * )
      break
      ;;
  esac
done

if [ "$#" -gt 0 ]; then
  target="$1"
  # Abort on a bad path: an empty MASTER would make the mkdir calls below
  # create directories directly under the filesystem root.
  if ! MASTER=$(cd "$target" && pwd); then
    echo "$0: Unable to access master path $target" >&2
    exit 1
  fi
  shift
else
  if [ -z "${EDIRECT_PUBMED_MASTER}" ]; then
    echo "Must supply path to master archive area or set EDIRECT_PUBMED_MASTER environment variable" >&2
    exit 1
  else
    MASTER="${EDIRECT_PUBMED_MASTER}"
    MASTER=${MASTER%/}
  fi
fi

# --- Resolve the working path ----------------------------------------------

# Skip an optional -temp / -work / -working flag.
while [ $# -gt 0 ]; do
  case "$1" in
    -temp | -work | -working )
      shift
      ;;
    -* )
      echo "$0: Unrecognized option $1" >&2
      exit 1
      ;;
    * )
      break
      ;;
  esac
done

if [ "$#" -gt 0 ]; then
  working="$1"
  if ! WORKING=$(cd "$working" && pwd); then
    echo "$0: Unable to access working path $working" >&2
    exit 1
  fi
  shift
else
  if [ -z "${EDIRECT_PUBMED_WORKING}" ]; then
    WORKING=${MASTER}
  else
    WORKING="${EDIRECT_PUBMED_WORKING}"
    WORKING=${WORKING%/}
  fi
fi

echo "MASTER $MASTER"

echo "WORKING $WORKING"

for dir in Archive Postings; do
  mkdir -p "$MASTER/$dir"
done

for dir in Current Data Indexed Inverted Merged Pubmed; do
  mkdir -p "$WORKING/$dir"
done

date

# --- DWN: download source data ----------------------------------------------

seconds_start=$(date "+%s")
cd "$WORKING/Data" || exit 1

echo "Downloading GeneRIFs"
download-ncbi-data generifs

echo "Downloading MeSH Tree"
download-ncbi-data meshtree

echo "Downloading Theme Data"
for bs in \
  "chemical-disease" \
  "chemical-gene" \
  "gene-disease" \
  "gene-gene"
do
  # A failed download is reported by fetch_theme_file; keep going so the
  # remaining files are still fetched.
  fetch_theme_file "part-i-$bs-path-theme-distributions.txt" "part-i-$bs"
  fetch_theme_file "part-ii-dependency-paths-$bs-sorted-with-themes.txt" "part-ii-$bs"
done

seconds_end=$(date "+%s")
seconds=$((seconds_end - seconds_start))
echo "$seconds seconds"
DWN=$seconds

# --- CLR: remove previous indices -------------------------------------------

seconds_start=$(date "+%s")
echo "Removing Previous Indices"

purge "$WORKING/Indexed" "*.e2x" "*.e2x.gz"
purge "$WORKING/Inverted" "*.inv" "*.inv.gz"
purge "$WORKING/Merged" "*.mrg" "*.mrg.gz"

seconds_end=$(date "+%s")
seconds=$((seconds_end - seconds_start))
echo "$seconds seconds"
CLR=$seconds

# --- IDX: index themes ------------------------------------------------------

seconds_start=$(date "+%s")
cd "$WORKING/Indexed" || exit 1

target="$WORKING/Indexed"
purge "$target" "*.e2x" "*.e2x.gz"

echo "Indexing Themes"
for bs in \
  "chemical-disease" \
  "chemical-gene" \
  "gene-disease" \
  "gene-gene"
do
  one="$WORKING/Data/part-i-$bs-path-theme-distributions.txt"
  two="$WORKING/Data/part-ii-dependency-paths-$bs-sorted-with-themes.txt"
  echo "$bs" |
  tr '[:lower:]' '[:upper:]' |
  while IFS=- read -r fs sc; do
    # Four-letter code from the first two letters of each half of the pair
    # name, e.g. CHEMICAL-DISEASE -> CHDI.
    thr=$(printf '%.2s%.2s' "$fs" "$sc")
    rchive -themes "$one" "$two" "$thr"
  done
done > "$target/themes.tbl"

for fld in THME CONV; do
  tag=$(echo "$fld" | tr '[:upper:]' '[:lower:]')
  grep -w "$fld" "$target/themes.tbl" |
  rchive -gzip -thesis 5000000 "$target" "$tag"
done

rm "$target/themes.tbl"

seconds_end=$(date "+%s")
seconds=$((seconds_end - seconds_start))
echo "$seconds seconds"
IDX=$seconds

# --- INV: invert indices ----------------------------------------------------

seconds_start=$(date "+%s")
cd "$WORKING/Indexed" || exit 1

echo "Inverting Extra Indices"
target="$WORKING/Inverted"
purge "$target" "*.inv" "*.inv.gz"

for fl in *.e2x.gz; do
  # With no matches the glob stays literal; skip it instead of producing a
  # bogus "*.inv.gz" output file.
  [ -f "$fl" ] || continue
  base=${fl%.e2x.gz}
  echo "$base.inv"
  gunzip -c "$fl" |
  rchive -invert |
  gzip -1 > "$target/$base.inv.gz"
  sleep 1
done

seconds_end=$(date "+%s")
seconds=$((seconds_end - seconds_start))
echo "$seconds seconds"
INV=$seconds

# --- MRG: merge inverted indices --------------------------------------------

seconds_start=$(date "+%s")
cd "$WORKING/Inverted" || exit 1

echo "Merging Extra Indices"
target="$WORKING/Merged"
purge "$target" "*.mrg" "*.mrg.gz"
target=$(native_path "$target")

# All command-line arguments have been consumed by now, so the positional
# parameters can be reused to hold the list of inverted files.
set -- *.inv.gz
if [ -f "$1" ]; then
  rchive -gzip -merge "$target" "$@"
else
  echo "$0: No inverted index files found to merge" >&2
fi

seconds_end=$(date "+%s")
seconds=$((seconds_end - seconds_start))
echo "$seconds seconds"
MRG=$seconds

# --- PST: produce postings --------------------------------------------------

seconds_start=$(date "+%s")
cd "$WORKING/Merged" || exit 1

echo "Producing Extra Postings"
# NOTE(review): postings are written under WORKING, but only MASTER/Postings
# is created above. The two coincide when WORKING defaults to MASTER; confirm
# which location is intended when they differ.
target=$(native_path "$WORKING/Postings")

for fld in THME CONV; do
  echo "$fld"
  # Hand the merged files to rchive in batches of at most 100 per call.
  find "." -name "*.mrg.gz" |
  sort |
  xargs -n 100 echo |
  while read -r files; do
    # shellcheck disable=SC2086 # $files is a space-separated batch; split it
    rchive -promote "$target" "$fld" $files
  done
done

seconds_end=$(date "+%s")
seconds=$((seconds_end - seconds_start))
echo "$seconds seconds"
PST=$seconds

# --- POP: populate link archive (currently disabled) ------------------------

seconds_start=$(date "+%s")
cd "$WORKING/Indexed" || exit 1

# echo "Populating Link Archive"
# rchive -distribute "$MASTER/Archive" *.e2x.gz

seconds_end=$(date "+%s")
seconds=$((seconds_end - seconds_start))
# echo "$seconds seconds"
POP=$seconds

# --- Summary ----------------------------------------------------------------

echo "DWN $DWN seconds"
echo "CLR $CLR seconds"
echo "IDX $IDX seconds"
echo "INV $INV seconds"
echo "MRG $MRG seconds"
echo "PST $PST seconds"
# echo "POP $POP seconds"

echo ""

date