#!/bin/sh

# Consume any leading "-path" flag so that the next positional argument is the
# master archive path itself. Any other dash-prefixed argument is rejected with
# a message on stderr and exit status 1; the first non-option argument stops
# the scan and is left in place as $1.
until [ $# -eq 0 ]; do
  case "$1" in
    -path)
      shift
      ;;
    -*)
      echo "$0: Unrecognized option $1" >&2
      exit 1
      ;;
    *)
      break
      ;;
  esac
done

# Determine MASTER, the master archive area.
#   - If a positional argument remains, it is taken as the directory and
#     converted to an absolute path; the argument is then consumed.
#   - Otherwise EDIRECT_PUBMED_MASTER is used, minus one trailing slash.
#   - With neither available the script stops with exit status 1.
if [ "$#" -gt 0 ]
then
  target="$1"
  # CDPATH is cleared for the cd so that it cannot search other directories or
  # echo the destination into the captured output. A failed cd aborts here
  # rather than leaving MASTER empty for the rest of the script.
  if ! MASTER=$(CDPATH= cd -- "$target" && pwd)
  then
    echo "$0: Unable to access master archive path $target" >&2
    exit 1
  fi
  # CONFIG mirrors MASTER; it is only assigned on this branch.
  CONFIG=${MASTER}
  shift
else
  if [ -z "${EDIRECT_PUBMED_MASTER}" ]
  then
    # Diagnostics go to stderr, matching the option-error messages.
    echo "Must supply path to master archive area or set EDIRECT_PUBMED_MASTER environment variable" >&2
    exit 1
  else
    MASTER="${EDIRECT_PUBMED_MASTER}"
    MASTER=${MASTER%/}
  fi
fi

# Consume any "-temp", "-work" or "-working" flag that introduces the working
# directory argument. Any other dash-prefixed argument is rejected with a
# message on stderr and exit status 1; the first non-option argument stops the
# scan and is left in place as $1.
until [ $# -eq 0 ]; do
  case "$1" in
    -temp | -work | -working)
      shift
      ;;
    -*)
      echo "$0: Unrecognized option $1" >&2
      exit 1
      ;;
    *)
      break
      ;;
  esac
done

# Determine WORKING, the area that holds intermediate files.
#   - If a positional argument remains, it is taken as the directory and
#     converted to an absolute path; the argument is then consumed.
#   - Otherwise EDIRECT_PUBMED_WORKING is used, minus one trailing slash.
#   - If that is empty too, WORKING falls back to MASTER.
if [ "$#" -gt 0 ]
then
  working="$1"
  # CDPATH is cleared for the cd so that it cannot search other directories or
  # echo the destination into the captured output. A failed cd aborts here
  # rather than leaving WORKING empty for the rest of the script.
  if ! WORKING=$(CDPATH= cd -- "$working" && pwd)
  then
    echo "$0: Unable to access working path $working" >&2
    exit 1
  fi
  shift
else
  if [ -z "${EDIRECT_PUBMED_WORKING}" ]
  then
    WORKING=${MASTER}
  else
    WORKING="${EDIRECT_PUBMED_WORKING}"
    WORKING=${WORKING%/}
  fi
fi

# Report the resolved locations and create the directory layout used below.
# An empty MASTER or WORKING would turn "$MASTER/Archive" into "/Archive" and
# so on, creating directories at the filesystem root, so that case is refused.
if [ -z "$MASTER" ] || [ -z "$WORKING" ]
then
  echo "$0: MASTER and WORKING paths must not be empty" >&2
  exit 1
fi

echo "MASTER $MASTER"

echo "WORKING $WORKING"

# Directories under the master archive area.
for dir in Archive Postings
do
  mkdir -p "$MASTER/$dir" || exit 1
done

# Directories under the working area; later steps cd into several of them,
# so a failure to create one is fatal.
for dir in Current Data Indexed Inverted Merged Pubmed
do
  mkdir -p "$WORKING/$dir" || exit 1
done

# Print the overall start time.
date

# Step 1: run "download-ncbi-data bioconcepts" from inside $WORKING/Data and
# record the elapsed wall-clock seconds in DWN.
seconds_start=$(date "+%s")
echo "Downloading BioC Table"
# Stop if the data directory is unreachable; otherwise the download would run
# in whatever directory the script was started from.
cd "$WORKING/Data" || exit 1
download-ncbi-data bioconcepts
seconds_end=$(date "+%s")
seconds=$((seconds_end - seconds_start))
echo "$seconds seconds"
DWN=$seconds

# Step 2: remove previous .e2x / .e2x.gz files from $WORKING/Indexed, then
# stream the downloaded table through two xtract passes that emit files with
# prefix "biocon" and suffix "e2x", split every 250000 IdxDocument records.
# Elapsed wall-clock seconds are recorded in IDX.
seconds_start=$(date "+%s")
echo "Indexing BioC Table"
# The pipeline below names no output directory (presumably xtract writes the
# split files into the current directory), so a failed cd must stop the run.
cd "$WORKING/Indexed" || exit 1
target="$WORKING/Indexed"
# One traversal removes both the plain and the gzip-compressed leftovers.
find "$target" \( -name "*.e2x" -o -name "*.e2x.gz" \) -delete
gunzip -c "$WORKING/Data/bioconcepts2pubtatorcentral.gz" |
xtract -bioconcepts |
xtract -head "<IdxDocumentSet>" -tail "</IdxDocumentSet>" \
  -pattern IdxDocument -split 250000 -prefix "biocon" -suffix "e2x"
seconds_end=$(date "+%s")
seconds=$((seconds_end - seconds_start))
echo "$seconds seconds"
IDX=$seconds

# Step 3: remove previous .inv / .inv.gz files from $WORKING/Inverted, then
# run "rchive -invert" on every .e2x file in $WORKING/Indexed, writing
# <base>.inv into $WORKING/Inverted. Elapsed wall-clock seconds are recorded
# in INV.
seconds_start=$(date "+%s")
echo "Inverting BioC Indices"
# The glob below is relative to the current directory, so a failed cd must
# stop the run rather than pick up files from somewhere else.
cd "$WORKING/Indexed" || exit 1
target="$WORKING/Inverted"
# One traversal removes both the plain and the gzip-compressed leftovers.
find "$target" \( -name "*.inv" -o -name "*.inv.gz" \) -delete
for fl in *.e2x
do
  # When nothing matches, the shell leaves the literal pattern "*.e2x" in $fl;
  # skip it instead of creating a bogus "*.inv" output file.
  [ -f "$fl" ] || continue
  base=${fl%.e2x}
  echo "$base.inv"
  rchive -invert < "$fl" > "$target/$base.inv"
  # Pause between files (kept from the original script).
  sleep 1
done
seconds_end=$(date "+%s")
seconds=$((seconds_end - seconds_start))
echo "$seconds seconds"
INV=$seconds

# Step 4: remove previous .mrg / .mrg.gz files from $WORKING/Merged, then run
# "rchive -merge" with $WORKING/Merged as the target and every .inv file in
# $WORKING/Inverted as input. Elapsed wall-clock seconds are recorded in MRG.
seconds_start=$(date "+%s")
echo "Merging BioC Indices"
# The *.inv glob below is relative to the current directory, so a failed cd
# must stop the run.
cd "$WORKING/Inverted" || exit 1
target="$WORKING/Merged"
# One traversal removes both the plain and the gzip-compressed leftovers.
find "$target" \( -name "*.mrg" -o -name "*.mrg.gz" \) -delete
# Normalise the kernel name so Cygwin and MinGW both read "CYGWIN_NT".
osname=$(uname -s | sed -e 's/_NT-.*$/_NT/; s/^MINGW[0-9]*/CYGWIN/')
# On those platforms, hand rchive a Windows-style path when cygpath exists.
if [ "$osname" = "CYGWIN_NT" ] && [ -x /bin/cygpath ]
then
  target=$(cygpath -w "$target")
fi
target=${target%/}
rchive -merge "$target" *.inv
seconds_end=$(date "+%s")
seconds=$((seconds_end - seconds_start))
echo "$seconds seconds"
MRG=$seconds

# Step 5: for each field code, pass the sorted .mrg files found under
# $WORKING/Merged to "rchive -promote" in batches of at most 100 names.
# Elapsed wall-clock seconds are recorded in PST.
seconds_start=$(date "+%s")
echo "Producing BioC Postings"
# find searches "." below, so a failed cd must stop the run.
cd "$WORKING/Merged" || exit 1
# NOTE(review): the directory-creation step earlier in this script makes
# "Postings" under $MASTER, not $WORKING. The two only coincide when WORKING
# defaults to MASTER - confirm which location is intended.
target="$WORKING/Postings"
# Normalise the kernel name so Cygwin and MinGW both read "CYGWIN_NT".
osname=$(uname -s | sed -e 's/_NT-.*$/_NT/; s/^MINGW[0-9]*/CYGWIN/')
# On those platforms, hand rchive a Windows-style path when cygpath exists.
if [ "$osname" = "CYGWIN_NT" ] && [ -x /bin/cygpath ]
then
  target=$(cygpath -w "$target")
fi
target=${target%/}
for fld in CEBI CHEM DISZ GENE OMIM
do
  echo "$fld"
  find "." -name "*.mrg" |
  sort |
  xargs -n 100 echo |
  while read -r files
  do
    # $files is deliberately unquoted: each line carries up to 100
    # space-separated file names that must reach rchive as separate arguments.
    rchive -promote "$target" "$fld" $files
  done
done
seconds_end=$(date "+%s")
seconds=$((seconds_end - seconds_start))
echo "$seconds seconds"
PST=$seconds

# Summarise the elapsed seconds of each phase (download, index, invert, merge,
# postings), followed by a blank line and the finishing time.
cat <<EOF
DWN $DWN seconds
IDX $IDX seconds
INV $INV seconds
MRG $MRG seconds
PST $PST seconds

EOF

date
