Difference between revisions of "QLever/script"

From BITPlan Wiki
Jump to navigation Jump to search
 
(12 intermediate revisions by the same user not shown)
Line 1: Line 1:
 +
__TOC__
 +
The qleverauto script below is for automation only now.
 +
See https://github.com/ad-freiburg/qlever-control for the "official" script for indexing and starting.
 +
 
This is a script for getting started with {{Link|target=QLever}} along the lines of the [https://github.com/ad-freiburg/qlever/blob/master/docs/quickstart.md Quickstart] description
 
This is a script for getting started with {{Link|target=QLever}} along the lines of the [https://github.com/ad-freiburg/qlever/blob/master/docs/quickstart.md Quickstart] description
  
Line 10: Line 14:
 
|state=open
 
|state=open
 
}}
 
}}
 +
 +
This script has been renamed to "qleverauto" from "qlever" on 2022-05-23 since "qlever" is now the name of the official qlever-control script provided with https://github.com/ad-freiburg/qlever-control
 
= usage =
 
= usage =
 
<source lang='bash'>
 
<source lang='bash'>
usage: ./qlever [-h|--help]
+
/qleverauto -h
 +
usage: ./qleverauto [-h|--help|...]
 
   -h|--help: show this usage
 
   -h|--help: show this usage
   -aw|--all_wikidata: run all steps for wikidata
+
   -aw|--all_wikidata: run all steps for wikidata version,env,pull,download and index
 
   -b|--build: build qlever docker image
 
   -b|--build: build qlever docker image
 +
  -p|--pull: pull qlever docker image
 +
  --port <port> port to server endpoint from, default: 7001
 +
  -s|--server: start SPARQL server
 
   -c|--clone: clone qlever
 
   -c|--clone: clone qlever
   -e|--env: show environment
+
   -e|--env: show, check and modify environment
 +
  -v|--version: show version of this script
 
   -wd|--wikidata_download: download wikidata data dump
 
   -wd|--wikidata_download: download wikidata data dump
 +
  -wi|--wikidata_index: build the index for the  wikidata data dump
  
 
This helper script simplifies the access to the steps outlined in https://github.com/ad-freiburg/qlever/blob/master/docs/quickstart.md
 
This helper script simplifies the access to the steps outlined in https://github.com/ad-freiburg/qlever/blob/master/docs/quickstart.md
 +
 
</source>
 
</source>
  
= qlever =
+
= qleverauto =
 
<source lang='bash'>
 
<source lang='bash'>
 
#!/bin/bash
 
#!/bin/bash
 
#
 
#
# a script for getting started with QLever
+
# a script for getting started with QLever and automatic tasks for
 +
# it
 
#
 
#
 
# see https://github.com/ad-freiburg/qlever/blob/master/docs/quickstart.md
 
# see https://github.com/ad-freiburg/qlever/blob/master/docs/quickstart.md
 
# see https://wiki.bitplan.com/index.php/QLever
 
# see https://wiki.bitplan.com/index.php/QLever
 +
# see https://github.com/ad-freiburg/qlever-control for the
 +
#  official qlever control script
 
#
 
#
 
#
 
#
Line 38: Line 54:
 
# we assume the script is started from the QLEVER_HOME directory
 
# we assume the script is started from the QLEVER_HOME directory
 
export QLEVER_HOME=$(pwd)
 
export QLEVER_HOME=$(pwd)
entrypoint="qlever"
+
dockerimage="qlever"
 +
port=7001
 +
version="$Revision: 1.29 $"
 +
versionDate="$Date: 2022/05/23 06:15:28 $"
  
 
startTime=0
 
startTime=0
Line 80: Line 99:
 
#
 
#
 
usage() {
 
usage() {
   echo "usage: $0 [-h|--help]"
+
   echo "usage: $0 [-h|--help|...]"
 
   echo "  -h|--help: show this usage"
 
   echo "  -h|--help: show this usage"
   echo "  -aw|--all_wikidata: run all steps for wikidata env,pull,download and index"
+
   echo "  -aw|--all_wikidata: run all steps for wikidata version,env,pull,download and index"
 
   echo "  -b|--build: build qlever docker image"
 
   echo "  -b|--build: build qlever docker image"
 
   echo "  -p|--pull: pull qlever docker image"
 
   echo "  -p|--pull: pull qlever docker image"
 +
  echo "  --port <port> port to server endpoint from, default: $port"
 +
  echo "  -s|--server: start SPARQL server"
 
   echo "  -c|--clone: clone qlever"
 
   echo "  -c|--clone: clone qlever"
   echo "  -e|--env: show environment"
+
   echo "  -e|--env: show, check and modify environment"
 +
  echo "  -v|--version: show version of this script"
 
   echo "  -wd|--wikidata_download: download wikidata data dump"
 
   echo "  -wd|--wikidata_download: download wikidata data dump"
   echo "  -wi|--wikidata_index: download wikidata data dump"
+
   echo "  -wi|--wikidata_index: build the index for the  wikidata data dump"
 
   echo ""
 
   echo ""
 
   echo "This helper script simplifies the access to the steps outlined in https://github.com/ad-freiburg/qlever/blob/master/docs/quickstart.md"
 
   echo "This helper script simplifies the access to the steps outlined in https://github.com/ad-freiburg/qlever/blob/master/docs/quickstart.md"
Line 112: Line 134:
 
   esac
 
   esac
 
   color_msg $blue "$l_action $l_state at $now$after"
 
   color_msg $blue "$l_action $l_state at $now$after"
 +
}
 +
 +
#
 +
# show the version of this script
 +
#
 +
show_version() {
 +
  local l_script=$(basename $0)
 +
  color_msg $blue "$l_script version $version $versionDate"
 +
}
 +
 +
#
 +
# check whether program is installed
 +
#
 +
#  #1: l_prog - the program to check
 +
#
 +
check_installed() {
 +
  local l_prog="$1"
 +
  local l_installed="✅"
 +
  local l_color=$green
 +
  local l_progbin=$(which $l_prog)
 +
  which $l_prog > /dev/null
 +
  if [ $? -ne 0 ]
 +
  then
 +
    l_installed="❌"
 +
    l_color=$red
 +
  fi
 +
  color_msg $l_color "$l_prog → $l_progbin $l_installed"
 
}
 
}
  
Line 118: Line 167:
 
#
 
#
 
show_env() {
 
show_env() {
 +
  local l_progs="docker top df jq"
 +
  case $(uname -a) in
 +
    Darwin*)
 +
      l_progs="$l_progs sw_vers"
 +
      ;;
 +
    *)
 +
      l_progs="$l_progs lsb_release free"
 +
    ;;
 +
  esac
 +
  color_msg $blue "needed software"
 +
  for l_prog in $l_progs
 +
  do
 +
    check_installed $l_prog
 +
  done
 
   color_msg $blue "operating system"
 
   color_msg $blue "operating system"
 
   local l_disks="/dev/s"
 
   local l_disks="/dev/s"
Line 142: Line 205:
 
   color_msg $blue "soft ulimit for files"
 
   color_msg $blue "soft ulimit for files"
 
   ulimit -Sn
 
   ulimit -Sn
 +
}
 +
 +
#
 +
# check whether the given process runs and kill it if yes
 +
# param #1: l_process  - the process to check
 +
#
 +
killIfRunning() {
 +
  local l_process="$1"
 +
  pgrep -fl "$l_process"
 +
  local l_presult=$?
 +
  if [ $l_presult -eq 0 ]
 +
  then
 +
    color_msg "$l_process already running"
 +
    # comment out as you like
 +
    # either stop here
 +
    #echo "to kill the process you might want to use"
 +
    #echo "pkill -f $l_process"
 +
    #exit 1
 +
    # or fully automatic kill
 +
    color_msg "killing $l_process"
 +
    pkill -f "$l_process"
 +
  fi
 +
}
 +
#
 +
# kill a running qlever process
 +
#
 +
qlever_kill() {
 +
  killIfRunning qlever
 
}
 
}
  
Line 148: Line 239:
 
#
 
#
 
qlever_pull() {
 
qlever_pull() {
 +
  dockerimage="adfreiburg/qlever"
 
   show_timing "pulling qlever docker image" "started"
 
   show_timing "pulling qlever docker image" "started"
   docker pull adfreiburg/qlever
+
   docker pull $dockerimage
 
   show_timing "pulling qlever docker image" "finished"
 
   show_timing "pulling qlever docker image" "finished"
   entrypoint="adfreiburg/qlever"
+
}
 +
 
 +
#
 +
# start the SPARQL server
 +
#
 +
qlever_start() {
 +
   local l_port="$1"
 +
  docker run --rm -v $QLEVER_HOME/qlever-indices/wikidata:/index \
 +
      -p $l_port:7001 -e INDEX_PREFIX=wikidata --name qlever.wikidata $dockerimage
 
}
 
}
  
Line 194: Line 294:
 
#  1: title of the download
 
#  1: title of the download
 
#  2: expected time
 
#  2: expected time
#  3: url to download from
+
#  3: target directory
 +
#  4: file expected
 +
#  5: url to download from
 
#
 
#
 
download() {
 
download() {
 
   local l_title="$1"
 
   local l_title="$1"
 
   local l_expected="$2"
 
   local l_expected="$2"
   local l_url="$3"
+
  local l_target="$3"
   color_msg $blue "downloading $l_title ... please wait typically $l_expected ..."
+
  local l_file="$4"
  show_timing "$l_title download" "started"
+
   local l_url="$5"
  wget $l_url
+
  # check if file already exists
  show_timing "$l_title download" "finished"
+
  cd $l_target
 +
  if [ -f $l_file ]
 +
  then
 +
    color_msg $green "$l_title:$l_file already downloaded"
 +
   else
 +
    color_msg $blue "downloading $l_title:$l_file ... please wait typically $l_expected ..."
 +
    show_timing "$l_title download" "started"
 +
    wget $l_url
 +
    show_timing "$l_title download" "finished"
 +
  fi
 
}
 
}
  
 
#
 
#
# wikidata download
+
# wikidata config copy
 
#
 
#
wikidata_download() {
+
wikidata_copyconfig() {
 
   cd $QLEVER_HOME
 
   cd $QLEVER_HOME
 
   target=qlever-indices/wikidata
 
   target=qlever-indices/wikidata
 
   if [ ! -d $target ]
 
   if [ ! -d $target ]
 
   then
 
   then
 +
    color_msg $blue "creating $target"
 
     mkdir -p $target
 
     mkdir -p $target
    cd $target
 
    config=$QLEVER_HOME/qlever-code/examples/wikidata.settings.json
 
    color_msg $blue "copying config file $config"
 
    cp -p $config .
 
    download "wikidata lexemes" "3min" https://dumps.wikimedia.org/wikidatawiki/entities/latest-lexemes.ttl.bz2
 
    download "wikidata dump" "6hours" https://dumps.wikimedia.org/wikidatawiki/entities/latest-all.ttl.bz2;date
 
 
   else
 
   else
     color_msg $green "wikidata dump already downloaded"
+
     color_msg $green "$target already exists"
 +
  fi
 +
  cd $target
 +
  config=wikidata.settings.json
 +
  configpath=$QLEVER_HOME/qlever-code/examples/$config
 +
  if [ ! -f $config ]
 +
  then
 +
    color_msg $blue "copying config file $configpath to $target"
 +
    cp -p $configpath .
 +
  else
 +
    color_msg $green "$config already copied to $target"
 
   fi
 
   fi
 +
}
 +
 +
#
 +
# wikidata download
 +
#
 +
wikidata_download() {
 +
  local l_base=https://dumps.wikimedia.org/wikidatawiki/entities/
 +
  local l_dump=latest-all.ttl.bz2
 +
  local l_lexemes=latest-lexemes.ttl.bz2
 +
  #wikidata_copyconfig
 +
  target=$QLEVER_HOME/wikidata
 +
  download "wikidata lexemes" "3min" $target $l_lexemes $l_base/$l_lexemes
 +
  download "wikidata dump" "6hours" $target $l_dump $l_base/$l_dump
 
}
 
}
  
Line 230: Line 359:
 
#
 
#
 
wikidata_index() {
 
wikidata_index() {
   cd $QLEVER_HOME
+
   cd $QLEVER_HOME/wikidata
 
   chmod o+w .
 
   chmod o+w .
 
   show_timing "creating wikidata index" "started"
 
   show_timing "creating wikidata index" "started"
  docker run -i --rm -v $QLEVER_HOME/qlever-indices/wikidata:/index --entrypoint bash $entrypoint -c "cd /index && bzcat latest-all.ttl.bz2 latest-lexemes.ttl.bz2 | IndexBuilderMain -F ttl -f - -l -i wikidata -s wikidata.settings.json | tee wikidata.index-log.txt"
+
docker run -i --rm -v $QLEVER_HOME/qlever-indices/wikidata:/index --entrypoint bash $dockerimage -c "cd /index && bzcat latest-all.ttl.bz2 latest-lexemes.ttl.bz2 | IndexBuilderMain -F ttl -f - -l -i wikidata -s wikidata.settings.json | tee wikidata.index-log.txt"
 +
  . ../qlever-control/qlever
 +
  check_installed IndexBuilderMain
 +
  qlever index
 
   show_timing "creating wikidata index" "finished"
 
   show_timing "creating wikidata index" "finished"
 
}
 
}
Line 246: Line 378:
 
       usage;;
 
       usage;;
 
     -aw|--all_wikidata)
 
     -aw|--all_wikidata)
 +
      show_version
 
       show_env
 
       show_env
       #qlever_clone
+
       qlever_clone
 
       #qlever_build
 
       #qlever_build
 
       qlever_pull
 
       qlever_pull
Line 261: Line 394:
 
     -e|--env)
 
     -e|--env)
 
       show_env
 
       show_env
 +
      ;;
 +
    -k|--kill)
 +
      qlever_kill
 
       ;;
 
       ;;
 
     -p|--pull)
 
     -p|--pull)
 
       qlever_pull
 
       qlever_pull
 +
      ;;
 +
    --port)
 +
      if [ $# -lt 1 ]
 +
      then
 +
        usage
 +
      fi
 +
      port=$1
 +
      shift
 +
      ;;
 +
    -s|--server)
 +
      qlever_start $port
 
       ;;
 
       ;;
 
     -wd|--wikidata_download)
 
     -wd|--wikidata_download)
 
       wikidata_download
 
       wikidata_download
 
       ;;
 
       ;;
     -wi|--wikidata-index)
+
     -wi|--wikidata_index)
 
       wikidata_index
 
       wikidata_index
 +
      ;;
 +
    -v|--version)
 +
      show_version
 
       ;;
 
       ;;
 
     -t)
 
     -t)
Line 278: Line 428:
 
   esac
 
   esac
 
done
 
done
 +
</source>
 +
= logstats =
 +
script to summarize progress and statistics from qlever index log. Please note that this script uses german locale for the
 +
Spreadsheet commands such as "RUNDEN". It's unfortunate that spreadsheet programs such as Excel and Numbers use this locale specific naming by default.
 +
<source lang='bash'>
 +
#!/bin/bash
 +
# WF 2022-03-12
 +
# get the relevant log information for the indexer
 +
# $Header: /hd/seel/qlever/RCS/logstats,v 1.2 2022/05/23 06:24:23 wf Exp wf $
 +
 +
logfile=wikidata/wikidata.index-log.txt
 +
echo 'day;time;phase;mill triples;duration;mill triples/h;todo;ETA h' > stats.csv
 +
cat $logfile \
 +
| sed 's/ /;/g' \
 +
| sed 's/ -//g' \
 +
| sed 's/,//g' \
 +
| sed 's/\.[[:digit:]]\+//g' \
 +
| awk -v expectedTriples=17400 -v expectedBoM=900 -v expectedUoM=3200 -v expectedConversion=27300 -v expectedWords=900 '
 +
BEGIN {
 +
  # Field separator
 +
  FS=";"
 +
# double quote
 +
  quote="\x22"
 +
}
 +
# default extraction from line
 +
# 2022-05-22 17:48:22.564 ...
 +
{
 +
  #print $0
 +
  date=$1
 +
  time=$2
 +
}
 +
# start of Processing phase
 +
# 2022-05-22 17:48:22.564 - INFO:  Processing input triples from /dev/stdin ...
 +
/Processing;input;triples;from/ {
 +
  phase="Processing"
 +
  printStartPhase(date,time,phase,expectedTriples)
 +
  row=3
 +
  next
 +
}
 +
# while processing
 +
# 2022-05-23 00:09:50.846 - INFO:  Input triples processed: 17,400,000,000
 +
/Input;triples;processed:;/{
 +
  triples=$8
 +
  next
 +
}
 +
# Start of byte order Merging
 +
# 2022-05-23 00:10:52.614 - INFO:  Merging partial vocabularies in byte order (internal only) ...
 +
/Merging;partial;vocabularies;in;byte;order/ {
 +
  printrow(date,time,triples,row,phase)
 +
  phase="Byte order merging"
 +
  printStartPhase(date,time,phase,expectedBoM)
 +
  row=5
 +
  next
 +
}
 +
/Words;merged:;/ {
 +
  triples=$7
 +
next
 +
}
 +
/Words;processed:;/ {
 +
  triples=$7
 +
next
 +
}
 +
/Merging;partial;vocabularies;in;Unicode;order/ {
 +
  printrow(date,time,triples,row,phase)
 +
  phase="Unicode order merging"
 +
  printStartPhase(date,time,phase,expectedUoM)
 +
  row=7
 +
next
 +
}
 +
/Converting;triples;from;local;/ {
 +
  printrow(date,time,triples,row,phase)
 +
  phase="Triple conversion"
 +
  printStartPhase(date,time,phase,expectedConversion)
 +
  row=9
 +
}
 +
/Triples;converted:;/ {
 +
  triples=$7
 +
next
 +
}
 +
/Building;/ {
 +
printrow(date,time,triples,row,phase)
 +
phase="Prefix tree"
 +
printStartPhase(date,time,phase,expectedWords)
 +
row=11
 +
next
 +
}
 +
/Computing;maximally/ {
 +
printrow(date,time,triples,row,phase)
 +
phase="Compressing prefixes"
 +
printStartPhase(date,time,phase,expectedTriples)
 +
row=13
 +
triples=0
 +
next
 +
}
 +
/Writing;compressed;vocabulary/ {
 +
printrow(date,time,triples,row,phase)
 +
phase="PSO/POS index pair"
 +
printStartPhase(date,time,phase,expectedTriples)
 +
row=15
 +
triples=0
 +
next
 +
}
 +
/Writing;meta;data;for;PSO/ {
 +
printrow(date,time,triples,row,phase)
 +
phase="SPO/SOP index pair"
 +
printStartPhase(date,time,phase,expectedTriples)
 +
row=17
 +
triples=0
 +
next
 +
}
 +
/Writing;meta;data;for;SPO/ {
 +
printrow(date,time,triples,row,phase)
 +
phase="new index pair"
 +
printStartPhase(date,time,phase,expectedTriples)
 +
row=19
 +
triples=0
 +
next
 +
}
 +
function printStartPhase(date,time,phase,expected) {
 +
  printf("%s;%s;%s;%d;;;\n",date,time,phase,expected)
 +
}
 +
function printrow(date,time,triples,row,phase) {
 +
  printf("%s;%s;%s;%s;=(A%d+B%d)-(A%d+B%d);%s=Runden(D%d/E%d;0)%s;=D%d-D%d;%s=Runden(G%d/F%d;1)%s\n",date,time,phase,triples/1000000,row,row,row-1,row-1,quote,row,row,quote,row-1,row,quote,row,row,quote)
 +
}
 +
END {
 +
  printrow(date,time,triples,row,phase)
 +
  printf(";;total;;=SUMME(E$2:E%d)\n",row)
 +
}
 +
' >> stats.csv
 +
cat stats.csv
 +
# open in spreadsheet
 +
open stats.csv
 
</source>
 
</source>

Latest revision as of 07:26, 23 May 2022

The qleverauto script below is for automation only now. See https://github.com/ad-freiburg/qlever-control for the "official" script for indexing and starting.

This is a script for getting started with QLever along the lines of the Quickstart description

see

This script has been renamed to "qleverauto" from "qlever" on 2022-05-23 since "qlever" is now the name of the official qlever-control script provided with https://github.com/ad-freiburg/qlever-control

usage

./qleverauto -h
usage: ./qleverauto [-h|--help|...]
  -h|--help: show this usage
  -aw|--all_wikidata: run all steps for wikidata version,env,pull,download and index
  -b|--build: build qlever docker image
  -p|--pull: pull qlever docker image
  --port <port> port to server endpoint from, default: 7001
  -s|--server: start SPARQL server
  -c|--clone: clone qlever
  -e|--env: show, check and modify environment
  -v|--version: show version of this script
  -wd|--wikidata_download: download wikidata data dump
  -wi|--wikidata_index: build the index for the  wikidata data dump

This helper script simplifies the access to the steps outlined in https://github.com/ad-freiburg/qlever/blob/master/docs/quickstart.md

qleverauto

#!/bin/bash
#
# a script for getting started with QLever and automatic tasks for
# it 
#
# see https://github.com/ad-freiburg/qlever/blob/master/docs/quickstart.md
# see https://wiki.bitplan.com/index.php/QLever
# see https://github.com/ad-freiburg/qlever-control for the
#   official qlever control script
#
#
# WF 2022-01-28
#

# we assume the script is started from the QLEVER_HOME directory
export QLEVER_HOME="$(pwd)"
# docker image used for indexing and serving; qlever_pull switches it to
# adfreiburg/qlever
dockerimage="qlever"
# default host port of the SPARQL endpoint, can be changed with --port
port=7001
# RCS keywords: single quotes are required here - inside double quotes the
# shell would expand $Revision and $Date as (unset) variables and drop the
# keyword names from the displayed version
version='$Revision: 1.29 $'
versionDate='$Date: 2022/05/23 06:15:28 $'

# values of $SECONDS recorded by show_timing
startTime=0
finishTime=0

# ANSI escape sequences used for colored output
# http://www.csc.uvic.ca/~sae/seng265/fall04/tips/s265s047-tips/bash-using-colors.html
endColor='\033[0m'
red='\033[0;31m'
green='\033[0;32m' # '\e[1;32m' is too bright for white bg.
blue='\033[0;34m'

#
# print a message in the given color and switch back to $endColor afterwards
#   params:
#     1: l_color - escape sequence selecting the color of the message
#     2: l_msg - the text to display
#
color_msg() {
  local l_color l_msg
  l_color="$1"
  l_msg="$2"
  # echo -e interprets the backslash escapes of the color sequences
  echo -e "${l_color}${l_msg}${endColor}"
}

#
# error
#
#   print an error message on stderr and terminate the script with
#   exit code 1
#
#   params:
#     1: l_msg - the message to display
error() {
  local l_text="Error: $1"
  # errors are shown in ansi red
  color_msg $red "$l_text" >&2
  exit 1
}

#
# show the usage
#
# Prints one line per command line option to stdout. Reads the global
# $port to display the default port.
#
usage() {
  cat <<EOF
usage: $0 [-h|--help|...]
  -h|--help: show this usage
  -aw|--all_wikidata: run all steps for wikidata version,env,clone,pull,download and index
  -b|--build: build qlever docker image
  -p|--pull: pull qlever docker image
  --port <port> port to server endpoint from, default: $port
  -s|--server: start SPARQL server
  -c|--clone: clone qlever
  -e|--env: show, check and modify environment
  -k|--kill: kill a running qlever process
  -v|--version: show version of this script
  -wd|--wikidata_download: download wikidata data dump
  -wi|--wikidata_index: build the index for the  wikidata data dump

This helper script simplifies the access to the steps outlined in https://github.com/ad-freiburg/qlever/blob/master/docs/quickstart.md
EOF
}

#
# show the start or the end of an action together with the current date
#
#   params:
#     1: l_action - description of the action
#     2: l_state - "started" or "finished"
#
# "started" stores $SECONDS in the global startTime. "finished" stores it
# in the global finishTime and appends the elapsed seconds to the message.
# Any other state is printed without a duration.
#
show_timing() {
  local l_action="$1"
  local l_state="$2"
  # both are local so that the duration text of an earlier call can not
  # show up in the message of a later call
  local l_now
  local l_after=""
  l_now=$(date)
  case "$l_state" in
    started)
      startTime=$SECONDS
      ;;
    finished)
      finishTime=$SECONDS
      l_after=" after $(( finishTime - startTime )) seconds"
      ;;
  esac
  color_msg "$blue" "$l_action $l_state at $l_now$l_after"
}

#
# show the version of this script
#
# Prints the script name followed by the globals $version and $versionDate.
#
show_version() {
  # ${0##*/} removes the directory part without any word splitting, so
  # script paths containing spaces yield the correct name
  local l_script="${0##*/}"
  color_msg "$blue" "$l_script version $version $versionDate"
}

#
# check whether program is installed
#
#  #1: l_prog - the program to check
#
# Prints "<program> → <location> ✅" in green when the program is found
# and "<program> →  ❌" in red otherwise.
#
check_installed() {
  local l_prog="$1"
  local l_installed="✅"
  local l_color=$green
  local l_progbin
  # a single lookup delivers both the location and the found/not found
  # status; command -v is the portable replacement for which
  if ! l_progbin=$(command -v "$l_prog")
  then
    l_installed="❌"
    l_color=$red
  fi
  color_msg "$l_color" "$l_prog → $l_progbin $l_installed"
}

#
# show and modify the environment
#
# Checks the needed helper programs, prints operating system, docker
# version, memory and disk space and finally sets the soft limit for open
# files to 1048576 and prints the resulting limit.
#
show_env() {
  local l_progs="docker top df jq"
  local l_disks="/dev/s"
  local l_prog
  local l_os
  # detect the platform once instead of calling uname for every section
  l_os=$(uname -a)
  case "$l_os" in
    Darwin*)
      l_progs="$l_progs sw_vers"
      l_disks="/dev/disk"
      ;;
    *)
      l_progs="$l_progs lsb_release free"
      ;;
  esac
  color_msg "$blue" "needed software"
  # l_progs is a space separated list - the word splitting is intended
  for l_prog in $l_progs
  do
    check_installed "$l_prog"
  done
  color_msg "$blue" "operating system"
  case "$l_os" in
    Darwin*)
      sw_vers
      ;;
    *)
      lsb_release -a
      ;;
  esac
  color_msg "$blue" "docker version"
  docker --version
  color_msg "$blue" "memory"
  case "$l_os" in
    Darwin*)
      top -l 1 | grep PhysMem | cut -f1,2 -d" "
      ;;
    *)
      free -h
      ;;
  esac
  color_msg "$blue" "diskspace"
  df -h | grep "$l_disks"
  # this is the "modify" part: raise the soft limit for open files
  ulimit -Sn 1048576
  color_msg "$blue" "soft ulimit for files"
  ulimit -Sn
}

#
# check whether the given process runs and kill it if yes
# param #1: l_process  - the process to check (pattern for pgrep -f)
#
# The pattern is matched against full command lines. This script itself is
# never signalled, even if its own name or arguments match the pattern.
#
killIfRunning() {
  local l_process="$1"
  local l_found l_pid l_cmd
  local -a l_pids=()
  # pgrep -fl prints one "<pid> <command>" line per match; the command
  # substitution has completed before the list is examined
  l_found=$(pgrep -fl "$l_process")
  while read -r l_pid l_cmd
  do
    # skip empty input and the running script (main shell or subshell)
    if [ -z "$l_pid" ] || [ "$l_pid" = "$$" ] || [ "$l_pid" = "$BASHPID" ]
    then
      continue
    fi
    # skip processes that are gone in the meantime
    kill -0 "$l_pid" 2> /dev/null || continue
    echo "$l_pid $l_cmd"
    l_pids+=("$l_pid")
  done <<< "$l_found"
  if [ "${#l_pids[@]}" -gt 0 ]
  then
    color_msg "$blue" "$l_process already running"
    # comment out as you like
    # either stop here
    #echo "to kill the process you might want to use"
    #echo "pkill -f $l_process"
    #exit 1
    # or fully automatic kill
    color_msg "$blue" "killing $l_process"
    kill "${l_pids[@]}"
  fi
}
#
# kill a running qlever process
#
qlever_kill() {
  # the pattern handed to killIfRunning
  local l_pattern="qlever"
  killIfRunning "$l_pattern"
}

#
# pull the adfreiburg/qlever docker image and make it the image used by
# the other steps (sets the global dockerimage)
#
qlever_pull() {
  local l_title="pulling qlever docker image"
  dockerimage="adfreiburg/qlever"
  show_timing "$l_title" "started"
  docker pull $dockerimage
  show_timing "$l_title" "finished"
}

#
# start the SPARQL server
#
#   params:
#     1: l_port - host port that is mapped to the container port 7001,
#                 defaults to the global $port (7001 if that is unset)
#
# Runs the image named by the global $dockerimage with the directory
# $QLEVER_HOME/qlever-indices/wikidata mounted as /index.
#
qlever_start() {
  local l_port="${1:-${port:-7001}}"
  # every argument is quoted so that a QLEVER_HOME containing spaces still
  # results in a single volume argument
  docker run --rm \
    -v "$QLEVER_HOME/qlever-indices/wikidata:/index" \
    -p "$l_port:7001" \
    -e INDEX_PREFIX=wikidata \
    --name qlever.wikidata \
    "$dockerimage"
}

#
# clone the qlever code
#
# Clones https://github.com/ad-freiburg/qlever (with submodules) into
# $QLEVER_HOME/qlever-code unless that directory already exists.
#
qlever_clone() {
  # never clone into whatever directory we happen to be in
  cd "$QLEVER_HOME" || error "cannot change to directory $QLEVER_HOME"
  if [ -d qlever-code ]
  then
    color_msg "$green" "clone of qlever-code already available"
    return 0
  fi
  color_msg "$blue" "cloning qlever - please wait typically 1 min ..."
  show_timing "cloning qlever" "started"
  git clone --recursive https://github.com/ad-freiburg/qlever qlever-code
  show_timing "cloning qlever" "finished"
}

#
# build the docker image
#
# Builds the image "qlever" from Dockerfiles/Dockerfile.Ubuntu20.04 inside
# $QLEVER_HOME/qlever-code.
#
qlever_build() {
  # the check for an already existing image is disabled, the image is
  # built on every call:
  #docker images | grep qlever
  #if [ $? -ne 0 ]
  #then
  # the build context is the current directory, so stop if the source
  # directory can not be entered
  cd "$QLEVER_HOME/qlever-code" \
    || error "cannot change to directory $QLEVER_HOME/qlever-code"
  color_msg "$blue" "building qlever - please wait typically 15 min ..."
  show_timing "docker build" "started"
  # docker build -t qlever .
  docker build --file Dockerfiles/Dockerfile.Ubuntu20.04 -t qlever .
  show_timing "docker build" "finished"
  #else
  #  color_msg $green "qlever image already build"
  #fi
}

#
# generic download
#
# params
#   1: title of the download
#   2: expected time
#   3: target directory (created if missing)
#   4: file expected
#   5: url to download from
#
# Changes into the target directory and downloads the url with wget unless
# the expected file is already there. Returns 1 when wget fails.
#
download() {
  local l_title="$1"
  local l_expected="$2"
  local l_target="$3"
  local l_file="$4"
  local l_url="$5"
  # without the target directory wget would store the file in whatever
  # directory is current
  mkdir -p "$l_target" || error "cannot create directory $l_target"
  cd "$l_target" || error "cannot change to directory $l_target"
  # check if file already exists
  if [ -f "$l_file" ]
  then
    color_msg "$green" "$l_title:$l_file already downloaded"
    return 0
  fi
  color_msg "$blue" "downloading $l_title:$l_file ... please wait typically $l_expected ..."
  show_timing "$l_title download" "started"
  if ! wget "$l_url"
  then
    # a partial file would be taken for a complete download by the check
    # above on the next run
    color_msg "$red" "$l_title download from $l_url failed - remove any partial $l_file before retrying" 1>&2
    return 1
  fi
  show_timing "$l_title download" "finished"
}

#
# wikidata config copy
#
# Makes sure the directory qlever-indices/wikidata exists below
# $QLEVER_HOME and copies wikidata.settings.json from the qlever-code
# examples into it unless a copy is already there.
# Sets the global variables target, config and configpath and leaves the
# index directory as the current directory.
#
wikidata_copyconfig() {
  target=qlever-indices/wikidata
  config=wikidata.settings.json
  configpath=$QLEVER_HOME/qlever-code/examples/$config
  cd $QLEVER_HOME
  if [ -d $target ]
  then
    color_msg $green "$target already exists"
  else
    color_msg $blue "creating $target"
    mkdir -p $target
  fi
  cd $target
  if [ -f $config ]
  then
    color_msg $green "$config already copied to $target"
  else
    color_msg $blue "copying config file $configpath to $target"
    cp -p $configpath .
  fi
}

#
# wikidata download
#
# Downloads the lexemes dump and the full dump from dumps.wikimedia.org
# into $QLEVER_HOME/wikidata by calling download for each of them.
#
wikidata_download() {
  # no trailing slash here - the file name is appended with a "/" below
  local l_base=https://dumps.wikimedia.org/wikidatawiki/entities
  local l_dump=latest-all.ttl.bz2
  local l_lexemes=latest-lexemes.ttl.bz2
  #wikidata_copyconfig
  # target stays a global variable as before
  target="$QLEVER_HOME/wikidata"
  download "wikidata lexemes" "3min" "$target" "$l_lexemes" "$l_base/$l_lexemes"
  download "wikidata dump" "6hours" "$target" "$l_dump" "$l_base/$l_dump"
}

#
# build the wikidata index
#
# Changes into $QLEVER_HOME/wikidata, makes that directory writable for
# others, sources ../qlever-control/qlever and runs "qlever index".
#
wikidata_index() {
  local l_dir="$QLEVER_HOME/wikidata"
  # chmod below works on ".", so stop if the data directory is not there
  cd "$l_dir" || error "cannot change to directory $l_dir"
  chmod o+w .
  show_timing "creating wikidata index" "started"
  # former docker based indexing, kept for reference:
#   docker run -i --rm -v $QLEVER_HOME/qlever-indices/wikidata:/index --entrypoint bash $dockerimage  -c "cd /index && bzcat latest-all.ttl.bz2 latest-lexemes.ttl.bz2 | IndexBuilderMain -F ttl -f - -l -i wikidata -s wikidata.settings.json | tee wikidata.index-log.txt"
  # "qlever index" only exists after the control script has been sourced
  [ -f ../qlever-control/qlever ] \
    || error "qlever-control script ../qlever-control/qlever not found in $l_dir"
  . ../qlever-control/qlever
  check_installed IndexBuilderMain
  qlever index
  show_timing "creating wikidata index" "finished"
}

# commandline options according to usage
#
# The options are handled strictly from left to right, e.g. --port has to
# be given before -s to take effect. An unknown option or a --port without
# a value prints the usage and ends the script with exit code 1.
main() {
  local l_option
  # $# instead of testing "$1" so that an empty argument does not end the
  # option handling silently
  while [ $# -gt 0 ]
  do
    l_option="$1"
    shift
    case "$l_option" in
      -h|--help)
        usage
        ;;
      -aw|--all_wikidata)
        show_version
        show_env
        qlever_clone
        #qlever_build
        qlever_pull
        wikidata_download
        wikidata_index
        ;;
      -b|--build)
        qlever_build
        ;;
      -c|--clone)
        qlever_clone
        ;;
      -e|--env)
        show_env
        ;;
      -k|--kill)
        qlever_kill
        ;;
      -p|--pull)
        qlever_pull
        ;;
      --port)
        if [ $# -lt 1 ]
        then
          # do not continue with an empty port
          usage
          exit 1
        fi
        port="$1"
        shift
        ;;
      -s|--server)
        qlever_start "$port"
        ;;
      -wd|--wikidata_download)
        wikidata_download
        ;;
      -wi|--wikidata_index)
        wikidata_index
        ;;
      -v|--version)
        show_version
        ;;
      -t)
        # timing self test
        show_timing "testing" "started"
        sleep 2
        show_timing "testing" "finished"
        ;;
      *)
        usage
        error "unknown option: $l_option"
        ;;
    esac
  done
}

main "$@"

logstats

Script to summarize progress and statistics from the QLever index log. Please note that this script uses the German locale for spreadsheet functions such as "RUNDEN" and "SUMME". It's unfortunate that spreadsheet programs such as Excel and Numbers use this locale-specific naming by default.

#!/bin/bash
# WF 2022-03-12
# get the relevant log information for the indexer
# $Header: /hd/seel/qlever/RCS/logstats,v 1.2 2022/05/23 06:24:23 wf Exp wf $

#
# summarize the indexer log as a semicolon separated spreadsheet file
#
#   params:
#     1: l_logfile - the index log to read,
#                    default: wikidata/wikidata.index-log.txt
#     2: l_csv - the csv file to write, default: stats.csv
#
# Every phase yields a start row (expected amount) and an end row (reached
# amount plus spreadsheet formulas using the german function names Runden
# and SUMME). The result is printed and opened in a spreadsheet program if
# an opener is available. Returns 1 if the log file can not be read.
#
logstats() {
  local l_logfile="${1:-wikidata/wikidata.index-log.txt}"
  local l_csv="${2:-stats.csv}"
  if [ ! -r "$l_logfile" ]
  then
    echo "logstats: cannot read log file $l_logfile" 1>&2
    return 1
  fi
  echo 'day;time;phase;mill triples;duration;mill triples/h;todo;ETA h' > "$l_csv"
  # normalize each log line:
  #   blanks become the field separator ";"
  #   the "<tab>-" behind the timestamp is removed
  #   thousands separators are removed
  #   fractions (e.g. the milliseconds) are removed; \{1,\} is the portable
  #   spelling of the GNU only \+
  sed -e 's/ /;/g' \
      -e $'s/\t-//g' \
      -e 's/,//g' \
      -e 's/\.[[:digit:]]\{1,\}//g' \
      "$l_logfile" \
  | awk -v expectedTriples=17400 -v expectedBoM=900 -v expectedUoM=3200 -v expectedConversion=27300 -v expectedWords=900 '
BEGIN {
  # Field separator
  FS=";"
  # double quote
  quote="\""
}
# default extraction from line
# 2022-05-22 17:48:22.564 ...
{
  #print $0
  date=$1
  time=$2
}
# start of Processing phase
# 2022-05-22 17:48:22.564	- INFO:  Processing input triples from /dev/stdin ...
/Processing;input;triples;from/ {
  phase="Processing"
  printStartPhase(date,time,phase,expectedTriples)
  row=3
  next
}
# while processing
# 2022-05-23 00:09:50.846	- INFO:  Input triples processed: 17,400,000,000
/Input;triples;processed:;/{
  triples=$8
  next
}
# Start of byte order Merging
# 2022-05-23 00:10:52.614	- INFO:  Merging partial vocabularies in byte order (internal only) ...
/Merging;partial;vocabularies;in;byte;order/ {
  printrow(date,time,triples,row,phase)
  phase="Byte order merging"
  printStartPhase(date,time,phase,expectedBoM)
  row=5
  next
}
# progress counters of the merging phases
/Words;merged:;/ {
  triples=$7
  next
}
/Words;processed:;/ {
  triples=$7
  next
}
/Merging;partial;vocabularies;in;Unicode;order/ {
  printrow(date,time,triples,row,phase)
  phase="Unicode order merging"
  printStartPhase(date,time,phase,expectedUoM)
  row=7
  next
}
/Converting;triples;from;local;/ {
  printrow(date,time,triples,row,phase)
  phase="Triple conversion"
  printStartPhase(date,time,phase,expectedConversion)
  row=9
  next
}
/Triples;converted:;/ {
  triples=$7
  next
}
/Building;/ {
  printrow(date,time,triples,row,phase)
  phase="Prefix tree"
  printStartPhase(date,time,phase,expectedWords)
  row=11
  next
}
/Computing;maximally/ {
  printrow(date,time,triples,row,phase)
  phase="Compressing prefixes"
  printStartPhase(date,time,phase,expectedTriples)
  row=13
  triples=0
  next
}
/Writing;compressed;vocabulary/ {
  printrow(date,time,triples,row,phase)
  phase="PSO/POS index pair"
  printStartPhase(date,time,phase,expectedTriples)
  row=15
  triples=0
  next
}
/Writing;meta;data;for;PSO/ {
  printrow(date,time,triples,row,phase)
  phase="SPO/SOP index pair"
  printStartPhase(date,time,phase,expectedTriples)
  row=17
  triples=0
  next
}
/Writing;meta;data;for;SPO/ {
  printrow(date,time,triples,row,phase)
  phase="new index pair"
  printStartPhase(date,time,phase,expectedTriples)
  row=19
  triples=0
  next
}
# first row of a phase: timestamp, phase name and expected amount
function printStartPhase(date,time,phase,expected) {
  printf("%s;%s;%s;%d;;;\n",date,time,phase,expected)
}
# last row of a phase: reached amount in millions plus the formulas for
# duration, rate, remaining amount and ETA referring to spreadsheet row "row"
function printrow(date,time,triples,row,phase) {
  printf("%s;%s;%s;%s;=(A%d+B%d)-(A%d+B%d);%s=Runden(D%d/E%d;0)%s;=D%d-D%d;%s=Runden(G%d/F%d;1)%s\n",date,time,phase,triples/1000000,row,row,row-1,row-1,quote,row,row,quote,row-1,row,quote,row,row,quote)
}
END {
  # close the phase that was active when the log ended
  printrow(date,time,triples,row,phase)
  printf(";;total;;=SUMME(E$2:E%d)\n",row)
}
' >> "$l_csv"
  cat "$l_csv"
  # open in spreadsheet
  case "$(uname -s)" in
    Darwin)
      open "$l_csv"
      ;;
    *)
      # open is a macOS command; use the desktop opener where there is one
      if command -v xdg-open > /dev/null 2>&1
      then
        xdg-open "$l_csv"
      fi
      ;;
  esac
}

logstats "$@"