Difference between revisions of "QLever/script"

From BITPlan Wiki
Jump to navigation Jump to search
 
(5 intermediate revisions by the same user not shown)
Line 1: Line 1:
The script below is deprecated.  
+
__TOC__
See https://github.com/ad-freiburg/qlever-control for the "official" replacement.
+
The qleverauto script below is for automation only now.  
 +
See https://github.com/ad-freiburg/qlever-control for the "official" script for indexing and starting.
  
 
This is a script for getting started with {{Link|target=QLever}} along the lines of the [https://github.com/ad-freiburg/qlever/blob/master/docs/quickstart.md Quickstart] description
 
This is a script for getting started with {{Link|target=QLever}} along the lines of the [https://github.com/ad-freiburg/qlever/blob/master/docs/quickstart.md Quickstart] description
Line 13: Line 14:
 
|state=open
 
|state=open
 
}}
 
}}
 +
 +
This script has been renamed to "qleverauto" from "qlever" on 2022-05-23 since "qlever" is now the name of the official qlever-control script provided with https://github.com/ad-freiburg/qlever-control
 
= usage =
 
= usage =
 
<source lang='bash'>
 
<source lang='bash'>
usage: ./qlever [-h|--help|...]
+
/qleverauto -h
 +
usage: ./qleverauto [-h|--help|...]
 
   -h|--help: show this usage
 
   -h|--help: show this usage
 
   -aw|--all_wikidata: run all steps for wikidata version,env,pull,download and index
 
   -aw|--all_wikidata: run all steps for wikidata version,env,pull,download and index
Line 26: Line 30:
 
   -v|--version: show version of this script
 
   -v|--version: show version of this script
 
   -wd|--wikidata_download: download wikidata data dump
 
   -wd|--wikidata_download: download wikidata data dump
   -wi|--wikidata_index: download wikidata data dump
+
   -wi|--wikidata_index: build the index for the  wikidata data dump
  
 
This helper script simplifies the access to the steps outlined in https://github.com/ad-freiburg/qlever/blob/master/docs/quickstart.md
 
This helper script simplifies the access to the steps outlined in https://github.com/ad-freiburg/qlever/blob/master/docs/quickstart.md
 +
 
</source>
 
</source>
  
= qlever =
+
= qleverauto =
 
<source lang='bash'>
 
<source lang='bash'>
 
#!/bin/bash
 
#!/bin/bash
 
#
 
#
# a script for getting started with QLever
+
# a script for getting started with QLever and automatic tasks for
 +
# it
 
#
 
#
 
# see https://github.com/ad-freiburg/qlever/blob/master/docs/quickstart.md
 
# see https://github.com/ad-freiburg/qlever/blob/master/docs/quickstart.md
 
# see https://wiki.bitplan.com/index.php/QLever
 
# see https://wiki.bitplan.com/index.php/QLever
 +
# see https://github.com/ad-freiburg/qlever-control for the
 +
#  official qlever control script
 
#
 
#
 
#
 
#
Line 48: Line 56:
 
dockerimage="qlever"
 
dockerimage="qlever"
 
port=7001
 
port=7001
version="$Revision: 1.27 $"
+
version="$Revision: 1.29 $"
versionDate="$Date: 2022/03/16 08:54:18 $"
+
versionDate="$Date: 2022/05/23 06:15:28 $"
  
 
startTime=0
 
startTime=0
Line 338: Line 346:
 
#
 
#
 
wikidata_download() {
 
wikidata_download() {
  local l_target=qlever-indices/wikidata
 
 
   local l_base=https://dumps.wikimedia.org/wikidatawiki/entities/
 
   local l_base=https://dumps.wikimedia.org/wikidatawiki/entities/
 
   local l_dump=latest-all.ttl.bz2
 
   local l_dump=latest-all.ttl.bz2
 
   local l_lexemes=latest-lexemes.ttl.bz2
 
   local l_lexemes=latest-lexemes.ttl.bz2
   wikidata_copyconfig
+
   #wikidata_copyconfig
   target=$QLEVER_HOME/qlever-indices/wikidata
+
   target=$QLEVER_HOME/wikidata
 
   download "wikidata lexemes" "3min" $target $l_lexemes $l_base/$l_lexemes
 
   download "wikidata lexemes" "3min" $target $l_lexemes $l_base/$l_lexemes
 
   download "wikidata dump" "6hours" $target $l_dump $l_base/$l_dump
 
   download "wikidata dump" "6hours" $target $l_dump $l_base/$l_dump
Line 352: Line 359:
 
#
 
#
 
wikidata_index() {
 
wikidata_index() {
   cd $QLEVER_HOME
+
   cd $QLEVER_HOME/wikidata
 
   chmod o+w .
 
   chmod o+w .
 
   show_timing "creating wikidata index" "started"
 
   show_timing "creating wikidata index" "started"
  docker run -i --rm -v $QLEVER_HOME/qlever-indices/wikidata:/index --entrypoint bash $dockerimage  -c "cd /index && bzcat latest-all.ttl.bz2 latest-lexemes.ttl.bz2 | IndexBuilderMain -F ttl -f - -l -i wikidata -s wikidata.settings.json | tee wikidata.index-log.txt"
+
docker run -i --rm -v $QLEVER_HOME/qlever-indices/wikidata:/index --entrypoint bash $dockerimage  -c "cd /index && bzcat latest-all.ttl.bz2 latest-lexemes.ttl.bz2 | IndexBuilderMain -F ttl -f - -l -i wikidata -s wikidata.settings.json | tee wikidata.index-log.txt"
 +
  . ../qlever-control/qlever
 +
  check_installed IndexBuilderMain
 +
  qlever index
 
   show_timing "creating wikidata index" "finished"
 
   show_timing "creating wikidata index" "finished"
 
}
 
}
Line 418: Line 428:
 
   esac
 
   esac
 
done
 
done
 +
</source>
 +
= logstats =
 +
script to summarize progress and statistics from qlever index log. Please note that this script uses german locale for the
 +
Spreadsheet commands such as "RUNDEN". It's unfortunate that spreadsheet programs such as Excel and Numbers use this locale specific naming by default.
 +
<source lang='bash'>
 +
#!/bin/bash
 +
# WF 2022-03-12
 +
# get the relevant log information for the indexer
 +
# $Header: /hd/seel/qlever/RCS/logstats,v 1.2 2022/05/23 06:24:23 wf Exp wf $
 +
 +
logfile=wikidata/wikidata.index-log.txt
 +
echo 'day;time;phase;mill triples;duration;mill triples/h;todo;ETA h' > stats.csv
 +
cat $logfile \
 +
| sed 's/ /;/g' \
 +
| sed 's/ -//g' \
 +
| sed 's/,//g' \
 +
| sed 's/\.[[:digit:]]\+//g' \
 +
| awk -v expectedTriples=17400 -v expectedBoM=900 -v expectedUoM=3200 -v expectedConversion=27300 -v expectedWords=900 '
 +
BEGIN {
 +
  # Field separator
 +
  FS=";"
 +
# double quote
 +
  quote="\x22"
 +
}
 +
# default extraction from line
 +
# 2022-05-22 17:48:22.564 ...
 +
{
 +
  #print $0
 +
  date=$1
 +
  time=$2
 +
}
 +
# start of Processing phase
 +
# 2022-05-22 17:48:22.564 - INFO:  Processing input triples from /dev/stdin ...
 +
/Processing;input;triples;from/ {
 +
  phase="Processing"
 +
  printStartPhase(date,time,phase,expectedTriples)
 +
  row=3
 +
  next
 +
}
 +
# while processing
 +
# 2022-05-23 00:09:50.846 - INFO:  Input triples processed: 17,400,000,000
 +
/Input;triples;processed:;/{
 +
  triples=$8
 +
  next
 +
}
 +
# Start of byte order Merging
 +
# 2022-05-23 00:10:52.614 - INFO:  Merging partial vocabularies in byte order (internal only) ...
 +
/Merging;partial;vocabularies;in;byte;order/ {
 +
  printrow(date,time,triples,row,phase)
 +
  phase="Byte order merging"
 +
  printStartPhase(date,time,phase,expectedBoM)
 +
  row=5
 +
  next
 +
}
 +
/Words;merged:;/ {
 +
  triples=$7
 +
next
 +
}
 +
/Words;processed:;/ {
 +
  triples=$7
 +
next
 +
}
 +
/Merging;partial;vocabularies;in;Unicode;order/ {
 +
  printrow(date,time,triples,row,phase)
 +
  phase="Unicode order merging"
 +
  printStartPhase(date,time,phase,expectedUoM)
 +
  row=7
 +
next
 +
}
 +
/Converting;triples;from;local;/ {
 +
  printrow(date,time,triples,row,phase)
 +
  phase="Triple conversion"
 +
  printStartPhase(date,time,phase,expectedConversion)
 +
  row=9
 +
}
 +
/Triples;converted:;/ {
 +
  triples=$7
 +
next
 +
}
 +
/Building;/ {
 +
printrow(date,time,triples,row,phase)
 +
phase="Prefix tree"
 +
printStartPhase(date,time,phase,expectedWords)
 +
row=11
 +
next
 +
}
 +
/Computing;maximally/ {
 +
printrow(date,time,triples,row,phase)
 +
phase="Compressing prefixes"
 +
printStartPhase(date,time,phase,expectedTriples)
 +
row=13
 +
triples=0
 +
next
 +
}
 +
/Writing;compressed;vocabulary/ {
 +
printrow(date,time,triples,row,phase)
 +
phase="PSO/POS index pair"
 +
printStartPhase(date,time,phase,expectedTriples)
 +
row=15
 +
triples=0
 +
next
 +
}
 +
/Writing;meta;data;for;PSO/ {
 +
printrow(date,time,triples,row,phase)
 +
phase="SPO/SOP index pair"
 +
printStartPhase(date,time,phase,expectedTriples)
 +
row=17
 +
triples=0
 +
next
 +
}
 +
/Writing;meta;data;for;SPO/ {
 +
printrow(date,time,triples,row,phase)
 +
phase="new index pair"
 +
printStartPhase(date,time,phase,expectedTriples)
 +
row=19
 +
triples=0
 +
next
 +
}
 +
function printStartPhase(date,time,phase,expected) {
 +
  printf("%s;%s;%s;%d;;;\n",date,time,phase,expected)
 +
}
 +
function printrow(date,time,triples,row,phase) {
 +
  printf("%s;%s;%s;%s;=(A%d+B%d)-(A%d+B%d);%s=Runden(D%d/E%d;0)%s;=D%d-D%d;%s=Runden(G%d/F%d;1)%s\n",date,time,phase,triples/1000000,row,row,row-1,row-1,quote,row,row,quote,row-1,row,quote,row,row,quote)
 +
}
 +
END {
 +
  printrow(date,time,triples,row,phase)
 +
  printf(";;total;;=SUMME(E$2:E%d)\n",row)
 +
}
 +
' >> stats.csv
 +
cat stats.csv
 +
# open in spreadsheet
 +
open stats.csv
 
</source>
 
</source>

Latest revision as of 08:26, 23 May 2022

The qleverauto script below is for automation only now. See https://github.com/ad-freiburg/qlever-control for the "official" script for indexing and starting.

This is a script for getting started with QLever along the lines of the Quickstart description

see

This script has been renamed to "qleverauto" from "qlever" on 2022-05-23 since "qlever" is now the name of the official qlever-control script provided with https://github.com/ad-freiburg/qlever-control

usage

/qleverauto -h
usage: ./qleverauto [-h|--help|...]
  -h|--help: show this usage
  -aw|--all_wikidata: run all steps for wikidata version,env,pull,download and index
  -b|--build: build qlever docker image
  -p|--pull: pull qlever docker image
  --port <port> port to server endpoint from, default: 7001
  -s|--server: start SPARQL server
  -c|--clone: clone qlever
  -e|--env: show, check and modify environment
  -v|--version: show version of this script
  -wd|--wikidata_download: download wikidata data dump
  -wi|--wikidata_index: build the index for the  wikidata data dump

This helper script simplifies the access to the steps outlined in https://github.com/ad-freiburg/qlever/blob/master/docs/quickstart.md

qleverauto

#!/bin/bash
#
# a script for getting started with QLever and automatic tasks for
# it
#
# see https://github.com/ad-freiburg/qlever/blob/master/docs/quickstart.md
# see https://wiki.bitplan.com/index.php/QLever
# see https://github.com/ad-freiburg/qlever-control for the
#   official qlever control script
#
#
# WF 2022-01-28
#

# we assume the script is started from the QLEVER_HOME directory
export QLEVER_HOME=$(pwd)
# docker image to use; qlever_pull switches this to adfreiburg/qlever
dockerimage="qlever"
# default host port for the SPARQL server endpoint
port=7001
# RCS version keywords - expanded automatically on checkin; do not edit
version="$Revision: 1.29 $"
versionDate="$Date: 2022/05/23 06:15:28 $"

# timing globals read and written by show_timing
startTime=0
finishTime=0

#ansi colors
#http://www.csc.uvic.ca/~sae/seng265/fall04/tips/s265s047-tips/bash-using-colors.html
blue='\033[0;34m'
red='\033[0;31m'
green='\033[0;32m' # '\e[1;32m' is too bright for white bg.
endColor='\033[0m'

#
# print a message wrapped in the given ANSI color escape
#
#   params:
#     1: l_tint - ANSI escape sequence to start the message with
#     2: l_text - the text to display
#
color_msg() {
  local l_tint="$1"
  local l_text="$2"
  # -e interprets the escape sequences; endColor resets the terminal
  echo -e "${l_tint}${l_text}${endColor}"
}

#
# error
#
#   print an error message in red and terminate the script
#
#   params:
#     1: l_reason - the message to display
error() {
  local l_reason="$1"
  # diagnostics belong on stderr; red signals failure
  color_msg $red "Error: $l_reason" 1>&2
  exit 1
}

#
# show the usage
#
usage() {
  # a here-document keeps the help text in one readable chunk;
  # $0 and $port are expanded exactly as in separate echo lines
  cat << EOF
usage: $0 [-h|--help|...]
  -h|--help: show this usage
  -aw|--all_wikidata: run all steps for wikidata version,env,pull,download and index
  -b|--build: build qlever docker image
  -p|--pull: pull qlever docker image
  --port <port> port to server endpoint from, default: $port
  -s|--server: start SPARQL server
  -c|--clone: clone qlever
  -e|--env: show, check and modify environment
  -v|--version: show version of this script
  -wd|--wikidata_download: download wikidata data dump
  -wi|--wikidata_index: build the index for the  wikidata data dump

This helper script simplifies the access to the steps outlined in https://github.com/ad-freiburg/qlever/blob/master/docs/quickstart.md
EOF
}

#
# report the start or the end of an action with a timestamp
#
#   params:
#     1: l_action - description of the action
#     2: l_state  - "started" or "finished"
#
#   side effects: updates the globals startTime, finishTime, now, after
#
show_timing() {
  local l_action="$1"
  local l_state="$2"
  now=$(date)
  if [ "$l_state" = "started" ]
  then
    # remember when the action began (bash SECONDS counter)
    startTime=$SECONDS
    after=""
  elif [ "$l_state" = "finished" ]
  then
    finishTime=$SECONDS
    local l_elapsed=$(( finishTime - startTime ))
    after=" after $l_elapsed seconds"
  fi
  color_msg $blue "$l_action $l_state at $now$after"
}

#
# print the name, version and version date of this script
#
show_version() {
  local l_name
  l_name=$(basename "$0")
  color_msg $blue "$l_name version $version $versionDate"
}

#
# check whether a program is installed and report the result
#
#  params:
#    1: l_prog - the program to check
#
#  output: colored line "<prog><path> ✅" when found, "<prog> ❌" otherwise
#
check_installed() {
  local l_prog="$1"
  local l_installed="✅"
  local l_color=$green
  local l_progbin
  # fix: the original ran the (deprecated) 'which' twice;
  # command -v is the portable builtin and one call suffices -
  # it prints the path and fails when the program is missing
  if ! l_progbin=$(command -v "$l_prog")
  then
    l_installed="❌"
    l_color=$red
  fi
  color_msg $l_color "$l_prog$l_progbin $l_installed"
}

#
# show and modify the environment
#
# lists the required tools, operating system, docker version,
# memory and diskspace and raises the soft limit for open files
#
show_env() {
  # tools needed on any platform
  local l_progs="docker top df jq"
  # add the platform specific tools
  case $(uname -a) in
    Darwin*)
      l_progs="$l_progs sw_vers"
      ;;
    *) 
      l_progs="$l_progs lsb_release free"
     ;;
  esac
  color_msg $blue "needed software"
  for l_prog in $l_progs
  do
    check_installed $l_prog
  done
  color_msg $blue "operating system"
  # grep pattern used to pick the relevant devices from df output below
  local l_disks="/dev/s"
  case $(uname -a) in
    Darwin*)
      l_disks="/dev/disk"
      sw_vers;;
    *)
      lsb_release -a
  esac
  color_msg $blue "docker version"
  docker --version
  color_msg $blue "memory"
  case $(uname -a) in
    Darwin*)
      # macOS has no 'free'; extract the PhysMem summary from top
      top -l 1 | grep PhysMem | cut -f1,2 -d" "
      ;;
    *) free -h
      ;;
  esac
  color_msg $blue "diskspace"
  df -h | grep $l_disks
  # raise the soft limit for open files
  # (presumably needed by the index build - confirm)
  ulimit -Sn 1048576
  color_msg $blue "soft ulimit for files"
  ulimit -Sn
}

#
# check whether the given process runs and kill it if yes
# param #1: l_process  - the process to check
#
killIfRunning() {
  local l_process="$1"
  pgrep -fl "$l_process"
  local l_presult=$?
  if [ $l_presult -eq 0 ]
  then
    # fix: color_msg needs a color as its first argument; the
    # original passed the message as the color so the text was
    # printed uncolored and the message argument was empty
    color_msg $blue "$l_process already running"
    # comment out as you like
    # either stop here
    #echo "to kill the process you might want to use"
    #echo "pkill -f $l_process"
    #exit 1
    # or fully automatic kill
    color_msg $red "killing $l_process"
    pkill -f "$l_process"
  fi
}
#
# stop a running qlever process if there is one
# 
qlever_kill() {
  killIfRunning "qlever"
}

#
# pull the qlever image
#
# side effect: switches the global dockerimage to the official
# adfreiburg/qlever image
#
qlever_pull() {
  dockerimage="adfreiburg/qlever"
  local l_action="pulling qlever docker image"
  show_timing "$l_action" "started"
  docker pull $dockerimage
  show_timing "$l_action" "finished"
}

#
# start the SPARQL server
#
#   params:
#     1: l_port - host port to publish the endpoint on
#
qlever_start() {
  local l_port="$1"
  local l_index="$QLEVER_HOME/qlever-indices/wikidata"
  # mount the prebuilt wikidata index into the container
  docker run --rm \
      -v $l_index:/index \
      -p $l_port:7001 \
      -e INDEX_PREFIX=wikidata \
      --name qlever.wikidata \
      $dockerimage
}

#
# clone the qlever code
#
# clones https://github.com/ad-freiburg/qlever into
# $QLEVER_HOME/qlever-code unless it is already there
#
qlever_clone() {
  # fix: abort if QLEVER_HOME is not reachable instead of
  # cloning into whatever the current directory happens to be
  cd "$QLEVER_HOME" || error "cannot cd to $QLEVER_HOME"
  if [ ! -d qlever-code ]
  then
    color_msg $blue "cloning qlever - please wait typically 1 min ..."
    show_timing "cloning qlever" "started"
    git clone --recursive https://github.com/ad-freiburg/qlever qlever-code
    show_timing "cloning qlever" "finished"
  else
    # fix: message had a typo "clever-code"
    color_msg $green "clone of qlever-code already available"
  fi
}

#
# build the docker image
#
qlever_build() {
  # unconditional build; the former image-exists guard is kept for reference:
  #docker images | grep qlever
  #if [ $? -ne 0 ]
  #then
  local l_action="docker build"
  cd $QLEVER_HOME/qlever-code
  color_msg $blue "building qlever - please wait typically 15 min ..."
  show_timing "$l_action" "started"
  # docker build -t qlever .
  docker build --file Dockerfiles/Dockerfile.Ubuntu20.04 -t qlever .
  show_timing "$l_action" "finished"
  #else
  #  color_msg $green "qlever image already build"
  #fi
}

#
# generic download
#
# params
#   1: l_title - title of the download
#   2: l_expected - expected duration (for the progress message)
#   3: l_target - target directory
#   4: l_file - file expected
#   5: l_url - url to download from
#
download() {
  local l_title="$1"
  local l_expected="$2"
  local l_target="$3"
  local l_file="$4"
  local l_url="$5"
  # fix: abort instead of silently downloading into the current
  # directory when the target directory cannot be entered;
  # quoting protects against paths with spaces
  cd "$l_target" || error "download: cannot cd to $l_target"
  # check if file already exists
  if [ -f "$l_file" ]
  then 
    color_msg $green "$l_title:$l_file already downloaded"
  else 
    color_msg $blue "downloading $l_title:$l_file ... please wait typically $l_expected ..."
    show_timing "$l_title download" "started"
    wget "$l_url"
    show_timing "$l_title download" "finished"
  fi
}

#
# wikidata config copy 
#
# make sure the wikidata index directory exists and contains the
# example settings file shipped with the qlever sources
# side effects: sets the globals "target", "config" and "configpath"
# and leaves the shell in the target directory
#
wikidata_copyconfig() {
  cd $QLEVER_HOME
  target=qlever-indices/wikidata
  # create the index directory on first use
  if [ -d $target ]
  then
     color_msg $green "$target already exists"
  else
     color_msg $blue "creating $target"
     mkdir -p $target
  fi
  cd $target
  config=wikidata.settings.json
  configpath=$QLEVER_HOME/qlever-code/examples/$config
  # copy the example settings once, preserving timestamps
  if [ -f $config ]
  then
    color_msg $green "$config already copied to $target"
  else
    color_msg $blue "copying config file $configpath to $target"
    cp -p $configpath .
  fi
}

#
# wikidata download
#
# fetch the wikidata lexemes and the full dump into
# $QLEVER_HOME/wikidata using the generic download helper
#
wikidata_download() {
  local l_mirror=https://dumps.wikimedia.org/wikidatawiki/entities/
  local l_dumpfile=latest-all.ttl.bz2
  local l_lexfile=latest-lexemes.ttl.bz2
  # config copy is handled by the official qlever-control script now
  #wikidata_copyconfig
  target=$QLEVER_HOME/wikidata
  download "wikidata lexemes" "3min" $target $l_lexfile $l_mirror/$l_lexfile
  download "wikidata dump" "6hours" $target $l_dumpfile $l_mirror/$l_dumpfile
}

#
# build the wikidata index
#
# delegates the actual index build to the official qlever-control
# script which is expected in ../qlever-control relative to
# $QLEVER_HOME/wikidata
#
wikidata_index() {
   cd $QLEVER_HOME/wikidata
   # the indexer needs write access to the index directory
   chmod o+w .
   show_timing "creating wikidata index" "started"
   # former direct docker invocation, kept for reference:
#   docker run -i --rm -v $QLEVER_HOME/qlever-indices/wikidata:/index --entrypoint bash $dockerimage  -c "cd /index && bzcat latest-all.ttl.bz2 latest-lexemes.ttl.bz2 | IndexBuilderMain -F ttl -f - -l -i wikidata -s wikidata.settings.json | tee wikidata.index-log.txt"
   # source the official control script to get the "qlever" function
   . ../qlever-control/qlever
   # NOTE(review): assumes IndexBuilderMain is available after sourcing
   # qlever-control - confirm against the qlever-control repository
   check_installed IndexBuilderMain
   qlever index
   show_timing "creating wikidata index" "finished"
}

# commandline options according to usage
while [  "$1" != ""  ]
do
  option=$1
  shift
  case $option in
    -h|--help)
       usage;;
    -aw|--all_wikidata)
      # full wikidata pipeline: version, env, clone, pull, download, index
      show_version
      show_env
      qlever_clone
      #qlever_build
      qlever_pull
      wikidata_download
      wikidata_index
      ;;
    -b|--build)
       qlever_build
       ;;
    -c|--clone)
       qlever_clone
       ;;
    -e|--env)
       show_env
       ;;
    -k|--kill)
       qlever_kill
       ;;
    -p|--pull)
       qlever_pull
       ;;
    --port)
       # fix: abort when the port argument is missing instead of
       # falling through and setting an empty port value
       if [ $# -lt 1 ]
       then
         usage
         exit 1
       fi
       port=$1
       shift
       ;;
    -s|--server)
       qlever_start $port
       ;;
    -wd|--wikidata_download)
       wikidata_download
       ;;
    -wi|--wikidata_index)
       wikidata_index
       ;;
    -v|--version)
       show_version
       ;;
    -t)
      # undocumented option: self test of the timing helpers
      show_timing "testing" "started"
      sleep 2
      show_timing "testing" "finished"
     ;;
  esac
done

logstats

Script to summarize progress and statistics from the qlever index log. Please note that this script uses the German locale for spreadsheet commands such as "RUNDEN". It's unfortunate that spreadsheet programs such as Excel and Numbers use this locale-specific naming by default.

#!/bin/bash
# WF 2022-03-12
# get the relevant log information for the indexer
# $Header: /hd/seel/qlever/RCS/logstats,v 1.2 2022/05/23 06:24:23 wf Exp wf $

# index log written by the wikidata_index step of qleverauto
logfile=wikidata/wikidata.index-log.txt
# start stats.csv with the spreadsheet column headers
printf '%s\n' 'day;time;phase;mill triples;duration;mill triples/h;todo;ETA h' > stats.csv
# normalize the log lines into semicolon separated fields:
#  - spaces and the "<tab>-" marker become / join field separators
#  - thousands separators (commas) and millisecond fractions are stripped
# the awk program then tracks the indexing phases and appends one
# spreadsheet row per finished phase to stats.csv; the expected*
# variables carry durations from a prior run used as ETA estimates
# NOTE(review): the formulas use German locale spreadsheet functions
# (Runden, SUMME) - confirm the target spreadsheet locale
cat $logfile \
	| sed 's/ /;/g' \
	| sed 's/	-//g' \
	| sed 's/,//g' \
	| sed 's/\.[[:digit:]]\+//g' \
| awk -v expectedTriples=17400 -v expectedBoM=900 -v expectedUoM=3200 -v expectedConversion=27300 -v expectedWords=900 '
BEGIN {
  # Field separator
  FS=";"
	# double quote
  quote="\x22"
}
# default extraction from line
# 2022-05-22 17:48:22.564 ...
{
  #print $0
  date=$1
  time=$2
}
# start of Processing phase
# 2022-05-22 17:48:22.564	- INFO:  Processing input triples from /dev/stdin ...
/Processing;input;triples;from/ {
  phase="Processing"
  printStartPhase(date,time,phase,expectedTriples)
  row=3
  next
}
# while processing
# 2022-05-23 00:09:50.846	- INFO:  Input triples processed: 17,400,000,000
/Input;triples;processed:;/{
  triples=$8
  next
}
# Start of byte order Merging
# 2022-05-23 00:10:52.614	- INFO:  Merging partial vocabularies in byte order (internal only) ...
/Merging;partial;vocabularies;in;byte;order/ {
  printrow(date,time,triples,row,phase)
  phase="Byte order merging"
  printStartPhase(date,time,phase,expectedBoM)
  row=5
  next
}
/Words;merged:;/ {
  triples=$7
	next
}
/Words;processed:;/ {
  triples=$7
	next
}
/Merging;partial;vocabularies;in;Unicode;order/ {
  printrow(date,time,triples,row,phase)
  phase="Unicode order merging"
  printStartPhase(date,time,phase,expectedUoM)
  row=7
	next
}
/Converting;triples;from;local;/ {
  printrow(date,time,triples,row,phase)
  phase="Triple conversion"
  printStartPhase(date,time,phase,expectedConversion)
  row=9
}
/Triples;converted:;/ {
  triples=$7
	next
}
/Building;/ {
	printrow(date,time,triples,row,phase)
	phase="Prefix tree"
	printStartPhase(date,time,phase,expectedWords)
	row=11
	next
}
/Computing;maximally/ {
	printrow(date,time,triples,row,phase)
	phase="Compressing prefixes"
	printStartPhase(date,time,phase,expectedTriples)
	row=13
	triples=0
	next
}
/Writing;compressed;vocabulary/ {
	printrow(date,time,triples,row,phase)
	phase="PSO/POS index pair"
	printStartPhase(date,time,phase,expectedTriples)
	row=15
	triples=0
	next
}
/Writing;meta;data;for;PSO/ {
	printrow(date,time,triples,row,phase)
	phase="SPO/SOP index pair"
	printStartPhase(date,time,phase,expectedTriples)
	row=17
	triples=0
	next
}
/Writing;meta;data;for;SPO/ {
	printrow(date,time,triples,row,phase)
	phase="new index pair"
	printStartPhase(date,time,phase,expectedTriples)
	row=19
	triples=0
	next
}
function printStartPhase(date,time,phase,expected) {
  printf("%s;%s;%s;%d;;;\n",date,time,phase,expected)
}
function printrow(date,time,triples,row,phase) {
  printf("%s;%s;%s;%s;=(A%d+B%d)-(A%d+B%d);%s=Runden(D%d/E%d;0)%s;=D%d-D%d;%s=Runden(G%d/F%d;1)%s\n",date,time,phase,triples/1000000,row,row,row-1,row-1,quote,row,row,quote,row-1,row,quote,row,row,quote)
}
END {
  printrow(date,time,triples,row,phase)
  printf(";;total;;=SUMME(E$2:E%d)\n",row)
}
' >> stats.csv
# show the result on the console
cat stats.csv
# open in spreadsheet
open stats.csv