QLever/script

From BITPlan Wiki
Jump to navigation Jump to search

The qleverauto script below is for automation only now. See https://github.com/ad-freiburg/qlever-control for the "official" script for indexing and starting.

This is a script for getting started with QLever along the lines of the Quickstart description

see

This script has been renamed to "qleverauto" from "qlever" on 2022-05-23 since "qlever" is now the name of the official qlever-control script provided with https://github.com/ad-freiburg/qlever-control

usage

./qleverauto -h
usage: ./qleverauto [-h|--help|...]
  -h|--help: show this usage
  -aw|--all_wikidata: run all steps for wikidata version,env,pull,download and index
  -b|--build: build qlever docker image
  -p|--pull: pull qlever docker image
  --port <port> port to server endpoint from, default: 7001
  -s|--server: start SPARQL server
  -c|--clone: clone qlever
  -e|--env: show, check and modify environment
  -v|--version: show version of this script
  -wd|--wikidata_download: download wikidata data dump
  -wi|--wikidata_index: build the index for the  wikidata data dump

This helper script simplifies the access to the steps outlined in https://github.com/ad-freiburg/qlever/blob/master/docs/quickstart.md

qleverauto

#!/bin/bash
#
# a script for getting started with QLever and automatic tasks for
# it 
#
# see https://github.com/ad-freiburg/qlever/blob/master/docs/quickstart.md
# see https://wiki.bitplan.com/index.php/QLever
# see https://github.com/ad-freiburg/qlever-control for the
#   official qlever control script
#
#
# WF 2022-01-28
#

# we assume the script is started from the QLEVER_HOME directory
QLEVER_HOME=$(pwd)
export QLEVER_HOME
# docker image to run; qlever_pull switches this to adfreiburg/qlever
dockerimage="qlever"
# host port the SPARQL server is published on unless --port is given
port=7001
# RCS keywords - these must be single quoted: in double quotes bash expands
# $Revision and $Date as (unset) variables and the keywords get lost
version='$Revision: 1.29 $'
versionDate='$Date: 2022/05/23 06:15:28 $'

# values of $SECONDS recorded by show_timing
startTime=0
finishTime=0

#ansi colors
#http://www.csc.uvic.ca/~sae/seng265/fall04/tips/s265s047-tips/bash-using-colors.html
blue='\033[0;34m'
red='\033[0;31m'
green='\033[0;32m' # '\e[1;32m' is too bright for white bg.
endColor='\033[0m'

#
# print a message wrapped in an ansi color sequence
#   params:
#     1: the escape sequence that switches the color on
#     2: the text to print
#   the text is followed by ${endColor} to reset the terminal color
#
color_msg() {
  local ansi_on="$1" text="$2"
  local painted="${ansi_on}${text}${endColor}"
  # -e so that the \033 escapes stored in the color variables are interpreted
  echo -e "$painted"
}

#
# error
#
#   report a problem in red on stderr and terminate the script
#   with exit code 1
#
#   params:
#     1: the text describing the problem
error() {
  local reason="$1"
  {
    # ansi red marks the message as an error
    color_msg "$red" "Error: $reason"
  } >&2
  exit 1
}

#
# show the usage
#
# prints the help text to stdout; reads the global $port to show the
# default port. The -k|--kill option handled by the option loop is listed
# here as well.
#
usage() {
  cat <<EOF
usage: $0 [-h|--help|...]
  -h|--help: show this usage
  -aw|--all_wikidata: run all steps for wikidata version,env,pull,download and index
  -b|--build: build qlever docker image
  -p|--pull: pull qlever docker image
  --port <port> port to server endpoint from, default: $port
  -s|--server: start SPARQL server
  -c|--clone: clone qlever
  -e|--env: show, check and modify environment
  -k|--kill: kill a running qlever process
  -v|--version: show version of this script
  -wd|--wikidata_download: download wikidata data dump
  -wi|--wikidata_index: build the index for the  wikidata data dump

This helper script simplifies the access to the steps outlined in https://github.com/ad-freiburg/qlever/blob/master/docs/quickstart.md
EOF
}

#
# show the start or the end of an action with a timestamp
#
#   params:
#     1: l_action - description of the action
#     2: l_state  - "started" or "finished"
#
#   "started" remembers $SECONDS in the global startTime, "finished"
#   stores $SECONDS in the global finishTime and appends the elapsed
#   seconds since the last "started" call to the message.
#   Any other state is shown with the timestamp only.
#
show_timing() {
  local l_action="$1"
  local l_state="$2"
  local l_now
  # local and reset on every call so that the duration of an earlier
  # "finished" call can not show up in a later message
  local l_after=""
  l_now=$(date)
  case "$l_state" in
    started)
      startTime=$SECONDS
      ;;
    finished)
      finishTime=$SECONDS
      local l_duration=$(( finishTime - startTime ))
      l_after=" after $l_duration seconds"
      ;;
  esac
  color_msg "$blue" "$l_action $l_state at $l_now$l_after"
}

#
# show the version of this script
#
# prints "<script name> version <version> <versionDate>" in blue using the
# globals version and versionDate
#
show_version() {
  # strip the directory part without word splitting so that a script path
  # containing blanks works too
  local l_script="${0##*/}"
  color_msg "$blue" "$l_script version $version $versionDate"
}

#
# check whether program is installed
#
#  #1: l_prog - the program to check
#
# prints the program name, the location reported by "command -v" (if any)
# and a check mark in green, or the name and a cross in red if the program
# can not be found
#
check_installed() {
  local l_prog="$1"
  local l_installed="✅"
  local l_color="$green"
  local l_progbin
  # command -v is a shell builtin, so the check does not itself depend on
  # the external "which" being installed; declaring the variable before the
  # assignment keeps the exit status of the lookup visible
  if ! l_progbin=$(command -v "$l_prog")
  then
    l_installed="❌"
    l_color="$red"
  fi
  # separate name and location by a blank, omit the blank if nothing was found
  color_msg "$l_color" "$l_prog${l_progbin:+ $l_progbin} $l_installed"
}

#
# show and modify the environment
#
# reports the needed programs, operating system, docker version, memory and
# disk space, then sets the soft limit for open files to 1048576 and shows
# the resulting limit
#
show_env() {
  # decide once whether we are on macOS - the tools differ between the platforms
  local l_uname
  l_uname=$(uname -a)
  local l_on_mac="false"
  case $l_uname in
    Darwin*) l_on_mac="true" ;;
  esac

  local -a l_needed=(docker top df jq)
  local l_diskpattern
  if [ "$l_on_mac" = "true" ]; then
    l_needed+=(sw_vers)
    l_diskpattern="/dev/disk"
  else
    l_needed+=(lsb_release free)
    l_diskpattern="/dev/s"
  fi

  color_msg "$blue" "needed software"
  for l_prog in "${l_needed[@]}"; do
    check_installed "$l_prog"
  done

  color_msg "$blue" "operating system"
  if [ "$l_on_mac" = "true" ]; then
    sw_vers
  else
    lsb_release -a
  fi

  color_msg "$blue" "docker version"
  docker --version

  color_msg "$blue" "memory"
  if [ "$l_on_mac" = "true" ]; then
    top -l 1 | grep PhysMem | cut -f1,2 -d" "
  else
    free -h
  fi

  color_msg "$blue" "diskspace"
  # only show the lines of df that mention the platform's disk devices
  df -h | grep "$l_diskpattern"

  # raise the soft limit first, then report what is in effect
  ulimit -Sn 1048576
  color_msg "$blue" "soft ulimit for files"
  ulimit -Sn
}

#
# check whether the given process runs and kill it if yes
# param #1: l_process  - the pattern to look for in the command lines
#
# The pattern is matched against full command lines (pgrep -f). The matching
# processes are listed, then sent the default signal of kill.
# This script itself is never killed, even if its own command line matches.
#
killIfRunning() {
  local l_process="$1"
  local l_line l_pid
  local -a l_pids=()
  # an empty pattern would match every process
  if [ -z "$l_process" ]
  then
    return 0
  fi
  # "exec" lets pgrep replace the helper shell of the process substitution,
  # so no copy of this script shows up in pgrep's result
  while IFS= read -r l_line
  do
    # the first column of the pgrep -l output is the pid
    l_pid="${l_line%% *}"
    # a pattern like "qlever" also matches "qleverauto" - skip ourselves
    if [ "$l_pid" = "$$" ]
    then
      continue
    fi
    echo "$l_line"
    l_pids+=("$l_pid")
  done < <(exec pgrep -fl "$l_process")
  if [ "${#l_pids[@]}" -gt 0 ]
  then
    color_msg "$blue" "$l_process already running"
    # comment out as you like
    # either stop here
    #echo "to kill the process you might want to use"
    #echo "pkill -f $l_process"
    #exit 1
    # or fully automatic kill
    color_msg "$blue" "killing $l_process"
    kill "${l_pids[@]}"
  fi
}
#
# stop qlever: delegates to killIfRunning with the pattern "qlever"
#
qlever_kill() {
  killIfRunning "qlever"
}

#
# fetch the qlever docker image
#
# switches the global dockerimage to "adfreiburg/qlever" and pulls that
# image, reporting start and end of the pull via show_timing
#
qlever_pull() {
  local l_step="pulling qlever docker image"
  dockerimage="adfreiburg/qlever"
  show_timing "$l_step" "started"
  docker pull "$dockerimage"
  show_timing "$l_step" "finished"
}

#
# start the SPARQL server
#
#   params:
#     1: l_port - host port to publish the server on
#                 (defaults to the global port, or 7001 if that is unset)
#
# runs the image named by the global dockerimage with
# $QLEVER_HOME/qlever-indices/wikidata mounted as /index
#
qlever_start() {
  local l_port="${1:-${port:-7001}}"
  # quoted so that a QLEVER_HOME containing blanks stays a single argument
  docker run --rm -v "$QLEVER_HOME/qlever-indices/wikidata:/index" \
      -p "$l_port:7001" -e INDEX_PREFIX=wikidata --name qlever.wikidata "$dockerimage"
}

#
# clone the qlever code
#
# clones https://github.com/ad-freiburg/qlever including submodules into
# $QLEVER_HOME/qlever-code unless that directory already exists
#
qlever_clone() {
  # without this check a failing cd would clone into the current directory
  cd "$QLEVER_HOME" || error "cannot change to directory $QLEVER_HOME"
  if [ ! -d qlever-code ]
  then
    color_msg "$blue" "cloning qlever - please wait typically 1 min ..."
    show_timing "cloning qlever" "started"
    git clone --recursive https://github.com/ad-freiburg/qlever qlever-code
    show_timing "cloning qlever" "finished"
  else
    color_msg "$green" "clone of qlever-code already available"
  fi
}

#
# build the docker image
#
# builds the image "qlever" from Dockerfiles/Dockerfile.Ubuntu20.04 in
# $QLEVER_HOME/qlever-code
#
qlever_build() {
  # a check whether the image already exists is intentionally disabled:
  #docker images | grep qlever
  #if [ $? -ne 0 ]
  #then
  #  ... build ...
  #else
  #  color_msg $green "qlever image already build"
  #fi
  # stop if the clone is missing instead of building in the wrong directory
  cd "$QLEVER_HOME/qlever-code" || error "cannot change to directory $QLEVER_HOME/qlever-code"
  color_msg "$blue" "building qlever - please wait typically 15 min ..."
  show_timing "docker build" "started"
  # docker build -t qlever .
  docker build --file Dockerfiles/Dockerfile.Ubuntu20.04 -t qlever .
  show_timing "docker build" "finished"
}

#
# generic download
#
# params
#   1: title of the download
#   2: expected time
#   3: target directory (created if it does not exist yet)
#   4: file expected
#   5: url to download from
#
# changes into the target directory and fetches the url with wget unless
# the expected file is already there
#
download() {
  local l_title="$1"
  local l_expected="$2"
  local l_target="$3"
  local l_file="$4"
  local l_url="$5"
  # make sure we really end up in the target directory - otherwise wget
  # would store the file in whatever directory is current
  if [ ! -d "$l_target" ]
  then
    mkdir -p "$l_target" || error "cannot create directory $l_target"
  fi
  cd "$l_target" || error "cannot change to directory $l_target"
  # check if file already exists
  if [ -f "$l_file" ]
  then
    color_msg "$green" "$l_title:$l_file already downloaded"
  else
    color_msg "$blue" "downloading $l_title:$l_file ... please wait typically $l_expected ..."
    show_timing "$l_title download" "started"
    wget "$l_url"
    show_timing "$l_title download" "finished"
  fi
}

#
# wikidata config copy
#
# creates $QLEVER_HOME/qlever-indices/wikidata if needed and copies
# wikidata.settings.json from the examples of the qlever-code clone into it
# unless the file is already there.
# Sets the globals target, config and configpath.
#
wikidata_copyconfig() {
  cd "$QLEVER_HOME" || error "cannot change to directory $QLEVER_HOME"
  target=qlever-indices/wikidata
  if [ ! -d "$target" ]
  then
     color_msg "$blue" "creating $target"
     mkdir -p "$target"
  else
     color_msg "$green" "$target already exists"
  fi
  # stop here if the directory is not usable - cp below copies to "."
  cd "$target" || error "cannot change to directory $target"
  config=wikidata.settings.json
  configpath="$QLEVER_HOME/qlever-code/examples/$config"
  if [ ! -f "$config" ]
  then
    color_msg "$blue" "copying config file $configpath to $target"
    cp -p "$configpath" .
  else
    color_msg "$green" "$config already copied to $target"
  fi
}

#
# wikidata download
#
# downloads the lexemes dump and the full dump from dumps.wikimedia.org
# into $QLEVER_HOME/wikidata using the generic download function.
# Sets the global target.
#
wikidata_download() {
  # no trailing slash here - the file name is appended with "/" below
  local l_base=https://dumps.wikimedia.org/wikidatawiki/entities
  local l_dump=latest-all.ttl.bz2
  local l_lexemes=latest-lexemes.ttl.bz2
  #wikidata_copyconfig
  target="$QLEVER_HOME/wikidata"
  download "wikidata lexemes" "3min" "$target" "$l_lexemes" "$l_base/$l_lexemes"
  download "wikidata dump" "6hours" "$target" "$l_dump" "$l_base/$l_dump"
}

#
# build the wikidata index
#
# works in $QLEVER_HOME/wikidata: makes the directory writable for others,
# sources ../qlever-control/qlever and runs "qlever index", reporting start
# and end via show_timing
#
wikidata_index() {
   # never continue in another directory: chmod below changes "."
   cd "$QLEVER_HOME/wikidata" || error "cannot change to directory $QLEVER_HOME/wikidata"
   chmod o+w .
   show_timing "creating wikidata index" "started"
#   docker run -i --rm -v $QLEVER_HOME/qlever-indices/wikidata:/index --entrypoint bash $dockerimage  -c "cd /index && bzcat latest-all.ttl.bz2 latest-lexemes.ttl.bz2 | IndexBuilderMain -F ttl -f - -l -i wikidata -s wikidata.settings.json | tee wikidata.index-log.txt"
   # NOTE(review): presumably the sourced qlever-control script provides the
   # "qlever" command used below - confirm against qlever-control
   . ../qlever-control/qlever
   check_installed IndexBuilderMain
   qlever index
   show_timing "creating wikidata index" "finished"
}

# commandline options according to usage
# the options are processed from left to right and each one triggers its
# action immediately, so --port has to be given before -s to take effect
while [ $# -gt 0 ]
do
  option="$1"
  shift
  case "$option" in
    -h|--help)
       usage;;
    -aw|--all_wikidata)
      show_version
      show_env
      qlever_clone
      #qlever_build
      qlever_pull
      wikidata_download
      wikidata_index
      ;;
    -b|--build)
       qlever_build
       ;;
    -c|--clone)
       qlever_clone
       ;;
    -e|--env)
       show_env
       ;;
    -k|--kill)
       qlever_kill
       ;;
    -p|--pull)
       qlever_pull
       ;;
    --port)
       # the port value is mandatory - stop instead of going on with an
       # empty port
       if [ $# -lt 1 ]
       then
         usage
         exit 1
       fi
       port="$1"
       shift
       ;;
    -s|--server)
       qlever_start "$port"
       ;;
    -wd|--wikidata_download)
       wikidata_download
       ;;
    -wi|--wikidata_index)
       wikidata_index
       ;;
    -v|--version)
       show_version
       ;;
    -t)
      # undocumented self test of show_timing
      show_timing "testing" "started"
      sleep 2
      show_timing "testing" "finished"
     ;;
    *)
      # report typos instead of silently ignoring them
      usage
      error "unknown option '$option'"
      ;;
  esac
done

logstats

Script to summarize progress and statistics from the QLever index log.

#!/bin/bash
# WF 2022-03-12
# get the relevant log information for the indexer
# $Header: /hd/seel/qlever/RCS/logstats,v 1.2 2022/05/23 06:24:23 wf Exp wf $

# Convert the indexer log into stats.csv: one row per indexing phase with
# spreadsheet formulas for duration, throughput and remaining time.
# The formulas use the function names Runden and SUMME.
logfile=wikidata/wikidata.index-log.txt
# fail with a clear message instead of writing statistics for a missing log
if [ ! -r "$logfile" ]
then
  echo "index log $logfile not found or not readable" 1>&2
  exit 1
fi
# literal tab character for the sed expression below
tab=$(printf '\t')
echo 'day;time;phase;mill triples;duration;mill triples/h;todo;ETA h' > stats.csv
# preprocessing of each log line:
#   1. blanks become the field separator ";"
#   2. the "<tab>-" between timestamp and log level is removed
#   3. thousands separators are removed from the numbers
#   4. fractions (".564") are removed; written without "\+" since that is a
#      GNU extension to basic regular expressions
sed -e 's/ /;/g' \
    -e "s/${tab}-//g" \
    -e 's/,//g' \
    -e 's/\.[[:digit:]][[:digit:]]*//g' \
    "$logfile" \
| awk -v expectedTriples=17400 -v expectedBoM=900 -v expectedUoM=3200 -v expectedConversion=27300 -v expectedWords=900 '
BEGIN {
  # Field separator
  FS=";"
  # double quote (octal escape, "\x" escapes are not portable)
  quote="\042"
}
# default extraction from line
# 2022-05-22 17:48:22.564 ...
{
  #print $0
  date=$1
  time=$2
}
# start of Processing phase
# 2022-05-22 17:48:22.564	- INFO:  Processing input triples from /dev/stdin ...
/Processing;input;triples;from/ {
  phase="Processing"
  printStartPhase(date,time,phase,expectedTriples)
  row=3
  next
}
# while processing
# 2022-05-23 00:09:50.846	- INFO:  Input triples processed: 17,400,000,000
/Input;triples;processed:;/{
  triples=$8
  next
}
# Start of byte order Merging
# 2022-05-23 00:10:52.614	- INFO:  Merging partial vocabularies in byte order (internal only) ...
/Merging;partial;vocabularies;in;byte;order/ {
  printrow(date,time,triples,row,phase)
  phase="Byte order merging"
  printStartPhase(date,time,phase,expectedBoM)
  row=5
  next
}
/Words;merged:;/ {
  triples=$7
  next
}
/Words;processed:;/ {
  triples=$7
  next
}
/Merging;partial;vocabularies;in;Unicode;order/ {
  printrow(date,time,triples,row,phase)
  phase="Unicode order merging"
  printStartPhase(date,time,phase,expectedUoM)
  row=7
  next
}
/Converting;triples;from;local;/ {
  printrow(date,time,triples,row,phase)
  phase="Triple conversion"
  printStartPhase(date,time,phase,expectedConversion)
  row=9
  # like all other phase starts: this line is done
  next
}
/Triples;converted:;/ {
  triples=$7
  next
}
/Building;/ {
  printrow(date,time,triples,row,phase)
  phase="Prefix tree"
  printStartPhase(date,time,phase,expectedWords)
  row=11
  next
}
/Computing;maximally/ {
  printrow(date,time,triples,row,phase)
  phase="Compressing prefixes"
  printStartPhase(date,time,phase,expectedTriples)
  row=13
  triples=0
  next
}
/Writing;compressed;vocabulary/ {
  printrow(date,time,triples,row,phase)
  phase="PSO/POS index pair"
  printStartPhase(date,time,phase,expectedTriples)
  row=15
  triples=0
  next
}
/Writing;meta;data;for;PSO/ {
  printrow(date,time,triples,row,phase)
  phase="SPO/SOP index pair"
  printStartPhase(date,time,phase,expectedTriples)
  row=17
  triples=0
  next
}
/Writing;meta;data;for;SPO/ {
  printrow(date,time,triples,row,phase)
  phase="new index pair"
  printStartPhase(date,time,phase,expectedTriples)
  row=19
  triples=0
  next
}
# first csv row of a phase: start timestamp and the expected amount
function printStartPhase(date,time,phase,expected) {
  printf("%s;%s;%s;%d;;;\n",date,time,phase,expected)
}
# second csv row of a phase: timestamp and amount reached so far plus the
# formulas referring to this row and the row before it
function printrow(date,time,triples,row,phase) {
  printf("%s;%s;%s;%s;=(A%d+B%d)-(A%d+B%d);%s=Runden(D%d/E%d;0)%s;=D%d-D%d;%s=Runden(G%d/F%d;1)%s\n",date,time,phase,triples/1000000,row,row,row-1,row-1,quote,row,row,quote,row-1,row,quote,row,row,quote)
}
END {
  printrow(date,time,triples,row,phase)
  printf(";;total;;=SUMME(E$2:E%d)\n",row)
}
' >> stats.csv
cat stats.csv
# open in spreadsheet
if command -v open > /dev/null
then
  open stats.csv
elif command -v xdg-open > /dev/null
then
  # desktop systems without the "open" command
  xdg-open stats.csv
fi