QLever/script
The qleverauto script below is now used for automation only. See https://github.com/ad-freiburg/qlever-control for the "official" script for indexing and starting.
This is a script for getting started with QLever along the lines of the Quickstart description at https://github.com/ad-freiburg/qlever/blob/master/docs/quickstart.md.
This script was renamed from "qlever" to "qleverauto" on 2022-05-23, since "qlever" is now the name of the official qlever-control script provided at https://github.com/ad-freiburg/qlever-control.
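As a rough sketch of what using the official qlever-control script looks like: the source line and "qlever index" below mirror the wikidata_index function further down; "qlever start" is assumed from the qlever-control documentation of that time and may differ in your version, so check its README.

# sketch only: use the official qlever-control script instead of qleverauto
# (assumes QLEVER_HOME is set and qlever-control is checked out next to qlever-code)
cd $QLEVER_HOME/wikidata
# make the "qlever" command available in this shell
. ../qlever-control/qlever
# build the index and start the SPARQL server via the official script
qlever index
qlever start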
usage
./qleverauto -h
usage: ./qleverauto [-h|--help|...]
-h|--help: show this usage
-aw|--all_wikidata: run all steps for wikidata (version, env, pull, download and index)
-b|--build: build qlever docker image
-p|--pull: pull qlever docker image
--port <port>: port to serve the SPARQL endpoint from, default: 7001
-s|--server: start SPARQL server
-c|--clone: clone qlever
-e|--env: show, check and modify environment
-v|--version: show version of this script
-wd|--wikidata_download: download wikidata data dump
-wi|--wikidata_index: build the index for the wikidata data dump
This helper script simplifies access to the steps outlined in https://github.com/ad-freiburg/qlever/blob/master/docs/quickstart.md
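For example, a typical first run could look like the following sketch (port 7007 is just an arbitrary example value; the script is assumed to be executable and started from the intended QLEVER_HOME directory):

# check the required tools, memory and disk space first
./qleverauto --env
# run the whole wikidata pipeline: version, env, pull, download and index
./qleverauto --all_wikidata
# later, start the SPARQL server on a non-default port
./qleverauto --port 7007 --server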
qleverauto
#!/bin/bash
#
# a script for getting started with QLever and automatic tasks for
# it
#
# see https://github.com/ad-freiburg/qlever/blob/master/docs/quickstart.md
# see https://wiki.bitplan.com/index.php/QLever
# see https://github.com/ad-freiburg/qlever-control for the
# official qlever control script
#
#
# WF 2022-01-28
#
# we assume the script is started from the QLEVER_HOME directory
export QLEVER_HOME=$(pwd)
dockerimage="qlever"
port=7001
version="$Revision: 1.29 $"
versionDate="$Date: 2022/05/23 06:15:28 $"
startTime=0
finishTime=0
#ansi colors
#http://www.csc.uvic.ca/~sae/seng265/fall04/tips/s265s047-tips/bash-using-colors.html
blue='\033[0;34m'
red='\033[0;31m'
green='\033[0;32m' # '\e[1;32m' is too bright for white bg.
endColor='\033[0m'
#
# a colored message
# params:
# 1: l_color - the color of the message
# 2: l_msg - the message to display
#
color_msg() {
local l_color="$1"
local l_msg="$2"
echo -e "${l_color}$l_msg${endColor}"
}
#
# error
#
# show an error message and exit
#
# params:
# 1: l_msg - the message to display
error() {
local l_msg="$1"
# use ansi red for error
color_msg $red "Error: $l_msg" 1>&2
exit 1
}
#
# show the usage
#
usage() {
echo "usage: $0 [-h|--help|...]"
echo " -h|--help: show this usage"
echo " -aw|--all_wikidata: run all steps for wikidata version,env,pull,download and index"
echo " -b|--build: build qlever docker image"
echo " -p|--pull: pull qlever docker image"
echo " --port <port> port to server endpoint from, default: $port"
echo " -s|--server: start SPARQL server"
echo " -c|--clone: clone qlever"
echo " -e|--env: show, check and modify environment"
echo " -v|--version: show version of this script"
echo " -wd|--wikidata_download: download wikidata data dump"
echo " -wi|--wikidata_index: build the index for the wikidata data dump"
echo ""
echo "This helper script simplifies the access to the steps outlined in https://github.com/ad-freiburg/qlever/blob/master/docs/quickstart.md"
}
#
# show the start of an action
#
show_timing() {
local l_action="$1"
local l_state="$2"
now=$(date)
case $l_state in
started)
startTime=$SECONDS
after=""
;;
finished)
finishTime=$SECONDS
local l_duration=$(( $finishTime - $startTime ))
after=" after $l_duration seconds"
;;
esac
color_msg $blue "$l_action $l_state at $now$after"
}
#
# show the version of this script
#
show_version() {
local l_script=$(basename $0)
color_msg $blue "$l_script version $version $versionDate"
}
#
# check whether program is installed
#
# param #1: l_prog - the program to check
#
check_installed() {
local l_prog="$1"
local l_installed="✅"
local l_color=$green
local l_progbin=$(which $l_prog)
which $l_prog > /dev/null
if [ $? -ne 0 ]
then
l_installed="❌"
l_color=$red
fi
color_msg $l_color "$l_prog → $l_progbin $l_installed"
}
#
# show and modify the environment
#
show_env() {
local l_progs="docker top df jq"
case $(uname -a) in
Darwin*)
l_progs="$l_progs sw_vers"
;;
*)
l_progs="$l_progs lsb_release free"
;;
esac
color_msg $blue "needed software"
for l_prog in $l_progs
do
check_installed $l_prog
done
color_msg $blue "operating system"
local l_disks="/dev/s"
case $(uname -a) in
Darwin*)
l_disks="/dev/disk"
sw_vers;;
*)
lsb_release -a
esac
color_msg $blue "docker version"
docker --version
color_msg $blue "memory"
case $(uname -a) in
Darwin*)
top -l 1 | grep PhysMem | cut -f1,2 -d" "
;;
*) free -h
;;
esac
color_msg $blue "diskspace"
df -h | grep $l_disks
ulimit -Sn 1048576
color_msg $blue "soft ulimit for files"
ulimit -Sn
}
#
# check whether the given process runs and kill it if yes
# param #1: l_process - the process to check
#
killIfRunning() {
local l_process="$1"
pgrep -fl "$l_process"
local l_presult=$?
if [ $l_presult -eq 0 ]
then
color_msg "$l_process already running"
# comment out as you like
# either stop here
#echo "to kill the process you might want to use"
#echo "pkill -f $l_process"
#exit 1
# or fully automatic kill
color_msg "killing $l_process"
pkill -f "$l_process"
fi
}
#
# kill a running qlever process
#
qlever_kill() {
killIfRunning qlever
}
#
# pull the qlever image
#
qlever_pull() {
dockerimage="adfreiburg/qlever"
show_timing "pulling qlever docker image" "started"
docker pull $dockerimage
show_timing "pulling qlever docker image" "finished"
}
#
# start the SPARQL server
#
qlever_start() {
local l_port="$1"
docker run --rm -v $QLEVER_HOME/qlever-indices/wikidata:/index \
-p $l_port:7001 -e INDEX_PREFIX=wikidata --name qlever.wikidata $dockerimage
}
#
# clone the qlever code
#
qlever_clone() {
cd $QLEVER_HOME
if [ ! -d qlever-code ]
then
color_msg $blue "cloning qlever - please wait typically 1 min ..."
show_timing "cloning qlever" "started"
git clone --recursive https://github.com/ad-freiburg/qlever qlever-code
show_timing "cloning qlever" "finished"
else
color_msg $green "clone of clever-code already available"
fi
}
#
# build the docker image
#
qlever_build() {
#docker images | grep qlever
#if [ $? -ne 0 ]
#then
cd $QLEVER_HOME/qlever-code
color_msg $blue "building qlever - please wait typically 15 min ..."
show_timing "docker build" "started"
# docker build -t qlever .
docker build --file Dockerfiles/Dockerfile.Ubuntu20.04 -t qlever .
show_timing "docker build" "finished"
#else
# color_msg $green "qlever image already built"
#fi
}
#
# generic download
#
# params
# 1: title of the download
# 2: expected time
# 3: target directory
# 4: file expected
# 5: url to download from
#
download() {
local l_title="$1"
local l_expected="$2"
local l_target="$3"
local l_file="$4"
local l_url="$5"
# check if file already exists
cd $l_target
if [ -f $l_file ]
then
color_msg $green "$l_title:$l_file already downloaded"
else
color_msg $blue "downloading $l_title:$l_file ... please wait typically $l_expected ..."
show_timing "$l_title download" "started"
wget $l_url
show_timing "$l_title download" "finished"
fi
}
#
# wikidata config copy
#
wikidata_copyconfig() {
cd $QLEVER_HOME
target=qlever-indices/wikidata
if [ ! -d $target ]
then
color_msg $blue "creating $target"
mkdir -p $target
else
color_msg $green "$target already exists"
fi
cd $target
config=wikidata.settings.json
configpath=$QLEVER_HOME/qlever-code/examples/$config
if [ ! -f $config ]
then
color_msg $blue "copying config file $configpath to $target"
cp -p $configpath .
else
color_msg $green "$config already copied to $target"
fi
}
#
# wikidata download
#
wikidata_download() {
local l_base=https://dumps.wikimedia.org/wikidatawiki/entities
local l_dump=latest-all.ttl.bz2
local l_lexemes=latest-lexemes.ttl.bz2
#wikidata_copyconfig
target=$QLEVER_HOME/wikidata
download "wikidata lexemes" "3min" $target $l_lexemes $l_base/$l_lexemes
download "wikidata dump" "6hours" $target $l_dump $l_base/$l_dump
}
#
# build the wikidata index
#
wikidata_index() {
cd $QLEVER_HOME/wikidata
chmod o+w .
show_timing "creating wikidata index" "started"
# docker run -i --rm -v $QLEVER_HOME/qlever-indices/wikidata:/index --entrypoint bash $dockerimage -c "cd /index && bzcat latest-all.ttl.bz2 latest-lexemes.ttl.bz2 | IndexBuilderMain -F ttl -f - -l -i wikidata -s wikidata.settings.json | tee wikidata.index-log.txt"
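# instead of the docker based index build above, source the official
# qlever-control script (assumed to be checked out as $QLEVER_HOME/qlever-control)
# so that the "qlever" command becomes available, and delegate the build to it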
. ../qlever-control/qlever
check_installed IndexBuilderMain
qlever index
show_timing "creating wikidata index" "finished"
}
# commandline options according to usage
while [ "$1" != "" ]
do
option=$1
shift
case $option in
-h|--help)
usage;;
-aw|--all_wikidata)
show_version
show_env
qlever_clone
#qlever_build
qlever_pull
wikidata_download
wikidata_index
;;
-b|--build)
qlever_build
;;
-c|--clone)
qlever_clone
;;
-e|--env)
show_env
;;
-k|--kill)
qlever_kill
;;
-p|--pull)
qlever_pull
;;
--port)
if [ $# -lt 1 ]
then
usage
exit 1
fi
port=$1
shift
;;
-s|--server)
qlever_start $port
;;
-wd|--wikidata_download)
wikidata_download
;;
-wi|--wikidata_index)
wikidata_index
;;
-v|--version)
show_version
;;
-t)
show_timing "testing" "started"
sleep 2
show_timing "testing" "finished"
;;
esac
done
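Once the SPARQL server has been started with "-s|--server", it can be queried over HTTP. A minimal sketch, assuming the endpoint listens on localhost:7001 (the default port mapped by qlever_start) and answers standard SPARQL protocol requests; adjust host and port to your setup:

# ask the freshly started endpoint for a few triples
curl -s http://localhost:7001 \
  -H "Accept: application/sparql-results+json" \
  --data-urlencode "query=SELECT * WHERE { ?s ?p ?o } LIMIT 3"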
logstats
A script to summarize progress and statistics from the QLever index log. Please note that this script uses the German locale for spreadsheet functions such as "RUNDEN". It is unfortunate that spreadsheet programs such as Excel and Numbers use this locale-specific naming by default; a small conversion sketch for English-locale spreadsheets follows after the script.
#!/bin/bash
# WF 2022-03-12
# get the relevant log information for the indexer
# $Header: /hd/seel/qlever/RCS/logstats,v 1.2 2022/05/23 06:24:23 wf Exp wf $
logfile=wikidata/wikidata.index-log.txt
echo 'day;time;phase;mill triples;duration;mill triples/h;todo;ETA h' > stats.csv
cat $logfile \
| sed 's/ /;/g' \
| sed 's/ -//g' \
| sed 's/,//g' \
| sed 's/\.[[:digit:]]\+//g' \
| awk -v expectedTriples=17400 -v expectedBoM=900 -v expectedUoM=3200 -v expectedConversion=27300 -v expectedWords=900 '
BEGIN {
# Field separator
FS=";"
# double quote
quote="\x22"
}
# default extraction from line
# 2022-05-22 17:48:22.564 ...
{
#print $0
date=$1
time=$2
}
# start of Processing phase
# 2022-05-22 17:48:22.564 - INFO: Processing input triples from /dev/stdin ...
/Processing;input;triples;from/ {
phase="Processing"
printStartPhase(date,time,phase,expectedTriples)
row=3
next
}
# while processing
# 2022-05-23 00:09:50.846 - INFO: Input triples processed: 17,400,000,000
/Input;triples;processed:;/{
triples=$8
next
}
# Start of byte order Merging
# 2022-05-23 00:10:52.614 - INFO: Merging partial vocabularies in byte order (internal only) ...
/Merging;partial;vocabularies;in;byte;order/ {
printrow(date,time,triples,row,phase)
phase="Byte order merging"
printStartPhase(date,time,phase,expectedBoM)
row=5
next
}
/Words;merged:;/ {
triples=$7
next
}
/Words;processed:;/ {
triples=$7
next
}
/Merging;partial;vocabularies;in;Unicode;order/ {
printrow(date,time,triples,row,phase)
phase="Unicode order merging"
printStartPhase(date,time,phase,expectedUoM)
row=7
next
}
/Converting;triples;from;local;/ {
printrow(date,time,triples,row,phase)
phase="Triple conversion"
printStartPhase(date,time,phase,expectedConversion)
row=9
}
/Triples;converted:;/ {
triples=$7
next
}
/Building;/ {
printrow(date,time,triples,row,phase)
phase="Prefix tree"
printStartPhase(date,time,phase,expectedWords)
row=11
next
}
/Computing;maximally/ {
printrow(date,time,triples,row,phase)
phase="Compressing prefixes"
printStartPhase(date,time,phase,expectedTriples)
row=13
triples=0
next
}
/Writing;compressed;vocabulary/ {
printrow(date,time,triples,row,phase)
phase="PSO/POS index pair"
printStartPhase(date,time,phase,expectedTriples)
row=15
triples=0
next
}
/Writing;meta;data;for;PSO/ {
printrow(date,time,triples,row,phase)
phase="SPO/SOP index pair"
printStartPhase(date,time,phase,expectedTriples)
row=17
triples=0
next
}
/Writing;meta;data;for;SPO/ {
printrow(date,time,triples,row,phase)
phase="new index pair"
printStartPhase(date,time,phase,expectedTriples)
row=19
triples=0
next
}
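# write a phase-start row: date, time, phase name and the expected amount of
# work for that phase in million triples/words; the remaining columns stay empty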
function printStartPhase(date,time,phase,expected) {
printf("%s;%s;%s;%d;;;\n",date,time,phase,expected)
}
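# write a progress row for the finished phase; the embedded spreadsheet formulas
# compute the duration as the date+time difference to the previous (phase-start)
# row, the throughput ("mill triples/h"), the remaining work (todo) and the ETA in hours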
function printrow(date,time,triples,row,phase) {
printf("%s;%s;%s;%s;=(A%d+B%d)-(A%d+B%d);%s=Runden(D%d/E%d;0)%s;=D%d-D%d;%s=Runden(G%d/F%d;1)%s\n",date,time,phase,triples/1000000,row,row,row-1,row-1,quote,row,row,quote,row-1,row,quote,row,row,quote)
}
END {
printrow(date,time,triples,row,phase)
printf(";;total;;=SUMME(E$2:E%d)\n",row)
}
' >> stats.csv
cat stats.csv
# open in spreadsheet
open stats.csv
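If your spreadsheet runs with an English locale, the generated formulas will not be recognized as-is. An illustrative conversion of the function names only (depending on the locale, the formula argument separator may need to be changed as well):

# illustrative only: translate the German spreadsheet function names in stats.csv
# to their English equivalents (Runden -> ROUND, SUMME -> SUM), keeping a backup
sed -i.bak -e 's/Runden(/ROUND(/g' -e 's/SUMME(/SUM(/g' stats.csv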