Difference between revisions of "QLever/script"
Jump to navigation
Jump to search
Line 428: | Line 428: | ||
esac | esac | ||
done | done | ||
+ | </source> | ||
+ | = logstats = | ||
+ | script to summarize progress and statistics from qlever index log | ||
+ | <source lang='bash'> | ||
+ | #!/bin/bash | ||
+ | # WF 2022-03-12 | ||
+ | # get the relevant log information for the indexer | ||
+ | # $Header: /hd/seel/qlever/RCS/logstats,v 1.2 2022/05/23 06:24:23 wf Exp wf $ | ||
+ | |||
+ | logfile=wikidata/wikidata.index-log.txt | ||
+ | echo 'day;time;phase;mill triples;duration;mill triples/h;todo;ETA h' > stats.csv | ||
+ | cat $logfile \ | ||
+ | | sed 's/ /;/g' \ | ||
+ | | sed 's/ -//g' \ | ||
+ | | sed 's/,//g' \ | ||
+ | | sed 's/\.[[:digit:]]\+//g' \ | ||
+ | | awk -v expectedTriples=17400 -v expectedBoM=900 -v expectedUoM=3200 -v expectedConversion=27300 -v expectedWords=900 ' | ||
+ | BEGIN { | ||
+ | # Field separator | ||
+ | FS=";" | ||
+ | # double quote | ||
+ | quote="\x22" | ||
+ | } | ||
+ | # default extraction from line | ||
+ | # 2022-05-22 17:48:22.564 ... | ||
+ | { | ||
+ | #print $0 | ||
+ | date=$1 | ||
+ | time=$2 | ||
+ | } | ||
+ | # start of Processing phase | ||
+ | # 2022-05-22 17:48:22.564 - INFO: Processing input triples from /dev/stdin ... | ||
+ | /Processing;input;triples;from/ { | ||
+ | phase="Processing" | ||
+ | printStartPhase(date,time,phase,expectedTriples) | ||
+ | row=3 | ||
+ | next | ||
+ | } | ||
+ | # while processing | ||
+ | # 2022-05-23 00:09:50.846 - INFO: Input triples processed: 17,400,000,000 | ||
+ | /Input;triples;processed:;/{ | ||
+ | triples=$8 | ||
+ | next | ||
+ | } | ||
+ | # Start of byte order Merging | ||
+ | # 2022-05-23 00:10:52.614 - INFO: Merging partial vocabularies in byte order (internal only) ... | ||
+ | /Merging;partial;vocabularies;in;byte;order/ { | ||
+ | printrow(date,time,triples,row,phase) | ||
+ | phase="Byte order merging" | ||
+ | printStartPhase(date,time,phase,expectedBoM) | ||
+ | row=5 | ||
+ | next | ||
+ | } | ||
+ | /Words;merged:;/ { | ||
+ | triples=$7 | ||
+ | next | ||
+ | } | ||
+ | /Words;processed:;/ { | ||
+ | triples=$7 | ||
+ | next | ||
+ | } | ||
+ | /Merging;partial;vocabularies;in;Unicode;order/ { | ||
+ | printrow(date,time,triples,row,phase) | ||
+ | phase="Unicode order merging" | ||
+ | printStartPhase(date,time,phase,expectedUoM) | ||
+ | row=7 | ||
+ | next | ||
+ | } | ||
+ | /Converting;triples;from;local;/ { | ||
+ | printrow(date,time,triples,row,phase) | ||
+ | phase="Triple conversion" | ||
+ | printStartPhase(date,time,phase,expectedConversion) | ||
+ | row=9 | ||
+ | } | ||
+ | /Triples;converted:;/ { | ||
+ | triples=$7 | ||
+ | next | ||
+ | } | ||
+ | /Building;/ { | ||
+ | printrow(date,time,triples,row,phase) | ||
+ | phase="Prefix tree" | ||
+ | printStartPhase(date,time,phase,expectedWords) | ||
+ | row=11 | ||
+ | next | ||
+ | } | ||
+ | /Computing;maximally/ { | ||
+ | printrow(date,time,triples,row,phase) | ||
+ | phase="Compressing prefixes" | ||
+ | printStartPhase(date,time,phase,expectedTriples) | ||
+ | row=13 | ||
+ | triples=0 | ||
+ | next | ||
+ | } | ||
+ | /Writing;compressed;vocabulary/ { | ||
+ | printrow(date,time,triples,row,phase) | ||
+ | phase="PSO/POS index pair" | ||
+ | printStartPhase(date,time,phase,expectedTriples) | ||
+ | row=15 | ||
+ | triples=0 | ||
+ | next | ||
+ | } | ||
+ | /Writing;meta;data;for;PSO/ { | ||
+ | printrow(date,time,triples,row,phase) | ||
+ | phase="SPO/SOP index pair" | ||
+ | printStartPhase(date,time,phase,expectedTriples) | ||
+ | row=17 | ||
+ | triples=0 | ||
+ | next | ||
+ | } | ||
+ | /Writing;meta;data;for;SPO/ { | ||
+ | printrow(date,time,triples,row,phase) | ||
+ | phase="new index pair" | ||
+ | printStartPhase(date,time,phase,expectedTriples) | ||
+ | row=19 | ||
+ | triples=0 | ||
+ | next | ||
+ | } | ||
+ | function printStartPhase(date,time,phase,expected) { | ||
+ | printf("%s;%s;%s;%d;;;\n",date,time,phase,expected) | ||
+ | } | ||
+ | function printrow(date,time,triples,row,phase) { | ||
+ | printf("%s;%s;%s;%s;=(A%d+B%d)-(A%d+B%d);%s=Runden(D%d/E%d;0)%s;=D%d-D%d;%s=Runden(G%d/F%d;1)%s\n",date,time,phase,triples/1000000,row,row,row-1,row-1,quote,row,row,quote,row-1,row,quote,row,row,quote) | ||
+ | } | ||
+ | END { | ||
+ | printrow(date,time,triples,row,phase) | ||
+ | printf(";;total;;=SUMME(E$2:E%d)\n",row) | ||
+ | } | ||
+ | ' >> stats.csv | ||
+ | cat stats.csv | ||
+ | # open in spreadsheet | ||
+ | open stats.csv | ||
</source> | </source> |
Revision as of 07:24, 23 May 2022
The qleverauto script below is for automation only now. See https://github.com/ad-freiburg/qlever-control for the "official" script for indexing and starting.
This is a script for getting started with QLever along the lines of the Quickstart description
see
This script has been renamed to "qleverauto" from "qlever" on 2022-05-23 since "qlever" is now the name of the official qlever-control script provided with https://github.com/ad-freiburg/qlever-control
usage
./qleverauto -h
usage: ./qleverauto [-h|--help|...]
-h|--help: show this usage
-aw|--all_wikidata: run all steps for wikidata version,env,pull,download and index
-b|--build: build qlever docker image
-p|--pull: pull qlever docker image
--port <port> port to server endpoint from, default: 7001
-s|--server: start SPARQL server
-c|--clone: clone qlever
-e|--env: show, check and modify environment
-v|--version: show version of this script
-wd|--wikidata_download: download wikidata data dump
-wi|--wikidata_index: build the index for the wikidata data dump
This helper script simplifies the access to the steps outlined in https://github.com/ad-freiburg/qlever/blob/master/docs/quickstart.md
qleverauto
#!/bin/bash
#
# a script for getting started with QLever and automatic tasks for
# it
#
# see https://github.com/ad-freiburg/qlever/blob/master/docs/quickstart.md
# see https://wiki.bitplan.com/index.php/QLever
# see https://github.com/ad-freiburg/qlever-control for the
# official qlever control script
#
#
# WF 2022-01-28
#
# we assume the script is started from the QLEVER_HOME directory
# we assume the script is started from the QLEVER_HOME directory
export QLEVER_HOME=$(pwd)
# docker image to use; qlever_pull overwrites this with the official hub image
dockerimage="qlever"
# default host port for the SPARQL server endpoint (see --port option)
port=7001
# RCS keywords - expanded on check-in
version="$Revision: 1.29 $"
versionDate="$Date: 2022/05/23 06:15:28 $"
# timing state shared between the "started" and "finished" calls of show_timing
startTime=0
finishTime=0
#ansi colors
#http://www.csc.uvic.ca/~sae/seng265/fall04/tips/s265s047-tips/bash-using-colors.html
blue='\033[0;34m'
red='\033[0;31m'
green='\033[0;32m' # '\e[1;32m' is too bright for white bg.
endColor='\033[0m'
#
# a colored message
# params:
# 1: l_color - the color of the message
# 2: l_msg - the message to display
#
color_msg() {
  # show a message wrapped in the given ANSI color escape sequence
  # params:
  #   1: the color escape sequence (e.g. $blue)
  #   2: the message text
  echo -e "${1}${2}${endColor}"
}
#
# error
#
# show an error message and exit
#
# params:
# 1: l_msg - the message to display
error() {
  # print the given message in ansi red on stderr and abort the script
  # params:
  #   1: the error message to display
  color_msg $red "Error: $1" 1>&2
  exit 1
}
#
# show the usage
#
usage() {
  # print the command line help of this script to stdout
  # NOTE(review): the -k|--kill option handled in the main loop is not listed here
  cat <<EOF
usage: $0 [-h|--help|...]
 -h|--help: show this usage
 -aw|--all_wikidata: run all steps for wikidata version,env,pull,download and index
 -b|--build: build qlever docker image
 -p|--pull: pull qlever docker image
 --port <port> port to server endpoint from, default: $port
 -s|--server: start SPARQL server
 -c|--clone: clone qlever
 -e|--env: show, check and modify environment
 -v|--version: show version of this script
 -wd|--wikidata_download: download wikidata data dump
 -wi|--wikidata_index: build the index for the wikidata data dump

This helper script simplifies the access to the steps outlined in https://github.com/ad-freiburg/qlever/blob/master/docs/quickstart.md
EOF
}
#
# show the start of an action
#
show_timing() {
  # show the start or end of a timed action
  # params:
  #   1: l_action - description of the action being timed
  #   2: l_state  - "started" or "finished"
  # globals: startTime/finishTime keep $SECONDS across the two calls
  # so that the duration can be computed on "finished"
  local l_action="$1"
  local l_state="$2"
  # fix: keep now/after local - they were leaking into the global scope
  # and a stale 'after' could be reused on an unknown state
  local now
  local after=""
  now=$(date)
  case $l_state in
    started)
      startTime=$SECONDS
      ;;
    finished)
      finishTime=$SECONDS
      # duration in seconds between the started and the finished call
      local l_duration=$(( finishTime - startTime ))
      after=" after $l_duration seconds"
      ;;
  esac
  color_msg $blue "$l_action $l_state at $now$after"
}
#
# show the version of this script
#
show_version() {
  # display the script name together with its RCS revision and date
  local l_script
  l_script=$(basename $0)
  color_msg $blue "$l_script version $version $versionDate"
}
#
# check whether program is installed
#
# #1: l_prog - the program to check
#
check_installed() {
  # check whether the given program is installed and show the result
  # params:
  #   1: l_prog - the program to check
  # output: "<prog> → <path> ✅" in green or "<prog> →  ❌" in red
  local l_prog="$1"
  local l_installed="✅"
  local l_color=$green
  local l_progbin
  # fix: a single 'command -v' call replaces the two 'which' invocations;
  # it yields both the resolved path and the availability status and is
  # the portable POSIX way to test for a command
  if ! l_progbin=$(command -v "$l_prog")
  then
    l_installed="❌"
    l_color=$red
  fi
  color_msg $l_color "$l_prog → $l_progbin $l_installed"
}
#
# show and modify the environment
#
show_env() {
  # show, check and modify the environment:
  # verify that the needed tools are installed, show OS / docker /
  # memory / disk information and raise the soft open-file ulimit
  # which the index build needs
  local l_progs="docker top df jq"
  # add the platform specific tools to be checked
  case $(uname -a) in
    Darwin*)
      l_progs="$l_progs sw_vers"
      ;;
    *)
      l_progs="$l_progs lsb_release free"
      ;;
  esac
  color_msg $blue "needed software"
  for l_prog in $l_progs
  do
    check_installed $l_prog
  done
  color_msg $blue "operating system"
  # device name prefix to grep for in the df output below
  local l_disks="/dev/s"
  case $(uname -a) in
    Darwin*)
      l_disks="/dev/disk"
      sw_vers;;
    *)
      lsb_release -a
  esac
  color_msg $blue "docker version"
  docker --version
  color_msg $blue "memory"
  case $(uname -a) in
    Darwin*)
      top -l 1 | grep PhysMem | cut -f1,2 -d" "
      ;;
    *) free -h
      ;;
  esac
  color_msg $blue "diskspace"
  df -h | grep $l_disks
  # the indexer opens a large number of files at once
  ulimit -Sn 1048576
  color_msg $blue "soft ulimit for files"
  ulimit -Sn
}
#
# check whether the given process runs and kill it if yes
# param #1: l_process - the process to check
#
killIfRunning() {
  # check whether the given process runs and kill it if yes
  # params:
  #   1: l_process - pattern matched against full process command lines
  local l_process="$1"
  # list matching processes; pgrep returns 0 if there was at least one match
  pgrep -fl "$l_process"
  local l_presult=$?
  if [ $l_presult -eq 0 ]
  then
    # fix: color_msg expects the color as its first parameter -
    # it was missing here and in the "killing" message below
    color_msg $blue "$l_process already running"
    # comment out as you like
    # either stop here
    #echo "to kill the process you might want to use"
    #echo "pkill -f $l_process"
    #exit 1
    # or fully automatic kill
    color_msg $blue "killing $l_process"
    pkill -f "$l_process"
  fi
}
#
# kill a running qlever process
#
qlever_kill() {
  # kill a running qlever process (delegates to killIfRunning)
  killIfRunning qlever
}
#
# pull the qlever image
#
qlever_pull() {
  # pull the prebuilt qlever image from docker hub
  # side effect: overwrites the global dockerimage variable so that
  # later commands (e.g. -s|--server) use the pulled image
  dockerimage="adfreiburg/qlever"
  show_timing "pulling qlever docker image" "started"
  docker pull $dockerimage
  show_timing "pulling qlever docker image" "finished"
}
#
# start the SPARQL server
#
qlever_start() {
  # start the SPARQL server for the wikidata index as a docker container
  # params:
  #   1: l_port - the host port to publish the endpoint on
  # NOTE(review): the container side is fixed to 7001 - presumably the
  # image's server listens there by default; confirm against the image docs
  local l_port="$1"
  docker run --rm -v $QLEVER_HOME/qlever-indices/wikidata:/index \
    -p $l_port:7001 -e INDEX_PREFIX=wikidata --name qlever.wikidata $dockerimage
}
#
# clone the qlever code
#
qlever_clone() {
  # clone the qlever sources into $QLEVER_HOME/qlever-code (only once)
  # fix: quote and guard the cd so a bad QLEVER_HOME does not make us
  # clone into an arbitrary directory
  cd "$QLEVER_HOME" || return 1
  if [ ! -d qlever-code ]
  then
    color_msg $blue "cloning qlever - please wait typically 1 min ..."
    show_timing "cloning qlever" "started"
    git clone --recursive https://github.com/ad-freiburg/qlever qlever-code
    show_timing "cloning qlever" "finished"
  else
    # fix: the directory is called qlever-code (message said 'clever-code')
    color_msg $green "clone of qlever-code already available"
  fi
}
#
# build the docker image
#
qlever_build() {
  # build the qlever docker image from the cloned sources
  # (takes ~15 min; qlever_pull is the faster alternative)
  #docker images | grep qlever
  #if [ $? -ne 0 ]
  #then
  cd $QLEVER_HOME/qlever-code
  color_msg $blue "building qlever - please wait typically 15 min ..."
  show_timing "docker build" "started"
  # docker build -t qlever .
  docker build --file Dockerfiles/Dockerfile.Ubuntu20.04 -t qlever .
  show_timing "docker build" "finished"
  #else
  # color_msg $green "qlever image already build"
  #fi
}
#
# generic download
#
# params
# 1: title of the download
# 2: expected time
# 3: target directory
# 4: file expected
# 5: url to download from
#
download() {
  # generic download helper - skips the download if the file exists
  # params:
  #   1: l_title    - title of the download (for messages)
  #   2: l_expected - expected duration (for messages)
  #   3: l_target   - target directory
  #   4: l_file     - file expected after the download
  #   5: l_url      - url to download from
  local l_title="$1"
  local l_expected="$2"
  local l_target="$3"
  local l_file="$4"
  local l_url="$5"
  # fix: quote the expansions (paths may contain spaces) and bail out
  # instead of downloading into the wrong directory when cd fails
  cd "$l_target" || return 1
  # check if file already exists
  if [ -f "$l_file" ]
  then
    color_msg $green "$l_title:$l_file already downloaded"
  else
    color_msg $blue "downloading $l_title:$l_file ... please wait typically $l_expected ..."
    show_timing "$l_title download" "started"
    wget "$l_url"
    show_timing "$l_title download" "finished"
  fi
}
#
# wikidata config copy
#
wikidata_copyconfig() {
  # create the wikidata index directory and copy the example
  # wikidata.settings.json config there (only once)
  # fix: quote the expansions (QLEVER_HOME may contain spaces) and guard
  # the cd calls so we never create/copy files in the wrong directory
  cd "$QLEVER_HOME" || return 1
  target=qlever-indices/wikidata
  if [ ! -d "$target" ]
  then
    color_msg $blue "creating $target"
    mkdir -p "$target"
  else
    color_msg $green "$target already exists"
  fi
  cd "$target" || return 1
  config=wikidata.settings.json
  configpath=$QLEVER_HOME/qlever-code/examples/$config
  if [ ! -f "$config" ]
  then
    color_msg $blue "copying config file $configpath to $target"
    cp -p "$configpath" .
  else
    color_msg $green "$config already copied to $target"
  fi
}
#
# wikidata download
#
wikidata_download() {
  # fetch the wikidata lexeme dump and the full entity dump into
  # $QLEVER_HOME/wikidata; download() skips files that already exist
  local l_mirror=https://dumps.wikimedia.org/wikidatawiki/entities/
  local l_full_dump=latest-all.ttl.bz2
  local l_lexeme_dump=latest-lexemes.ttl.bz2
  #wikidata_copyconfig
  target=$QLEVER_HOME/wikidata
  # the small lexeme dump first, then the big full dump
  download "wikidata lexemes" "3min" $target $l_lexeme_dump $l_mirror/$l_lexeme_dump
  download "wikidata dump" "6hours" $target $l_full_dump $l_mirror/$l_full_dump
}
#
# build the wikidata index
#
wikidata_index() {
  # build the index for the wikidata data dump
  # expects the dumps downloaded by wikidata_download in $QLEVER_HOME/wikidata
  cd $QLEVER_HOME/wikidata
  # let the index builder (running with a different uid inside docker)
  # write the index files here
  chmod o+w .
  show_timing "creating wikidata index" "started"
  # previous manual docker invocation - kept for reference:
  # docker run -i --rm -v $QLEVER_HOME/qlever-indices/wikidata:/index --entrypoint bash $dockerimage -c "cd /index && bzcat latest-all.ttl.bz2 latest-lexemes.ttl.bz2 | IndexBuilderMain -F ttl -f - -l -i wikidata -s wikidata.settings.json | tee wikidata.index-log.txt"
  # source the official qlever-control script to get the 'qlever' command
  # NOTE(review): assumes ../qlever-control has been checked out - confirm
  . ../qlever-control/qlever
  check_installed IndexBuilderMain
  qlever index
  show_timing "creating wikidata index" "finished"
}
# commandline options according to usage
# each option is consumed in order; several options may be combined
while [ "$1" != "" ]
do
  option=$1
  shift
  case $option in
    -h|--help)
      usage;;
    -aw|--all_wikidata)
      # run the full wikidata pipeline
      show_version
      show_env
      qlever_clone
      #qlever_build
      qlever_pull
      wikidata_download
      wikidata_index
      ;;
    -b|--build)
      qlever_build
      ;;
    -c|--clone)
      qlever_clone
      ;;
    -e|--env)
      show_env
      ;;
    -k|--kill)
      qlever_kill
      ;;
    -p|--pull)
      qlever_pull
      ;;
    --port)
      # --port needs a value
      # fix: abort instead of falling through and shifting past the
      # end of the argument list
      if [ $# -lt 1 ]
      then
        usage
        exit 1
      fi
      port=$1
      shift
      ;;
    -s|--server)
      qlever_start $port
      ;;
    -wd|--wikidata_download)
      wikidata_download
      ;;
    -wi|--wikidata_index)
      wikidata_index
      ;;
    -v|--version)
      show_version
      ;;
    -t)
      # undocumented: exercise the show_timing helper
      show_timing "testing" "started"
      sleep 2
      show_timing "testing" "finished"
      ;;
  esac
done
logstats
script to summarize progress and statistics from qlever index log
#!/bin/bash
# WF 2022-03-12
# get the relevant log information for the indexer
# $Header: /hd/seel/qlever/RCS/logstats,v 1.2 2022/05/23 06:24:23 wf Exp wf $
#
# Summarize progress and statistics of a qlever index run into stats.csv:
# one line per indexing phase with start time and million-triples count.
# Columns E-H are emitted as LibreOffice formulas (Runden/SUMME are the
# German names of ROUND/SUM) so that duration, throughput and ETA are
# computed when the CSV is opened in a spreadsheet.

# the index log produced by 'qlever index' (see the qleverauto script)
logfile=wikidata/wikidata.index-log.txt
# CSV header line
echo 'day;time;phase;mill triples;duration;mill triples/h;todo;ETA h' > stats.csv
# normalize the log lines to ';'-separated fields before awk parses them:
# spaces -> ';', drop ' -' separators, thousands separators (',') and
# fractional seconds
# NOTE(review): once the first sed replaced every space with ';' the
# 's/ -//g' step can no longer match anything - possibly the first
# pattern originally matched a different whitespace sequence; verify
# against a real index log
cat $logfile \
| sed 's/ /;/g' \
| sed 's/ -//g' \
| sed 's/,//g' \
| sed 's/\.[[:digit:]]\+//g' \
| awk -v expectedTriples=17400 -v expectedBoM=900 -v expectedUoM=3200 -v expectedConversion=27300 -v expectedWords=900 '
# the expected* values are estimated per-phase totals (in millions) and
# are shown in the "todo" column when a phase starts
BEGIN {
  # Field separator
  FS=";"
  # double quote
  quote="\x22"
}
# default extraction from line
# 2022-05-22 17:48:22.564 ...
{
  #print $0
  date=$1
  time=$2
}
# start of Processing phase
# 2022-05-22 17:48:22.564 - INFO: Processing input triples from /dev/stdin ...
/Processing;input;triples;from/ {
  phase="Processing"
  printStartPhase(date,time,phase,expectedTriples)
  # row = spreadsheet row the result line of this phase will land on;
  # it is referenced by the formulas generated in printrow
  row=3
  next
}
# while processing
# 2022-05-23 00:09:50.846 - INFO: Input triples processed: 17,400,000,000
/Input;triples;processed:;/{
  triples=$8
  next
}
# Start of byte order Merging
# 2022-05-23 00:10:52.614 - INFO: Merging partial vocabularies in byte order (internal only) ...
/Merging;partial;vocabularies;in;byte;order/ {
  printrow(date,time,triples,row,phase)
  phase="Byte order merging"
  printStartPhase(date,time,phase,expectedBoM)
  row=5
  next
}
# progress counters of the merge phases
/Words;merged:;/ {
  triples=$7
  next
}
/Words;processed:;/ {
  triples=$7
  next
}
/Merging;partial;vocabularies;in;Unicode;order/ {
  printrow(date,time,triples,row,phase)
  phase="Unicode order merging"
  printStartPhase(date,time,phase,expectedUoM)
  row=7
  next
}
# NOTE(review): unlike the other phase rules this one has no 'next';
# harmless, since no later pattern matches a Converting line
/Converting;triples;from;local;/ {
  printrow(date,time,triples,row,phase)
  phase="Triple conversion"
  printStartPhase(date,time,phase,expectedConversion)
  row=9
}
/Triples;converted:;/ {
  triples=$7
  next
}
/Building;/ {
  printrow(date,time,triples,row,phase)
  phase="Prefix tree"
  printStartPhase(date,time,phase,expectedWords)
  row=11
  next
}
/Computing;maximally/ {
  printrow(date,time,triples,row,phase)
  phase="Compressing prefixes"
  printStartPhase(date,time,phase,expectedTriples)
  row=13
  # no progress counter is logged for this phase
  triples=0
  next
}
/Writing;compressed;vocabulary/ {
  printrow(date,time,triples,row,phase)
  phase="PSO/POS index pair"
  printStartPhase(date,time,phase,expectedTriples)
  row=15
  triples=0
  next
}
/Writing;meta;data;for;PSO/ {
  printrow(date,time,triples,row,phase)
  phase="SPO/SOP index pair"
  printStartPhase(date,time,phase,expectedTriples)
  row=17
  triples=0
  next
}
/Writing;meta;data;for;SPO/ {
  printrow(date,time,triples,row,phase)
  phase="new index pair"
  printStartPhase(date,time,phase,expectedTriples)
  row=19
  triples=0
  next
}
# start line of a phase: date;time;phase;expected todo;empty formula columns
function printStartPhase(date,time,phase,expected) {
  printf("%s;%s;%s;%d;;;\n",date,time,phase,expected)
}
# result line of a phase with spreadsheet formulas referencing the row
# numbers maintained above:
#   E: duration, F: million triples per hour, G: still todo, H: ETA in hours
function printrow(date,time,triples,row,phase) {
  printf("%s;%s;%s;%s;=(A%d+B%d)-(A%d+B%d);%s=Runden(D%d/E%d;0)%s;=D%d-D%d;%s=Runden(G%d/F%d;1)%s\n",date,time,phase,triples/1000000,row,row,row-1,row-1,quote,row,row,quote,row-1,row,quote,row,row,quote)
}
# flush the last phase and append a total-duration formula
END {
  printrow(date,time,triples,row,phase)
  printf(";;total;;=SUMME(E$2:E%d)\n",row)
}
' >> stats.csv
# show the result on the console
cat stats.csv
# open in spreadsheet
# NOTE(review): 'open' is macOS-specific; xdg-open is the Linux equivalent
open stats.csv