Difference between revisions of "Wikidata Import 2024-10-17"

From BITPlan Wiki
Jump to navigation Jump to search
 
(2 intermediate revisions by the same user not shown)
Line 1: Line 1:
 +
{{PageSequence|prev=Wikidata Import 2024-04-13|next=Wikidata Import 2024-10-24|category=Wikidata|categoryIcon=cloud-download}}
 +
 
=Import=
 
=Import=
  
Line 11: Line 13:
 
|cpu=Intel(R) Xeon(R) Gold 6326 CPU @ 2.90GHz (16 cores)
 
|cpu=Intel(R) Xeon(R) Gold 6326 CPU @ 2.90GHz (16 cores)
 
|ram=512
 
|ram=512
 +
|triples=16.3
 
|comment=latest-all.ttl.bz2                                16-Oct-2024 18:01        113822933425 ->/hd/delta/qlever/wikidata_20241017/
 
|comment=latest-all.ttl.bz2                                16-Oct-2024 18:01        113822933425 ->/hd/delta/qlever/wikidata_20241017/
 
|storemode=property
 
|storemode=property
 
}}
 
}}
{{PageSequence|prev=Wikidata Import 2024-04-13|next=|category=Wikidata|categoryIcon=cloud-download}}
 
  
 
= Using qlv script =
 
= Using qlv script =
Line 367: Line 369:
 
2024-10-18 11:05:20.574 - INFO: Triples sorted: 10,000,000  
 
2024-10-18 11:05:20.574 - INFO: Triples sorted: 10,000,000  
 
2024-10-18 12:08:42.508 - INFO: Triples sorted: 14,810,000,000  
 
2024-10-18 12:08:42.508 - INFO: Triples sorted: 14,810,000,000  
 +
2024-10-18 14:59:02.433 - INFO: Triples sorted: 20,265,372,683 [average speed 3.3 M/s, last batch 6.6 M/s, fastest 12.9 M/s, slowest 0.0 M/s]
 +
2024-10-18 14:59:03.078 - INFO: Statistics for PSO: #relations = 57,448, #blocks = 652,524, #triples = 20,265,372,683
 +
2024-10-18 14:59:03.078 - INFO: Statistics for POS: #relations = 57,448, #blocks = 652,524, #triples = 20,265,372,683
 +
2024-10-18 14:59:16.597 - INFO: Index build completed
 +
 +
 +
To enable autocompletion, run the following command, and consider adding it to your `.bashrc` or `.zshrc`:
 +
 +
eval "$(register-python-argcomplete qlever)" && export QLEVER_ARGCOMPLETE_ENABLED=1
 +
 +
 +
Command: start
 +
 +
docker run -d --restart=unless-stopped -u $(id -u):$(id -g) -v /etc/localtime:/etc/localtime:ro -v $(pwd):/index -p 7001:7001 -w /index --init --entrypoint bash --name qlever.server.wikidata docker.io/adfreiburg/qlever:latest -c 'ServerMain -i wikidata -j 8 -p 7001 -m 20G -c 10G -e 1G -k 200 -s 30s -a wikidata_GtdRwNQw9y4x > wikidata.server-log.txt 2>&1'
 +
 +
Starting the QLever server failed (docker: Error response from daemon: Conflict. The container name "/qlever.server.wikidata" is already in use by container "af4497fc45d1d344b5de660c8ff13bedb8d1c1b41154da517ba092715281d7fa". You have to remove (or rename) that container to be able to reuse that name. See 'docker run --help'.)
 +
 +
QLever indexing process completed at Fri Oct 18 02:59:23 PM CEST 2024
 
</source>
 
</source>

Latest revision as of 06:20, 24 October 2024

Import

Import
edit
state  ✅
url  https://wiki.bitplan.com/index.php/Wikidata_Import_2024-10-17
target  QLever
start  2024-10-17
end  2024-10-18
days  0.9
os  Ubuntu 22.04.3 LTS
cpu  Intel(R) Xeon(R) Gold 6326 CPU @ 2.90GHz (16 cores)
ram  512
triples  16.3
comment  latest-all.ttl.bz2 16-Oct-2024 18:01 113822933425 ->/hd/delta/qlever/wikidata_20241017/


Using qlv script

#!/bin/bash
#
# QLever Wikidata Backup and Indexing Script
# Author: WF
# Date: 2024-10-17
# Version: 1.0
#
# Purpose:
# This script manages daily Wikidata backups and indexing using the QLever SPARQL engine
# in a rotating fashion across multiple disks. It is designed to be run once per day by cron.
#
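# Key Functions (design goals; the code in this version implements the disk
# rotation, the free-space report, the Docker image pull and the screen-based
# indexing run -- the remaining items are not enforced by the script itself):
# 1. Rotates between the configured disks under /hd for daily Wikidata backups
# 2. Ensures disk space and RAM are not overutilized
# 3. Manages Docker containers for indexing and serving, preventing interference
# 4. Sets up multiple SPARQL endpoints with appropriate naming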
#
# ANSI color escape sequences.
# They are stored as literal backslash text and only turned into real
# escape characters when printed with `echo -e`.
endColor='\033[0m'
green='\033[0;32m'
red='\033[0;31m'
blue='\033[0;34m'

# Print a message wrapped in a color escape sequence.
# Globals:   endColor (read) - sequence appended after the message
# Arguments: $1 - color escape sequence to emit before the message
#            $2 - message text
# Outputs:   the colored message on stdout; `echo -e` interprets backslash
#            escapes in the color codes as well as in the message itself
color_msg() {
  local tint="$1" text="$2"
  echo -e "${tint}${text}${endColor}"
}

# Report a fatal error and terminate the script.
# Globals:   red (read) - color sequence used for the message
# Arguments: $1 - error description
# Outputs:   "Error:" and the tab-indented description on stderr
# Returns:   never returns; exits the current (sub)shell with status 1
error() {
  local l_msg="$1"
  # Quote the color so it always reaches color_msg as exactly one argument.
  color_msg "$red" "Error:" >&2
  color_msg "$red" "\t$l_msg" >&2
  exit 1
}

# Print a failure notice (red, prefixed with a cross mark) on stdout.
# Globals:   red (read) - color sequence used for the message
# Arguments: $1 - message text
negative() {
  local l_msg="$1"
  # Quote the color so it always reaches color_msg as exactly one argument.
  color_msg "$red" "❌:$l_msg"
}

# Print a success notice (green, prefixed with a check mark) on stdout.
# Globals:   green (read) - color sequence used for the message
# Arguments: $1 - message text
positive() {
  local l_msg="$1"
  # Quote the color so it always reaches color_msg as exactly one argument.
  color_msg "$green" "✅:$l_msg"
}

# Print the command line help on stdout and terminate.
# The text is emitted from a single here-document; $0 expands to the name
# the script was invoked with.
# Returns: never returns; exits with status 1
usage() {
  cat <<EOF
Usage: $0 [OPTIONS]
Options:
  -h, --help             Show this help message
  -c, --current          Show the disk currently used by QLever
  -d, --debug            Enable debug output
  -ir, --index-run       Run QLever wikidata indexing on today's disk
  -p, --pull             Pull QLever Docker images
  -qc, --qlever-control  setup qlever-control
  -s, --space            Show free disk space
  -t, --today            Show disk to be used today
  -v, --version          Show version information
EOF
  exit 1
}

#
# Set up qlever-control.
#
# Clones the qlever-control repository into /opt/qlever-control (only when
# that directory does not exist yet), checks out the python-qlever branch,
# installs it with pip and finally verifies that a `qlever` command can be
# resolved.
# Globals:  blue (read) - color for the progress message
# Outputs:  progress/success messages on stdout, errors on stderr
# Returns:  exits with status 1 (via error) when a setup step fails or when
#           `qlever` cannot be found afterwards
#
setup_qlever_control() {
  local -r install_dir="/opt/qlever-control"
  local qlever_path

  # Check if /opt/qlever-control already exists
  if [[ ! -d "$install_dir" ]]; then
    color_msg "$blue" "Setting up QLever control in /opt/qlever-control..."

    # Create the directory with the right permissions.
    # chown/chmod stay best-effort as before: they only affect the still
    # empty directory and must not abort the setup.
    sudo mkdir -p "$install_dir" || error "Could not create $install_dir"
    sudo chown -R root:users "$install_dir"
    sudo chmod -R 775 "$install_dir"

    # Clone the qlever-control repository
    sudo git clone https://github.com/ad-freiburg/qlever-control.git "$install_dir" \
      || error "Cloning qlever-control into $install_dir failed"

    # Checkout the correct branch; `git -C` avoids changing the working
    # directory of the calling script
    sudo git -C "$install_dir" checkout python-qlever \
      || error "Could not check out branch python-qlever in $install_dir"

    # Install qlever-control with pip (same as `pip install .` run inside
    # the checkout)
    sudo pip install "$install_dir" \
      || error "pip install of $install_dir failed"

    positive "QLever control setup and installed successfully."
  else
    positive "QLever control is already set up in /opt/qlever-control."
  fi

  # Verify if the qlever command is installed
  if qlever_path=$(command -v qlever); then
    positive "QLever is installed and available at $qlever_path"
  else
    error "QLever installation failed or is not in the system PATH."
  fi
}


# Pull the QLever server and UI Docker images.
# Globals:  blue (read) - color for the progress message
# Outputs:  one success or failure line per image on stdout
# Returns:  0 when every image was pulled, 1 when at least one pull failed
#           (all images are attempted either way)
pull_docker_images() {
  local image
  local status=0

  color_msg "$blue" "Pulling QLever Docker images..."

  for image in adfreiburg/qlever adfreiburg/qlever-ui; do
    if docker pull "$image"; then
      positive "Successfully pulled $image"
    else
      negative "Failed to pull $image"
      status=1
    fi
  done

  return "$status"
}

#
# Get the disk that is currently in use.
#
# Inspects the mounts of the container qlever.server.wikidata and prints the
# source path of every mount whose "source:destination" line contains "/hd/".
# Outputs:  the matching mount source path(s) on stdout
# Returns:  exits with status 1 (via error) when no such mount is found
#
check_current_disk() {
    local current_disk

    # One "source:destination" line per mount; keep the source of /hd/ lines
    current_disk=$(docker inspect qlever.server.wikidata --format '{{ range .Mounts }}{{ .Source }}:{{ .Destination }}{{ printf "\n" }}{{ end }}' \
        | awk -F':' '/\/hd\// {print $1}')

    # Display the current disk
    if [[ -n "$current_disk" ]]; then
        echo "$current_disk"
    else
        error "No disk found for qlever.server.wikidata"
    fi
}


#
# Get the disk of the day.
#
# Maps an ISO day of the week (1 = Monday ... 7 = Sunday) onto the list of
# disks, wrapping around when there are more days than disks.
# Arguments: $1 - optional day of the week (1-7); defaults to today
# Outputs:   the selected disk path on stdout
# Returns:   0 on success, 1 when the given day is not in 1..7
#
disk_of_the_day() {
    # Define the available disks
    local -r disks=(/hd/alpha /hd/beta /hd/gamma /hd/delta)

    # Day of the week used as rotation index
    local day="${1:-$(date +%u)}"
    if [[ ! "$day" =~ ^[1-7]$ ]]; then
        printf 'disk_of_the_day: invalid day of week: %s\n' "$day" >&2
        return 1
    fi

    # Select the disk based on the day
    local index=$(( (day - 1) % ${#disks[@]} ))
    echo "${disks[index]}"
}

#
# Show the available disk space.
#
# Prints one table row per entry below /hd with its backing device, the
# available and total size as reported by `df -h`, and the storage type.
# The type is "SSD" for nvme devices or when the kernel's rotational flag is
# 0, "HDD" for any other flag value and "?" when the flag cannot be read.
# Outputs:  the table on stdout
#
show_disk_space() {
    local dir device available total type base_device rotational_file

    # Print the header with proper formatting
    printf "%-10s %-15s %10s %10s %4s\n" "Directory" "Device" "Available" "Total" "Type"

    for dir in /hd/*; do
        # An unmatched glob stays literal; skip it instead of passing it to df
        [[ -e "$dir" ]] || continue

        # Single df call; -P keeps every filesystem on one output line
        device="" total="" available=""
        read -r device total available \
            < <(df -hP -- "$dir" | awk 'NR == 2 {print $1, $2, $4}')
        [[ -n "$device" ]] || continue

        # Determine if the device is SSD or HDD
        if [[ "$device" == *nvme* ]]; then
            type="SSD"
        else
            # Strip partition number
            base_device=$(sed 's/[0-9]*$//' <<<"$device")
            rotational_file="/sys/block/${base_device##*/}/queue/rotational"
            if [[ ! -r "$rotational_file" ]]; then
                # No rotational flag available for this device name
                type="?"
            elif [[ "$(<"$rotational_file")" == "0" ]]; then
                type="SSD"
            else
                type="HDD"
            fi
        fi

        # Print the results with proper formatting
        printf "%-10s %-15s %10s %10s %4s\n" "${dir##*/}" "$device" "$available" "$total" "$type"
    done
}

# Prepare the directory for today's QLever indexing.
# The directory is <disk of the day>/qlever/wikidata_<YYYYMMDD>; it is created
# with sudo, handed over to the invoking user and that user's primary group,
# and made group-writable (775).
# Outputs:  the directory path on stdout (nothing else, so callers can
#           capture it with command substitution)
# Returns:  exits with status 1 (via error) when the disk cannot be
#           determined or the directory cannot be created/adjusted
prepare_dir() {
    local disk isodate dir owner

    disk=$(disk_of_the_day) || error "Could not determine the disk of the day"
    [[ -n "$disk" ]] || error "Could not determine the disk of the day"
    isodate=$(date +%Y%m%d)
    dir="$disk/qlever/wikidata_$isodate"
    # Use the real primary group instead of assuming a group named like the user
    owner="$(id -un):$(id -gn)"

    sudo mkdir -p "$dir" || error "Could not create $dir"
    sudo chown -R "$owner" "$dir" || error "Could not change owner of $dir to $owner"
    sudo chmod -R 775 "$dir" || error "Could not change permissions of $dir"
    echo "$dir"
}

# Execute the QLever indexing inside a detached screen session with logging.
# Writes the helper script <dir>/qlever_index.sh, starts it in a screen
# session named qlever_wikidata_<YYYYMMDD> and checks that the session shows
# up in `screen -ls`. The helper changes into <dir>, appends all of its output
# to <dir>/screen.log and runs the qlever steps setup-config, get-data, index
# and start one after another (a failing step does not stop the later ones).
# Arguments: $1 - existing, writable index directory
# Outputs:   success messages on stdout, errors on stderr
# Returns:   exits with status 1 (via error) when the directory is missing,
#            the helper cannot be written or the session did not start
execute_index() {
    local dir="$1"
    local isodate session scriptfile logfile quoted_dir quoted_log

    [[ -d "$dir" ]] || error "Index directory $dir does not exist."

    isodate=$(date +%Y%m%d)
    session="qlever_wikidata_$isodate"
    scriptfile="$dir/qlever_index.sh"
    logfile="$dir/screen.log"
    # Shell-quote the paths embedded in the generated script so that spaces
    # or special characters survive
    printf -v quoted_dir '%q' "$dir"
    printf -v quoted_log '%q' "$logfile"

    # Create the script that will be run inside the screen session.
    # The generated `cd` aborts on failure so the download and the index are
    # never written to an unintended directory.
    cat <<EOF > "$scriptfile" || error "Failed to write $scriptfile."
#!/bin/bash
cd $quoted_dir || exit 1
exec > >(tee -a $quoted_log) 2>&1
echo "Starting QLever indexing process at \$(date)"
qlever setup-config wikidata
qlever get-data
qlever index
qlever start
echo "QLever indexing process completed at \$(date)"
EOF
    chmod +x "$scriptfile"
    # Pass the script as a file argument; `bash -c` would word-split the path
    screen -dmS "$session" bash "$scriptfile"

    # Verify that the screen session started successfully
    if screen -ls | grep -qF -- "$session"; then
        positive "Started screen session $session."
        positive "Logging to $logfile"
    else
        error "Failed to start screen session $session."
    fi
}

# Run qlever wikidata indexing with screen on today's disk.
# Prepares today's index directory and starts the indexing run in it.
# Outputs:  success messages on stdout, errors on stderr
# Returns:  exits with status 1 (via error) when the directory could not be
#           prepared; otherwise the status of execute_index
create_and_run_index() {
    local dir

    # prepare_dir runs in a command substitution, so a failure there only
    # ends the subshell; check its status and output explicitly.
    dir=$(prepare_dir) || error "Failed to prepare the directory for today's indexing"
    [[ -n "$dir" ]] || error "Failed to prepare the directory for today's indexing"
    positive "Created directory $dir"

    # Execute the QLever indexing inside a screen session with logging
    execute_index "$dir"
}

# Dispatch on the command line options, left to right.
# Every action option runs its handler and ends the script right away; only
# -d/--debug (which just sets DEBUG) continues with the next argument.
while (( $# > 0 )); do
  case "$1" in
    -h | --help)
      usage
      ;;
    -c | --current)
      check_current_disk
      exit 0
      ;;
    -d | --debug)
      DEBUG=1
      ;;
    -ir | --index-run)
      create_and_run_index
      exit 0
      ;;
    -p | --pull)
      pull_docker_images
      exit 0
      ;;
    -qc | --qlever-control)
      setup_qlever_control
      exit 0
      ;;
    -s | --space)
      show_disk_space
      exit 0
      ;;
    -t | --today)
      disk_of_the_day
      exit 0
      ;;
    -v | --version)
      echo "Version 1.0"
      exit 0
      ;;
    *)
      error "Unknown parameter passed: $1"
      ;;
  esac
  shift
done

exit 0

Usage

qlv -h
Usage: /home/wf/bin/qlv [OPTIONS]
Options:
  -h, --help             Show this help message
  -c, --current          Show the disk currently used by QLever
  -d, --debug            Enable debug output
  -ir, --index-run       Run QLever wikidata indexing on today's disk
  -p, --pull             Pull QLever Docker images
  -qc, --qlever-control  setup qlever-control
  -s, --space            Show free disk space
  -t, --today            Show disk to be used today
  -v, --version          Show version information

space

qlv -s
Directory  Device           Available      Total Type
alpha      /dev/sdb1             2.0T       3.5T  SSD
beta       /dev/sdc1             2.2T       3.5T  SSD
delta      /dev/sde1             3.3T       3.5T  SSD
eneco      /dev/sda1             8.7T        11T  HDD
gamma      /dev/sdd1             3.2T       3.5T  SSD
mantax     /dev/nvme0n1p1        1.1T       5.8T  SSD

today

qlv --today
/hd/delta

indexing

qlv -ir
✅:Created directory /hd/delta/qlever/wikidata_20241017
✅:Started screen session qlever_wikidata_20241017.
✅:Logging to /hd/delta/qlever/wikidata_20241017/screen.log
tail -f /hd/delta/qlever/wikidata_20241017/screen.log
eval "$(register-python-argcomplete qlever)" && export QLEVER_ARGCOMPLETE_ENABLED=1


Command: get-data

curl -LRC - --remote-name-all https://dumps.wikimedia.org/wikidatawiki/entities/latest-all.ttl.bz2 https://dumps.wikimedia.org/wikidatawiki/entities/latest-lexemes.ttl.bz2 2>&1

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  1  106G    1 1474M    0     0  4019k      0  7:40:53  0:06:15  7:34:38 4125k
...
100  106G  100  106G    0     0  3958k      0  7:47:56  7:47:56 --:--:-- 3190k
Download successful, total file size: 114,328,083,715 bytes
docker run --rm -u $(id -u):$(id -g) -v /etc/localtime:/etc/localtime:ro -v $(pwd):/index -w /index --init --entrypoint bash --name qlever.index.wikidata docker.io/adfreiburg/qlever:latest -c 'ulimit -Sn 1048576; lbzcat -n 4 -f latest-all.ttl.bz2 latest-lexemes.ttl.bz2 | IndexBuilderMain -F ttl -f - -i wikidata -s wikidata.settings.json --stxxl-memory 10G | tee wikidata.index-log.txt'

2024-10-17 17:32:57.964 - INFO: QLever IndexBuilder, compiled on Wed Oct 16 23:27:38 UTC 2024 using git hash 4acbca
2024-10-17 17:32:57.967 - INFO: You specified "locale = en_US" and "ignore-punctuation = 1"
2024-10-17 17:32:57.967 - INFO: You specified "ascii-prefixes-only = true", which enables faster parsing for well-behaved TTL files
2024-10-17 17:32:57.967 - INFO: You specified "num-triples-per-batch = 5,000,000", choose a lower value if the index builder runs out of memory
2024-10-17 17:32:57.967 - INFO: By default, integers that cannot be represented by QLever will throw an exception
2024-10-17 17:32:57.967 - INFO: Processing triples from /dev/stdin ...
2024-10-17 17:32:57.969 - INFO: Parsing input triples and creating partial vocabularies, one per batch ...
2024-10-17 17:33:26.265 - INFO: Triples parsed: 10,000,000 
...
2024-10-17 20:10:00.386 - INFO: Triples parsed: 3,650,000,000 [average speed 0.4 M/s, fastest 0.6 M/s, slowest 0.3 M/s]
2024-10-18 05:30:27.974 - INFO: Triples parsed: 16,640,000,000
2024-10-18 08:23:20.447 - INFO: Triples parsed: 20,621,004,970 
2024-10-18 08:23:20.707 - INFO: Merging partial vocabularies ...
2024-10-18 08:23:40.754 - INFO: Words merged: 10,000,000 
2024-10-18 09:37:48.143 - INFO: Words merged: 3,688,975,165
2024-10-18 09:37:57.291 - INFO: Finished writing compressed internal vocabulary, size = 72.2 GB [uncompressed = 227.7 GB, ratio = 31%]
2024-10-18 09:37:57.298 - INFO: Number of words in external vocabulary: 3,688,975,165
2024-10-18 09:38:23.515 - INFO: Converting triples from local IDs to global IDs ...
2024-10-18 09:38:26.130 - INFO: Triples converted: 10,000,000 [average speed 13.
2024-10-18 09:57:46.509 - INFO: Triples converted: 15,090,000,000 
2024-10-18 10:20:04.517 - INFO: Triples converted: 33,088,914,076 [average speed 13.2 M/s, last batch 16.7 M/s, fastest 22.9 M/s, slowest 6.6 M/s] 
2024-10-18 10:20:14.777 - INFO: Creating permutations SPO and SOP ...
2024-10-18 10:20:22.470 - INFO: Triples sorted: 10,000,000
2024-10-18 10:40:45.236 - INFO: Triples sorted: 9,680,000,000 
...
2024-10-18 11:04:44.521 - INFO: Triples sorted: 20,265,372,683 [average speed 7.6 M/s, last batch 2.3 M/s, fastest 37.0 M/s, slowest 1.3 M/s] 
2024-10-18 11:04:45.875 - INFO: Statistics for SPO: #relations = 2,158,261,625, #blocks = 432,805, #triples = 20,265,372,683
2024-10-18 11:04:45.877 - INFO: Statistics for SOP: #relations = 2,158,261,625, #blocks = 432,805, #triples = 20,265,372,683
2024-10-18 11:04:56.150 - INFO: Number of distinct patterns: 9,470,289
2024-10-18 11:04:56.150 - INFO: Number of subjects with pattern: 2,158,261,625 [all]
2024-10-18 11:04:56.150 - INFO: Total number of distinct subject-predicate pairs: 12,022,246,169
2024-10-18 11:04:56.150 - INFO: Average number of predicates per subject: 5.6
2024-10-18 11:04:56.150 - INFO: Average number of subjects per predicate: 209,272
2024-10-18 11:05:07.943 - INFO: Creating permutations OSP and OPS ...
2024-10-18 11:05:20.574 - INFO: Triples sorted: 10,000,000 
2024-10-18 12:08:42.508 - INFO: Triples sorted: 14,810,000,000 
2024-10-18 14:59:02.433 - INFO: Triples sorted: 20,265,372,683 [average speed 3.3 M/s, last batch 6.6 M/s, fastest 12.9 M/s, slowest 0.0 M/s] 
2024-10-18 14:59:03.078 - INFO: Statistics for PSO: #relations = 57,448, #blocks = 652,524, #triples = 20,265,372,683
2024-10-18 14:59:03.078 - INFO: Statistics for POS: #relations = 57,448, #blocks = 652,524, #triples = 20,265,372,683
2024-10-18 14:59:16.597 - INFO: Index build completed


To enable autocompletion, run the following command, and consider adding it to your `.bashrc` or `.zshrc`:

eval "$(register-python-argcomplete qlever)" && export QLEVER_ARGCOMPLETE_ENABLED=1


Command: start

docker run -d --restart=unless-stopped -u $(id -u):$(id -g) -v /etc/localtime:/etc/localtime:ro -v $(pwd):/index -p 7001:7001 -w /index --init --entrypoint bash --name qlever.server.wikidata docker.io/adfreiburg/qlever:latest -c 'ServerMain -i wikidata -j 8 -p 7001 -m 20G -c 10G -e 1G -k 200 -s 30s -a wikidata_GtdRwNQw9y4x > wikidata.server-log.txt 2>&1'

Starting the QLever server failed (docker: Error response from daemon: Conflict. The container name "/qlever.server.wikidata" is already in use by container "af4497fc45d1d344b5de660c8ff13bedb8d1c1b41154da517ba092715281d7fa". You have to remove (or rename) that container to be able to reuse that name. See 'docker run --help'.)

QLever indexing process completed at Fri Oct 18 02:59:23 PM CEST 2024