Wikidata Import 2025-12-13

From BITPlan Wiki
Revision as of 08:35, 14 December 2025 by Wf (talk | contribs) (→‎launch)
Jump to navigation Jump to search

Import

Import
edit
state  ✅
url  https://wiki.bitplan.com/index.php/Wikidata_Import_2025-12-13
target  blazegraph
start  2025-12-07
end  2025-12-13
days  6
os  Ubuntu 22.04.5 LTS
cpu  AMD Ryzen 9 5900X 12-Core Processor
ram  128
triples  
comment  seeded with 1.3 TB data.jnl file originally provided by James Hare see https://phabricator.wikimedia.org/T403627

see also Wikidata_Import_2025-06-06

jnlget

#!/bin/bash
# download and reassemble wikidata blazegraph journal file
# see https://phabricator.wikimedia.org/T403627
# WF 2025-12-12

# Configuration - constants, marked readonly so they cannot be clobbered later
readonly BASE_URL=https://files.scatter.red/orb/2025/12
readonly PREFIX=2025-12-07-wikidata-data.jnl.gz.0000
readonly MD5SUMS=md5sums.txt

# colored message helpers (success/action/warn/error)
HASH_MESSAGES_FILE="$HOME/source/bash/mp/bash_messages"

if [ -f "$HASH_MESSAGES_FILE" ]; then
  # shellcheck disable=SC1090
  source "$HASH_MESSAGES_FILE"
else
  # Fallback: plain uncolored implementations of the same helpers
  # error MESSAGE [STATUS] - print to stderr; exit unless STATUS is 0
  error() {
    echo "Error: $1" >&2
    if [ "${2:-1}" -ne 0 ]; then
      exit "${2:-1}"
    fi
  }
  success() { echo "Success: $1"; }
  action() { echo "Action: $1"; }
  warn() { echo "Warning: $1"; }
fi

# show usage / help text on stdout and exit with status 1
# NOTE: the heredoc body is user-visible output - keep its text verbatim
usage() { 
  cat <<EOF 
Usage $0 [OPTIONS]
  Options:
    -h, --help        Show this help message 
    --download        Download all files
    -l, --list        Show the file list 
    --single [number] Get a single file with the given number
    --cat             reassemble the original file
EOF
  exit 1
}



# Gets the Content-Length of a remote file
# params
#  1: url - file size of the given url
get_remote_filesize() {
    local target_url="$1"
    # curl flags: -s silent, -I headers only, -L follow redirects;
    # strip CRs from the HTTP headers first so the printed size is clean,
    # then keep the last Content-Length header (the final redirect hop)
    curl -sIL "$target_url" | tr -d '\r' | grep -i "Content-Length" | tail -n 1 | awk '{print $2}'
}

# get the filesize in bytes of a local file
# params
#  1: file - the name of the file
get_local_filesize() {
   local path="$1"
   # GNU stat; --format='%s' is the long form of -c%s
   stat --format='%s' "$path"
}

# create a filesize marker - caches the remote size in FILE.size so the
# server is only queried once; prints the (cached) size to stdout
# params:
#   1: file - the file to get the size for
mark_filesize() {
  local file="$1"
  if [ ! -f "$file.size" ]
  then
    # declare and assign separately so the command's exit status is not masked
    local file_size
    file_size=$(get_remote_filesize "$BASE_URL/$file")
    echo "$file_size" > "$file.size"
  fi
  cat "$file.size"
}

# calculate md5 hash and compare to expected - creates marker file if ok
# A FILE.md5 marker short-circuits the (expensive) re-hashing on later runs.
# params:
#   1: filename
#   2: expected filesize in bytes
md5_and_mark() {
  local filename="$1"
  local expected_filesize="$2"
  # look up the expected hash for this exact filename in the md5sums list
  local expected_hash
  expected_hash=$(awk -v fn="$filename" '$2 == fn {print $1}' "$MD5SUMS")
  # FIX: marker file must be derived from $filename
  # (was the garbled command substitution "$(unknown)")
  local marker_file="$filename.md5"
  if [ -f "$marker_file" ]
  then
    success "past $filename size and hash "
    return
  fi
  action "checking md5 for $filename of size $expected_filesize to be $expected_hash"
  local filesize
  filesize=$(get_local_filesize "$filename")
  if [ "$expected_filesize" != "$filesize" ]
  then
     error "size mismatch for $filename: expected $expected_filesize but is $filesize"
  else
     # FIX: hash the actual file (was md5sum "$(unknown)")
     local hash
     hash=$(md5sum "$filename" | awk '{print $1}')
     if [ "$hash" != "$expected_hash" ]
     then
        error "hash mismatch for $filename: expected $expected_hash but is $hash"
     else
        echo "$hash" > "$marker_file"
        success "$filename size and hash"
     fi
  fi
}

# get a single file
# params:
#   1: file - the file to download
get_file() {
  local file="$1"
  if [ ! -f $file ]
  then
    action "downloading $file"
    wget $BASE_URL/$file
  else
    success "$file exists"
  fi
}

# download a single part file and verify its size and md5 hash
# params:
#   1: file - the file to download and verify
get_single_file() {
  local file="$1"
  # declare/assign separately so mark_filesize's exit status is not masked
  local expected_filesize
  expected_filesize=$(mark_filesize "$file")
  get_file "$file"
  md5_and_mark "$file" "$expected_filesize"
}


# get the complete file list of split parts in forward order
# md5sums lines are "<32-char-hash>  <filename>": the filename starts at
# column 35; "\.0" matches the numeric part suffix (e.g. ".000001")
file_list() {
 # reverse order variant: append " | sort -r"
 grep "\.0" "$MD5SUMS" | cut -c35-
}

# download all files from the file list
download() {
  local file
  # read line-by-line so filenames are never word-split or globbed;
  # process substitution keeps the loop in the current shell
  while IFS= read -r file; do
    get_single_file "$file"
  done < <(file_list)
}

# --- reassembly helpers ---

# get the target filename from the first file in the list
# (the first part's name minus its trailing numeric ".NNNNNN" suffix)
get_target_filename() {
  file_list | awk 'NR == 1 { sub(/\.[0-9]+$/, ""); print; exit }'
}

# reassemble the original file from its downloaded and verified parts
recat() {
  local target
  target=$(get_target_filename)
  action "reassembling downloads into $target"
  local expected_filesize
  expected_filesize=$(mark_filesize "$target")

  # refuse to concatenate unless every part has its .md5 verification marker
  local missing=0
  local file
  while IFS= read -r file; do
    if [ ! -f "$file.md5" ]
    then
      warn "$file not verified yet"
      missing=1
    fi
  done < <(file_list)

  if [ "$missing" -eq 1 ]
  then
    error "not all parts verified - run --download first"
  fi

  # concatenate parts in list order with a pv progress bar;
  # an array keeps filenames intact instead of relying on word splitting
  if [ -f "$target" ]
  then
    warn "$target exists will not override"
  else
    local parts=()
    mapfile -t parts < <(file_list)
    pv -petrab "${parts[@]}" > "$target"
  fi

  if [ ! -s "$target" ]
  then
    error "reassembly failed - target file empty"
  fi

  local assembled_size
  assembled_size=$(get_local_filesize "$target")
  success "reassembled $target ($assembled_size bytes)"
  md5_and_mark "$target" "$expected_filesize"
}

# Parse command-line arguments - no arguments means show usage
if [[ $# -eq 0 ]]; then
  usage
fi

# main command-line-interface loop
while [[ $# -gt 0 ]]; do
  case $1 in
    -h|--help) usage ;;
    -l|--list)
      get_file "$MD5SUMS"
      file_list
      ;;
    --download)
      get_file "$MD5SUMS"
      download
      ;;
    --single)
      shift
      if [[ $# -eq 0 ]]; then
        error "single needs file number"
      fi
      # build the part name from the shared prefix and the two-digit number
      file="$PREFIX$1"
      get_single_file "$file"
      ;;
    --cat)
      # FIX: no shift here - --cat takes no argument; the extra shift
      # combined with the loop's shift used to swallow the next option
      get_file "$MD5SUMS"
      recat
      ;;
    *)
      # FIX: reject unknown options instead of silently ignoring them
      error "unknown option: $1"
      ;;
  esac
  shift
done
# FIX: removed the unconditional trailing "usage" call which printed the
# help text and exited with status 1 even after a successful command

./jnlget -h
Usage ./jnlget [OPTIONS]
  Options:
    -h, --help        Show this help message 
    --download        Download all files
    -l, --list        Show the file list 
    --single [number] Get a single file with the given number
    --cat             reassemble the original file

stepwise download

You might want to experiment with how many parallel downloads work best for you: we saw throughput ranging from less than 1 MB/s up to 35 MB/s on the 10 GBit line of the RWTH Aachen server.

for i in {00..19}; do nohup ./jnlget --single $i > jnlget_$i.out 2>&1 & done
for i in {20..39}; do nohup ./jnlget --single $i > jnlget_$i.out 2>&1 & done
for i in {40..59}; do nohup ./jnlget --single $i > jnlget_$i.out 2>&1 & done
for i in {60..81}; do nohup ./jnlget --single $i > jnlget_$i.out 2>&1 & done

file examples

-rw-rw-r-- 1 wf wf 5368709120 Dec  7 16:54 2025-12-07-wikidata-data.jnl.gz.000000
-rw-rw-r-- 1 wf wf         33 Dec 13 21:14 2025-12-07-wikidata-data.jnl.gz.000000.md5
-rw-rw-r-- 1 wf wf         11 Dec 13 20:59 2025-12-07-wikidata-data.jnl.gz.000000.size
cat 2025-12-07-wikidata-data.jnl.gz.000000.md5 
74b3e57fd54d12e3a83b8308b092c813
cat 2025-12-07-wikidata-data.jnl.gz.000000.size
5368709120
./jnlget --single 00
2025-12-07-wikidata-data.jnl.gz.000000 exists
✓ past 2025-12-07-wikidata-data.jnl.gz.000000 size and hash

reassemble

./jnlget --cat
✓ md5sums.txt exists
➜ reassembling downloads into 2025-12-07-wikidata-data.jnl.gz
⚠ 2025-12-07-wikidata-data.jnl.gz exists will not override
✓ reassembled 2025-12-07-wikidata-data.jnl.gz (438103694405 bytes)
➜ checking md5 for 2025-12-07-wikidata-data.jnl.gz of size 438103694405 to be ad0006a38103efd715c782a539a6f482
✓ 2025-12-07-wikidata-data.jnl.gz size and hash

unpack

cd /hd/alpha/blazegraph/get-your-own-wdqs/data
 pv -petrab /hd/gamma/wikidata/2025-12-07-wikidata-data.jnl.gz | gunzip > data.jnl
 639MiB 0:00:14 [46.5MiB/s] [45.7MiB/s] [>                                                        ]  0% ETA 2:32:10
 408GiB 2:24:26 [48.2MiB/s] [48.2MiB/s] [===================================================================================>] 100%  
ls -l
total 1236219648
-rw-rw-r-- 1 wf wf 1265888788480 Dec 13 23:50 data.jnl

launch

./launch 
[+] Running 4/4
 ✔ Network get-your-own-wdqs_default            Created                                                                                   0.1s 
 ✔ Container get-your-own-wdqs-wdqs-1           Started                                                                                   0.4s 
 ✔ Container get-your-own-wdqs-wdqs-proxy-1     Started                                                                                   0.6s 
 ✔ Container get-your-own-wdqs-wdqs-frontend-1  Started                                                                                   1.0s 
         https://www.eclipse.org/jetty/documentation/
2025-12-13 22:52:29.641:INFO:oejr.Runner:main: Runner
2025-12-13 22:52:29.743:INFO:oejs.Server:main: jetty-9.4.12.v20180830; built: 2018-08-30T13:59:14.071Z; git: 27208684755d94a92186989f695db2d7b21ebc51; jvm 1.8.0_212-b04
ENABLE_UPDATE_LOOP=true

updater