Wikidata Import 2025-12-13
Import
| Import | |
|---|---|
| state | ✅ |
| url | https://wiki.bitplan.com/index.php/Wikidata_Import_2025-12-13 |
| target | blazegraph |
| start | 2025-12-07 |
| end | 2025-12-13 |
| days | 6 |
| os | Ubuntu 22.04.5 LTS |
| cpu | AMD Ryzen 9 5900X 12-Core Processor |
| ram | 128 GB |
| triples | |
| comment | seeded with the 1.3 TB data.jnl file originally provided by James Hare, see https://phabricator.wikimedia.org/T403627 |
See also: Wikidata_Import_2025-06-06
jnlget
#!/bin/bash
# download and reassemble wikidata blazegraph journal file
# see https://phabricator.wikimedia.org/T403627
# WF 2025-12-12
# Configuration
BASE_URL=https://files.scatter.red/orb/2025/12
PREFIX=2025-12-07-wikidata-data.jnl.gz.0000
MD5SUMS=md5sums.txt
# colored messages
HASH_MESSAGES_FILE="$HOME/source/bash/mp/bash_messages"
if [ -f "$HASH_MESSAGES_FILE" ]; then
# shellcheck disable=SC1090
source "$HASH_MESSAGES_FILE"
else
# Fallback
error() { echo "Error: $1" >&2; [ "${2:-1}" -ne 0 ] && exit "${2:-1}"; }
success() { echo "Success: $1"; }
action() { echo "Action: $1"; }
warn() { echo "Warning: $1"; }
fi
# show usage
usage() {
cat <<EOF
Usage: $0 [OPTIONS]
Options:
-h, --help Show this help message
--download Download all files
-l, --list Show the file list
--single [number] Get a single file with the given number
--cat Reassemble the original file
EOF
exit 1
}
# Get the Content-Length of a remote file
# params
# 1: url - the url of the remote file
get_remote_filesize() {
local url="$1"
# -s: Silent
# -I: Header only
# -L: Follow redirects
curl -sIL "$url" | grep -i "Content-Length" | tail -n 1 | awk '{print $2}' | tr -d '\r'
}
# get the filesize
# params
# 1: file - the name of the file
get_local_filesize() {
local file="$1"
stat -c%s "$file"
}
# create a filesize marker
# params:
# 1: file - the file to get the size for
mark_filesize() {
local file="$1"
if [ ! -f "$file.size" ]
then
local file_size=$(get_remote_filesize "$BASE_URL/$file")
echo "$file_size" > "$file.size"
fi
cat "$file.size"
}
# check size, calculate md5 hash and compare to expected - creates marker file if ok
# params:
# 1: filename
# 2: expected filesize
md5_and_mark() {
local filename="$1"
local expected_filesize="$2"
local expected_hash=$(awk -v fn="$filename" '$2 == fn {print $1}' "$MD5SUMS")
local marker_file="${filename}.md5"
if [ -f "$marker_file" ]
then
success "passed $filename size and hash"
return
fi
action "checking md5 for $filename of size $expected_filesize to be $expected_hash"
local filesize=$(get_local_filesize "$filename")
if [ "$expected_filesize" != "$filesize" ]
then
error "size mismatch for $filename: expected $expected_filesize but is $filesize"
else
local hash=$(md5sum "${filename}" | awk '{print $1}')
if [ "$hash" != "$expected_hash" ]
then
error "hash mismatch for $filename: expected $expected_hash but is $hash"
else
echo "$hash" > "$marker_file"
success "$filename size and hash"
fi
fi
}
# get a single file
# params:
# 1: file - the file to download
get_file() {
local file="$1"
if [ ! -f "$file" ]
then
action "downloading $file"
wget "$BASE_URL/$file"
else
success "$file exists"
fi
}
# get a single file and verify its size and md5 hash
# params:
# 1: file - the file to download
get_single_file() {
local file="$1"
local expected_filesize=$(mark_filesize "$file")
get_file "$file"
md5_and_mark "$file" "$expected_filesize"
}
# get the complete file list
file_list() {
# reverse order
#grep "\.0" "$MD5SUMS" | cut -c35- | sort -r
# forward order
grep "\.0" "$MD5SUMS" | cut -c35-
}
# download all files from the file list
download() {
for file in $(file_list)
do
get_single_file "$file"
done
}
# get the target filename from the first file in the list
get_target_filename() {
file_list | head -n 1 | sed 's/\.[0-9]\+$//'
}
# reassemble
recat() {
local target=$(get_target_filename)
action "reassembling downloads into $target"
local expected_filesize=$(mark_filesize "$target")
# verify all parts are downloaded and verified
local missing=0
for file in $(file_list)
do
if [ ! -f "$file.md5" ]
then
warn "$file not verified yet"
missing=1
fi
done
if [ $missing -eq 1 ]
then
error "not all parts verified - run --download first"
fi
# concatenate parts in order
if [ -f "$target" ]
then
warn "$target exists will not override"
else
pv -petrab $(file_list) > "$target"
fi
if [ ! -s "$target" ]
then
error "reassembly failed - target file empty"
fi
local assembled_size=$(get_local_filesize "$target")
success "reassembled $target ($assembled_size bytes)"
md5_and_mark "$target" $expected_filesize
}
# Parse command-line arguments
if [[ $# -eq 0 ]]; then
usage
fi
# main command-line-interface loop
while [[ $# -gt 0 ]]; do
case $1 in
-h|--help) usage ;;
-l|--list)
get_file $MD5SUMS
file_list
;;
--download)
get_file $MD5SUMS
download
;;
--single)
shift
if [[ $# -eq 0 ]]; then
error "single needs file number"
fi
file="$PREFIX$1"
get_single_file "$file"
;;
--cat)
shift
get_file $MD5SUMS
recat
;;
*) error "unknown option: $1" ;;
esac
shift
done
./jnlget -h
Usage: ./jnlget [OPTIONS]
Options:
-h, --help Show this help message
--download Download all files
-l, --list Show the file list
--single [number] Get a single file with the given number
--cat Reassemble the original file
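A typical first run fetches md5sums.txt and lists the 82 parts before downloading them sequentially:
./jnlget --list
./jnlget --download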
stepwise download
You might want to experiment with how many parallel downloads work best for you; we saw mixed results, from less than 1 MB/s to 35 MB/s, on the 10 GBit line of the RWTH Aachen server. The loops below each start a batch of background downloads:
for i in {00..19}; do nohup ./jnlget --single $i > jnlget_$i.out 2>&1 & done
for i in {20..39}; do nohup ./jnlget --single $i > jnlget_$i.out 2>&1 & done
for i in {40..59}; do nohup ./jnlget --single $i > jnlget_$i.out 2>&1 & done
for i in {60..81}; do nohup ./jnlget --single $i > jnlget_$i.out 2>&1 & done
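To keep an eye on the background jobs you can count the .md5 marker files that jnlget creates - 82 are expected once all parts are verified - and follow the nohup logs:
ls 2025-12-07-wikidata-data.jnl.gz.0000*.md5 | wc -l
tail -f jnlget_*.out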
file examples
-rw-rw-r-- 1 wf wf 5368709120 Dec 7 16:54 2025-12-07-wikidata-data.jnl.gz.000000
-rw-rw-r-- 1 wf wf 33 Dec 13 21:14 2025-12-07-wikidata-data.jnl.gz.000000.md5
-rw-rw-r-- 1 wf wf 11 Dec 13 20:59 2025-12-07-wikidata-data.jnl.gz.000000.size
cat 2025-12-07-wikidata-data.jnl.gz.000000.md5
74b3e57fd54d12e3a83b8308b092c813
cat 2025-12-07-wikidata-data.jnl.gz.000000.size
5368709120
./jnlget --single 00
✓ 2025-12-07-wikidata-data.jnl.gz.000000 exists
✓ passed 2025-12-07-wikidata-data.jnl.gz.000000 size and hash
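Independently of the marker files, the downloaded parts can be re-checked directly against md5sums.txt; --ignore-missing (GNU coreutils 8.25+) skips parts that have not been downloaded yet:
md5sum -c --ignore-missing md5sums.txt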
reassemble
./jnlget --cat
✓ md5sums.txt exists
➜ reassembling downloads into 2025-12-07-wikidata-data.jnl.gz
⚠ 2025-12-07-wikidata-data.jnl.gz exists - will not overwrite
✓ reassembled 2025-12-07-wikidata-data.jnl.gz (438103694405 bytes)
➜ checking md5 for 2025-12-07-wikidata-data.jnl.gz of size 438103694405 to be ad0006a38103efd715c782a539a6f482
✓ 2025-12-07-wikidata-data.jnl.gz size and hash
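Once the reassembled file has its own .md5 marker, the ~408 GiB of part files and their marker files can optionally be removed; the glob below does not match the target itself since that lacks the .0000 suffix:
rm 2025-12-07-wikidata-data.jnl.gz.0000*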
unpack
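The compressed journal expands to roughly 1.2 TB, so check the free space on the target filesystem before unpacking:
df -h /hd/alpha/blazegraph/get-your-own-wdqs/data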
cd /hd/alpha/blazegraph/get-your-own-wdqs/data
pv -petrab /hd/gamma/wikidata/2025-12-07-wikidata-data.jnl.gz | gunzip > data.jnl
639MiB 0:00:14 [46.5MiB/s] [45.7MiB/s] [> ] 0% ETA 2:32:10
408GiB 2:24:26 [48.2MiB/s] [48.2MiB/s] [===================================================================================>] 100%
ls -l
total 1236219648
-rw-rw-r-- 1 wf wf 1265888788480 Dec 13 23:50 data.jnl
launch
./launch "RWTH Aachen i5 Wikidata Query Service"
[+] Running 4/4
✔ Network get-your-own-wdqs_default Created 0.1s
✔ Container get-your-own-wdqs-wdqs-1 Started 0.4s
✔ Container get-your-own-wdqs-wdqs-proxy-1 Started 0.6s
✔ Container get-your-own-wdqs-wdqs-frontend-1 Started 1.0s
https://www.eclipse.org/jetty/documentation/
2025-12-13 22:52:29.641:INFO:oejr.Runner:main: Runner
2025-12-13 22:52:29.743:INFO:oejs.Server:main: jetty-9.4.12.v20180830; built: 2018-08-30T13:59:14.071Z; git: 27208684755d94a92186989f695db2d7b21ebc51; jvm 1.8.0_212-b04
ENABLE_UPDATE_LOOP=true
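Once the containers are up, a quick smoke test shows whether the journal is being served - a minimal sketch, assuming the wdqs container exposes the standard Blazegraph endpoint on localhost port 9999:
curl -s -G "http://localhost:9999/bigdata/namespace/wdq/sparql" -H "Accept: application/sparql-results+json" --data-urlencode "query=PREFIX wd: <http://www.wikidata.org/entity/> SELECT ?p ?o WHERE { wd:Q42 ?p ?o } LIMIT 3"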