Wikidata Import 2025-12-13
Import
| Import | |
|---|---|
| state | ✅ |
| url | https://wiki.bitplan.com/index.php/Wikidata_Import_2025-12-13 |
| target | blazegraph |
| start | 2025-12-07 |
| end | 2025-12-13 |
| days | 6 |
| os | Ubuntu 22.04.3 LTS |
| cpu | Intel(R) Xeon(R) Gold 6326 CPU @ 2.90GHz (16 cores) |
| ram | 512 GB |
| triples | 17072447568 |
| comment | seeded with 1.3 TB data.jnl file originally provided by James Hare see https://phabricator.wikimedia.org/T403627 |
see also Wikidata_Import_2025-06-06
jnlget
#!/bin/bash
# download and reassemble wikidata blazegraph journal file
# see https://phabricator.wikimedia.org/T403627
# WF 2025-12-12
# Configuration
BASE_URL=https://files.scatter.red/orb/2025/12
PREFIX=2025-12-07-wikidata-data.jnl.gz.0000
MD5SUMS=md5sums.txt
# colored messages
HASH_MESSAGES_FILE="$HOME/source/bash/mp/bash_messages"
if [ -f "$HASH_MESSAGES_FILE" ]; then
# shellcheck disable=SC1090
source "$HASH_MESSAGES_FILE"
else
# Fallback
error() { echo "Error: $1" >&2; [ "${2:-1}" -ne 0 ] && exit "${2:-1}"; }
success() { echo "Success: $1"; }
action() { echo "Action: $1"; }
warn() { echo "Warning: $1"; }
fi
# show usage
usage() {
cat <<EOF
Usage: $0 [OPTIONS]
Options:
-h, --help Show this help message
--download Download all files
-l, --list Show the file list
--single [number] Get a single file with the given number
--cat reassemble the original file
EOF
exit 1
}
# Gets the Content-Length of a remote file
# params
# 1: url - file size of the given url
get_remote_filesize() {
local url="$1"
# -s: Silent
# -I: Header only
# -L: Follow redirects
curl -sIL "$url" | grep -i "Content-Length" | tail -n 1 | awk '{print $2}' | tr -d '\r'
}
# get the filesize
# params
# 1: file - the name of the file
get_local_filesize() {
local file="$1"
stat -c%s "$file"
}
# create a filesize marker
# params:
# 1: file - the file to get the size for
mark_filesize() {
local file="$1"
if [ ! -f "$file.size" ]
then
local file_size=$(get_remote_filesize "$BASE_URL/$file")
echo "$file_size" > "$file.size"
fi
cat "$file.size"
}
# calculate md5 hash and compare to expected - creates marker file if ok
# params:
# 1: filename
# 2: expected hash
md5_and_mark() {
local filename="$1"
local expected_filesize="$2"
#local expected_hash=$(grep $filename $MD5SUMS | cut -c1-32)
local expected_hash=$(awk -v fn="$filename" '$2 == fn {print $1}' $MD5SUMS)
local marker_file="${filename}.md5"
if [ -f "$marker_file" ]
then
success "passed $filename size and hash"
return
fi
action "checking md5 for $filename of size $expected_filesize to be $expected_hash"
local filesize=$(get_local_filesize "$filename")
if [ "$expected_filesize" != "$filesize" ]
then
error "size mismatch for $filename: expected $expected_filesize but is $filesize"
else
local hash=$(md5sum "${filename}" | awk '{print $1}')
if [ "$hash" != "$expected_hash" ]
then
error "hash mismatch for $filename: expected $expected_hash but is $hash"
else
echo "$hash" > "$marker_file"
success "$filename size and hash"
fi
fi
}
# get a single file
# params:
# 1: file - the file to download
get_file() {
local file="$1"
if [ ! -f "$file" ]
then
action "downloading $file"
wget "$BASE_URL/$file"
else
success "$file exists"
fi
}
# download a single file and verify its size and md5 hash
# params:
# 1: file - the file to download and verify
get_single_file() {
local file="$1"
local expected_filesize=$(mark_filesize "$file")
get_file "$file"
md5_and_mark "$file" $expected_filesize
}
# get the complete file list
file_list() {
# reverse
#cat $MD5SUMS | grep "\.0" | cut -c35- | sort -r
# forward
grep "\.0" $MD5SUMS | cut -c35-
}
# download all files from the file list
download() {
for file in $(file_list)
do
get_single_file "$file"
done
}
# get the target filename from the first file in the list
get_target_filename() {
file_list | head -n 1 | sed 's/\.[0-9]\+$//'
}
# reassemble
recat() {
local target=$(get_target_filename)
action "reassembling downloads into $target"
local expected_filesize=$(mark_filesize "$target")
# verify all parts are downloaded and verified
local missing=0
for file in $(file_list)
do
if [ ! -f "$file.md5" ]
then
warn "$file not verified yet"
missing=1
fi
done
if [ $missing -eq 1 ]
then
error "not all parts verified - run --download first"
fi
# concatenate parts in order
if [ -f "$target" ]
then
warn "$target exists will not override"
else
pv -petrab $(file_list) > "$target"
fi
if [ ! -s "$target" ]
then
error "reassembly failed - target file empty"
fi
local assembled_size=$(get_local_filesize "$target")
success "reassembled $target ($assembled_size bytes)"
md5_and_mark "$target" $expected_filesize
}
# Parse command-line arguments
if [[ $# -eq 0 ]]; then
usage
fi
# main command-line-interface loop
while [[ $# -gt 0 ]]; do
case $1 in
-h|--help) usage ;;
-l|--list)
get_file $MD5SUMS
file_list
;;
--download)
get_file $MD5SUMS
download
;;
--single)
shift
if [[ $# -eq 0 ]]; then
error "single needs file number"
fi
number=$1
file="$PREFIX$number"
get_single_file "$file"
;;
--cat)
get_file $MD5SUMS
recat
;;
*) usage ;;
esac
shift
done
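Before kicking off a multi-hour run, the script can be linted statically; this assumes shellcheck is installed (the script already references it via the disable directive for the sourced bash_messages helper):
shellcheck jnlget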
usage
./jnlget -h
Usage: ./jnlget [OPTIONS]
Options:
-h, --help Show this help message
--download Download all files
-l, --list Show the file list
--single [number] Get a single file with the given number
--cat reassemble the original file
stepwise download
You might want to experiment with how many parallel downloads work for you: we saw mixed results, ranging from less than 1 MB/s to 35 MB/s, on the 10 GBit line of the RWTH Aachen server.
for i in {00..19}; do nohup ./jnlget --single $i > jnlget_$i.out 2>&1 & done
for i in {20..39}; do nohup ./jnlget --single $i > jnlget_$i.out 2>&1 & done
for i in {40..59}; do nohup ./jnlget --single $i > jnlget_$i.out 2>&1 & done
for i in {60..81}; do nohup ./jnlget --single $i > jnlget_$i.out 2>&1 & done
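Since the .md5 marker files are only written after a successful hash check, counting them gives a minimal progress check for the 82 parts (000000 to 000081), and the nohup logs show what each worker last did:
ls 2025-12-07-wikidata-data.jnl.gz.0000*.md5 2>/dev/null | wc -l  # 82 when all parts are verified
tail -n 2 jnlget_*.out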
file examples
-rw-rw-r-- 1 wf wf 5368709120 Dec 7 16:54 2025-12-07-wikidata-data.jnl.gz.000000
-rw-rw-r-- 1 wf wf 33 Dec 13 21:14 2025-12-07-wikidata-data.jnl.gz.000000.md5
-rw-rw-r-- 1 wf wf 11 Dec 13 20:59 2025-12-07-wikidata-data.jnl.gz.000000.size
cat 2025-12-07-wikidata-data.jnl.gz.000000.md5
74b3e57fd54d12e3a83b8308b092c813
cat 2025-12-07-wikidata-data.jnl.gz.000000.size
5368709120
./jnlget --single 00
✓ 2025-12-07-wikidata-data.jnl.gz.000000 exists
✓ passed 2025-12-07-wikidata-data.jnl.gz.000000 size and hash
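The parts can also be cross-checked manually against md5sums.txt with the standard md5sum tool, independent of the script; a sketch for part 000000:
grep 000000 md5sums.txt | md5sum -c -
# expected: 2025-12-07-wikidata-data.jnl.gz.000000: OK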
reassemble
./jnlget --cat
✓ md5sums.txt exists
➜ reassembling downloads into 2025-12-07-wikidata-data.jnl.gz
⚠ 2025-12-07-wikidata-data.jnl.gz exists, will not overwrite
✓ reassembled 2025-12-07-wikidata-data.jnl.gz (438103694405 bytes)
➜ checking md5 for 2025-12-07-wikidata-data.jnl.gz of size 438103694405 to be ad0006a38103efd715c782a539a6f482
✓ 2025-12-07-wikidata-data.jnl.gz size and hash
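Once the reassembled file has its own .md5 marker, the roughly 438 GB of part files (and their .size and .md5 markers) are no longer needed; a cautious cleanup sketch, using the exact filenames from this import:
# only remove the parts after the full file is verified
if [ -f 2025-12-07-wikidata-data.jnl.gz.md5 ]; then
  rm 2025-12-07-wikidata-data.jnl.gz.0000*
fi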
unpack
cd /hd/alpha/blazegraph/get-your-own-wdqs/data
pv -petrab /hd/gamma/wikidata/2025-12-07-wikidata-data.jnl.gz | gunzip > data.jnl
639MiB 0:00:14 [46.5MiB/s] [45.7MiB/s] [> ] 0% ETA 2:32:10
408GiB 2:24:26 [48.2MiB/s] [48.2MiB/s] [===================================================================================>] 100%
ls -l
total 1236219648
-rw-rw-r-- 1 wf wf 1265888788480 Dec 13 23:50 data.jnl
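The unpacked journal needs about 1.3 TB on the target filesystem, on top of the 438 GB compressed archive on its source drive, so checking free space before the gunzip step avoids a failed multi-hour run:
cd /hd/alpha/blazegraph/get-your-own-wdqs/data
df -h .  # needs roughly 1.3 TB free for data.jnl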
launch
./launch "RWTH Aachen i5 Wikidata Query Service"
[+] Running 4/4
✔ Network get-your-own-wdqs_default Created 0.1s
✔ Container get-your-own-wdqs-wdqs-1 Started 0.4s
✔ Container get-your-own-wdqs-wdqs-proxy-1 Started 0.6s
✔ Container get-your-own-wdqs-wdqs-frontend-1 Started 1.0s
https://www.eclipse.org/jetty/documentation/
2025-12-13 22:52:29.641:INFO:oejr.Runner:main: Runner
2025-12-13 22:52:29.743:INFO:oejs.Server:main: jetty-9.4.12.v20180830; built: 2018-08-30T13:59:14.071Z; git: 27208684755d94a92186989f695db2d7b21ebc51; jvm 1.8.0_212-b04
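Once the containers are up, the endpoint can be smoke-tested with a trivial SPARQL query; this sketch assumes Blazegraph is reachable on its default port 9999 on the host; the actual port mapping depends on the compose file:
curl -s 'http://localhost:9999/bigdata/namespace/wdq/sparql' \
  --data-urlencode 'query=SELECT * WHERE { ?s ?p ?o } LIMIT 1' \
  -H 'Accept: application/sparql-results+json'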
ENABLE_UPDATE_LOOP=true
updater
needs the fix for https://github.com/scatter-llc/private-wikidata-query/issues/10
https://github.com/WolfgangFahl/get-your-own-wdqs
08:23:13.115 [main] INFO o.w.q.r.t.change.RecentChangesPoller - Got 84 changes, from Q87995574@2439029761@20251207022049|2513998643 to Q106782689@2439029861@20251207022121|2513998742
08:23:18.400 [main] INFO org.wikidata.query.rdf.tool.Updater - Polled up to 2025-12-07T02:21:21Z (next: 20251207022122|2513998743) at (9.6, 5.1, 2.2) updates per second and (9611.9, 4693.4, 1949.3) milliseconds per second
08:23:18.697 [main] INFO o.w.q.r.t.change.RecentChangesPoller - Got 82 changes, from Q29865266@2439029862@20251207022122|2513998743 to Q137262275@2439029962@20251207022201|2513998846
...
09:32:22.102 [main] INFO org.wikidata.query.rdf.tool.Updater - Polled up to 2025-12-07T08:47:15Z (next: 20251207084715|2514083824) at (15.1, 12.9, 12.5) updates per second and (4178.7, 3544.2, 4163.0) milliseconds per second
09:32:22.366 [main] INFO o.w.q.r.t.change.RecentChangesPoller - Got 92 changes, from Q124335039@2439112863@20251207084715|2514083824 to Q124340206@2439112964@20251207084737|2514083926
...
10:13:08.690 [main] INFO org.wikidata.query.rdf.tool.Updater - Polled up to 2025-12-07T11:27:23Z (next: 20251207112723|2514137723) at (14.9, 14.7, 14.8) updates per second and (3701.0, 3826.5, 3802.8) milliseconds per second
Wikidata Catchup Calculation
Current lag: 7 days = 604,800 seconds
Processing rate: 3.8x real-time (≈3,800 ms of wiki time processed per real second, per the last Updater log line above)
Gain per real second: 3.8 - 1 = 2.8 seconds
Time to catch up:
604,800 ÷ 2.8 = 216,000 seconds
216,000 ÷ 3600 = 60 hours
60 ÷ 24 = 2.5 days
ETA: Tuesday Dec 16 around midnight
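The same arithmetic as a one-liner, so the ETA can be recomputed for a different lag or processing rate:
awk -v lag_days=7 -v rate=3.8 'BEGIN {
  secs = lag_days*24*3600 / (rate-1)
  printf "%.0f s = %.0f h = %.1f days\n", secs, secs/3600, secs/86400
}'
# 216000 s = 60 h = 2.5 days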