Wikidata Import 2025-12-13
Import
| Import | |
|---|---|
| state | |
| url | https://wiki.bitplan.com/index.php/Wikidata_Import_2025-12-13 |
| target | blazegraph |
| start | 2025-12-07 |
| end | 2025-12-13 |
| days | 6 |
| os | Ubuntu 22.04.5 LTS |
| cpu | AMD Ryzen 9 5900X 12-Core Processor |
| ram | 128 GB |
| triples | |
| comment | seeded with a 1.3 TB data.jnl file originally provided by James Hare; see https://phabricator.wikimedia.org/T403627 |
jnlget
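The jnlget script below downloads the split, gzip-compressed Blazegraph journal parts from files.scatter.red, verifies each part's size and md5 hash against the published md5sums.txt, and reassembles the parts into the original 2025-12-07-wikidata-data.jnl.gz file.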
<source lang='bash'>
#!/bin/bash
# download and reassemble wikidata blazegraph journal file
# see https://phabricator.wikimedia.org/T403627
# WF 2025-12-12
# Configuration
BASE_URL=https://files.scatter.red/orb/2025/12
PREFIX=2025-12-07-wikidata-data.jnl.gz.0000
MD5SUMS=md5sums.txt
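# md5sums.txt is fetched from BASE_URL on demand (see get_file) and lists the md5 hash and name of each part file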
# colored messages
HASH_MESSAGES_FILE="$HOME/source/bash/mp/bash_messages"
if [ -f "$HASH_MESSAGES_FILE" ]; then
# shellcheck disable=SC1090
source "$HASH_MESSAGES_FILE"
else
# Fallback
error() { echo "Error: $1" >&2; [ "${2:-1}" -ne 0 ] && exit "${2:-1}"; }
success() { echo "Success: $1"; }
action() { echo "Action: $1"; }
warn() { echo "Warning: $1"; }
fi
# show usage
usage() {
cat <<EOF
Usage: $0 [OPTIONS]
Options:
-h, --help Show this help message
--download Download all files
-l, --list Show the file list
--single [number] Get a single file with the given number
--cat Reassemble the downloaded parts into the original file
EOF
exit 1
}
# Gets the Content-Length of a remote file
# params
# 1: url - the url of the remote file
get_remote_filesize() {
local url="$1"
# -s: Silent
# -I: Header only
# -L: Follow redirects
curl -sIL "$url" | grep -i "Content-Length" | tail -n 1 | awk '{print $2}' | tr -d '\r'
}
# get the filesize
# params
# 1: file - the name of the file
get_local_filesize() {
local file="$1"
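# stat -c%s is GNU coreutils syntax; on BSD/macOS the equivalent would be: stat -f%z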
stat -c%s "$file"
}
# cache the remote file size in a <file>.size marker file and print it
# params:
# 1: file - the file to get the size for
mark_filesize() {
local file="$1"
if [ ! -f "$file.size" ]
then
local file_size=$(get_remote_filesize "$BASE_URL/$file")
echo "$file_size" > "$file.size"
fi
cat "$file.size"
}
# calculate md5 hash and compare to the expected one - creates a marker file if ok
# params:
# 1: filename - the file to check
# 2: expected_filesize - the expected size in bytes
md5_and_mark() {
local filename="$1"
local expected_filesize="$2"
local expected_hash=$(awk -v fn="$filename" '$2 == fn {print $1}' "$MD5SUMS")
local marker_file="${filename}.md5"
if [ -f "$marker_file" ]
then
success "$filename size and hash already verified"
return
fi
action "checking md5 for $filename of size $expected_filesize to be $expected_hash"
local filesize=$(get_local_filesize "$filename")
if [ "$expected_filesize" != "$filesize" ]
then
error "size mismatch for $filename: expected $expected_filesize but is $filesize"
else
local hash=$(md5sum "$filename" | awk '{print $1}')
if [ "$hash" != "$expected_hash" ]
then
error "hash mismatch for $filename: expected $expected_hash but is $hash"
else
echo "$hash" > "$marker_file"
success "$filename size and hash"
fi
fi
}
# download a file from BASE_URL unless it already exists locally
# params:
# 1: file - the file to download
get_file() {
local file="$1"
if [ ! -f "$file" ]
then
action "downloading $file"
wget "$BASE_URL/$file"
else
success "$file exists"
fi
}
# download a single part file and verify its size and md5 hash
# params:
# 1: file - the file to download
get_single_file() {
local file="$1"
local expected_filesize=$(mark_filesize "$file")
get_file "$file"
md5_and_mark "$file" "$expected_filesize"
}
# get the complete list of part file names
# md5sums.txt lines have the format "<32-char md5>  <filename>" so the name starts at column 35
file_list() {
# reverse order alternative:
# grep "\.0" "$MD5SUMS" | cut -c35- | sort -r
grep "\.0" "$MD5SUMS" | cut -c35-
}
# download all files from the file list
download() {
for file in $(file_list)
do
get_single_file "$file"
done
}
# get the target filename from the first file in the list
get_target_filename() {
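# strip the trailing numeric part suffix (e.g. .000001) so that 2025-12-07-wikidata-data.jnl.gz remains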
file_list | head -n 1 | sed 's/\.[0-9]\+$//'
}
# reassemble
recat() {
local target=$(get_target_filename)
action "reassembling downloads into $target"
local expected_filesize=$(mark_filesize "$target")
# verify all parts are downloaded and verified
local missing=0
for file in $(file_list)
do
if [ ! -f "$file.md5" ]
then
warn "$file not verified yet"
missing=1
fi
done
if [ $missing -eq 1 ]
then
error "not all parts verified - run --download first"
fi
# concatenate parts in order
if [ -f "$target" ]
then
warn "$target already exists - will not overwrite"
else
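# pv shows progress (-p), ETA (-e), elapsed time (-t), rate (-r), average rate (-a) and byte count (-b) while concatenating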
pv -petrab $(file_list) > "$target"
fi
if [ ! -s "$target" ]
then
error "reassembly failed - target file empty"
fi
local assembled_size=$(get_local_filesize "$target")
success "reassembled $target ($assembled_size bytes)"
md5_and_mark "$target" "$expected_filesize"
}
# Parse command-line arguments
if [[ $# -eq 0 ]]; then
usage
fi
# main command-line-interface loop
while [[ $# -gt 0 ]]; do
case $1 in
-h|--help) usage ;;
-l|--list)
get_file "$MD5SUMS"
file_list
;;
--download)
get_file "$MD5SUMS"
download
;;
--single)
shift
if [[ $# -eq 0 ]]; then
error "--single needs a part number"
fi
file="$PREFIX$1"
get_single_file "$file"
;;
--cat)
get_file "$MD5SUMS"
recat
recat
;;
*) error "unknown option: $1" ;;
esac
shift
done
</source>
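A typical session, assuming the script has been saved as an executable file named jnlget in the download directory (name and location are not specified above), might look like this:

<source lang='bash'>
# fetch md5sums.txt and show the list of part files
./jnlget --list
# download all parts and verify the size and md5 hash of each
./jnlget --download
# alternatively fetch just one part by its numeric suffix (two trailing digits assumed here)
./jnlget --single 01
# concatenate the verified parts into 2025-12-07-wikidata-data.jnl.gz
./jnlget --cat
</source>

The reassembled file is still gzip-compressed; decompressing it, for example with gunzip or pigz -d, should yield the 1.3 TB data.jnl journal used to seed Blazegraph.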