Wikidata Import 2025-12-13

Import
state  
url  https://wiki.bitplan.com/index.php/Wikidata_Import_2025-12-13
target  blazegraph
start  2025-12-07
end  2025-12-13
days  6
os  Ubuntu 22.04.5 LTS
cpu  AMD Ryzen 9 5900X 12-Core Processor
ram  128
triples  
comment  seeded with a 1.3 TB data.jnl file originally provided by James Hare, see https://phabricator.wikimedia.org/T403627

see also Wikidata_Import_2025-06-06

jnlget

#!/bin/bash
# download and reassemble wikidata blazegraph journal file 
# see https://phabricator.wikimedia.org/T403627
# WF 2025-12-12

# Configuration
BASE_URL=https://files.scatter.red/orb/2025/12
PREFIX=2025-12-07-wikidata-data.jnl.gz.0000
MD5SUMS=md5sums.txt
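# the journal is split into parts named ${PREFIX}NN with a two-digit
# index - 2025-12-07-wikidata-data.jnl.gz.000000 .. .000081 for this import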

# colored messages
HASH_MESSAGES_FILE="$HOME/source/bash/mp/bash_messages"

if [ -f "$HASH_MESSAGES_FILE" ]; then
  # shellcheck disable=SC1090
  source "$HASH_MESSAGES_FILE"
else
  # Fallback
  error() { echo "Error: $1" >&2; [ "${2:-1}" -ne 0 ] && exit "${2:-1}"; }
  success() { echo "Success: $1"; }
  action() { echo "Action: $1"; }
  warn() { echo "Warning: $1"; }
fi

# show usage
usage() {
  cat <<EOF
Usage: $0 [OPTIONS]
  Options:
    -h, --help        Show this help message
    --download        Download all files
    -l, --list        Show the file list
    --single [number] Get a single file with the given number
    --cat             Reassemble the original file
EOF
  exit 1
}



# Gets the Content-Length of a remote file 
# params
#  1: url - file size of the given url
get_remote_filesize() {
    local url="$1"
    # -s: Silent
    # -I: Header only
    # -L: Follow redirects
    curl -sIL "$url" | grep -i "Content-Length" | tail -n 1 | awk '{print $2}' | tr -d '\r'
}

# get the filesize
# params
#  1: file - the name of the file
get_local_filesize() {
   local file="$1"
   stat -c%s "$file"
}

# create a filesize marker
# params:
#   1: file - the file  to get the size for
mark_filesize() {
  local file="$1"
  if [ ! -f "$file.size" ]
  then
    local file_size
    file_size=$(get_remote_filesize "$BASE_URL/$file")
    echo "$file_size" > "$file.size"
  fi
  cat "$file.size"
}

# calculate md5 hash and compare to expected - creates marker file if ok
# params:
#   1: filename
#   2: expected hash
md5_and_mark() {
  local filename="$1"
  local expected_filesize="$2"
  local expected_hash
  expected_hash=$(awk -v fn="$filename" '$2 == fn {print $1}' "$MD5SUMS")
  local marker_file="${filename}.md5"
  if [ -f "$marker_file" ]
  then
    success "passed $filename size and hash"
    return
  fi
  action "checking md5 for $filename of size $expected_filesize to be $expected_hash"
  local filesize
  filesize=$(get_local_filesize "$filename")
  if [ "$expected_filesize" != "$filesize" ]
  then
     error "size mismatch for $filename: expected $expected_filesize but is $filesize"
  else
     local hash
     hash=$(md5sum "$filename" | awk '{print $1}')
     if [ "$hash" != "$expected_hash" ]
     then
        error "hash mismatch for $filename: expected $expected_hash but is $hash"
     else
        echo "$hash" > "$marker_file"
        success "$filename size and hash"
     fi
  fi
}

# get a single file
# params:
#   1: file - the file to download
get_file() {
  local file="$1"
  if [ ! -f "$file" ]
  then
    action "downloading $file"
    wget "$BASE_URL/$file"
  else
    success "$file exists"
  fi
}

# download a single file and verify its size and md5 hash
# params:
#   1: file - the file to download
get_single_file() {
  local file="$1"
  local expected_filesize
  expected_filesize=$(mark_filesize "$file")
  get_file "$file"
  md5_and_mark "$file" "$expected_filesize"
}


# get the complete file list
file_list() {
  # part names start at column 35 of the md5sums file;
  # append "| sort -r" to process the list in reverse order
  grep "\.0" "$MD5SUMS" | cut -c35-
}

# download all files from the file list
download() {
  for file in $(file_list)
  do
    get_single_file "$file"
  done
} 

# get the target filename from the first file in the list
get_target_filename() {
  file_list | head -n 1 | sed 's/\.[0-9]\+$//'
}

# reassemble
recat() {
  local target=$(get_target_filename)
  action "reassembling downloads into $target"
  local expected_filesize=$(mark_filesize "$target")

  # verify all parts are downloaded and verified
  local missing=0
  for file in $(file_list)
  do
    if [ ! -f "$file.md5" ]
    then
      warn "$file not verified yet"
      missing=1
    fi
  done

  if [ $missing -eq 1 ]
  then
    error "not all parts verified - run --download first"
  fi

  # concatenate parts in order
  if [ -f "$target" ]
  then
    warn "$target exists will not override"
  else
    pv -petrab $(file_list) > "$target"
  fi

  if [ ! -s "$target" ]
  then
    error "reassembly failed - target file empty"
  fi

  local assembled_size=$(get_local_filesize "$target")
  success "reassembled $target ($assembled_size bytes)"
  md5_and_mark "$target" $expected_filesize
}

# Parse command-line arguments 
if [[ $# -eq 0 ]]; then
  usage
fi

# main command-line-interface loop
while [[ $# -gt 0 ]]; do
  case $1 in
    -h|--help) usage ;;
    -l|--list) 
      get_file $MD5SUMS
      file_list
      ;;
    --download) 
      get_file $MD5SUMS
      download
      ;;
    --single)
      shift
      if [[ $# -eq 0 ]]; then
	error "single needs file number"
      fi
      number=$1
      file=$PREFIX$1
      get_single_file $file
      ;;
    --cat)
      shift
      get_file $MD5SUMS
      recat
      ;;
  esac
  shift
done

usage

./jnlget -h
Usage: ./jnlget [OPTIONS]
  Options:
    -h, --help        Show this help message
    --download        Download all files
    -l, --list        Show the file list
    --single [number] Get a single file with the given number
    --cat             Reassemble the original file
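
If you do not need parallelism, a plain background run fetches and verifies all parts sequentially (a minimal sketch following the same nohup pattern as the parallel loops below):

nohup ./jnlget --download > jnlget.out 2>&1 &
tail -f jnlget.out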

stepwise download

You might want to experiment with how many parallel downloads work for you. We had mixed results, ranging from less than 1 MB/s to 35 MB/s, on the 10 Gbit line of the RWTH Aachen server.

for i in {00..19}; do nohup ./jnlget --single $i > jnlget_$i.out 2>&1 & done
for i in {20..39}; do nohup ./jnlget --single $i > jnlget_$i.out 2>&1 & done
for i in {40..59}; do nohup ./jnlget --single $i > jnlget_$i.out 2>&1 & done
for i in {60..81}; do nohup ./jnlget --single $i > jnlget_$i.out 2>&1 & done
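
Each verified part leaves a .md5 marker file next to it, so a quick way to watch overall progress is to count the markers (a sketch, assuming the part naming above):

# count verified parts (82 expected for 00..81)
ls 2025-12-07-wikidata-data.jnl.gz.0000*.md5 2>/dev/null | wc -l
# follow the log of an individual worker
tail -f jnlget_00.out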

reassemble

./jnlget --cat
✓ md5sums.txt exists
➜ reassembling downloads into 2025-12-07-wikidata-data.jnl.gz
⚠ 2025-12-07-wikidata-data.jnl.gz exists - will not overwrite
✓ reassembled 2025-12-07-wikidata-data.jnl.gz (438103694405 bytes)
➜ checking md5 for 2025-12-07-wikidata-data.jnl.gz of size 438103694405 to be ad0006a38103efd715c782a539a6f482
✓ 2025-12-07-wikidata-data.jnl.gz size and hash
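
The verified archive still has to be unpacked into the data.jnl journal mentioned in the import comment above. A minimal sketch, using pv for a progress display (a plain gunzip works just as well):

pv 2025-12-07-wikidata-data.jnl.gz | gunzip > data.jnl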