Wikidata Import 2025-12-13

Import
state  
url  https://wiki.bitplan.com/index.php/Wikidata_Import_2025-12-13
target  blazegraph
start  2025-12-07
end  2025-12-13
days  6
os  Ubuntu 22.04.5 LTS
cpu  AMD Ryzen 9 5900X 12-Core Processor
ram  128
triples  
comment  seeded with a 1.3 TB data.jnl file originally provided by James Hare, see https://phabricator.wikimedia.org/T403627

see also Wikidata_Import_2025-06-06

jnlget

#!/bin/bash
# download and reassemble wikidata blazegraph journal file 
# see https://phabricator.wikimedia.org/T403627
# WF 2025-12-12

# Configuration
BASE_URL=https://files.scatter.red/orb/2025/12
PREFIX=2025-12-07-wikidata-data.jnl.gz.0000
MD5SUMS=md5sums.txt
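# the journal is split into parts named ${PREFIX}NN with a two-digit
# index - 2025-12-07-wikidata-data.jnl.gz.000000 .. .000081 for this import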

# colored messages
HASH_MESSAGES_FILE="$HOME/source/bash/mp/bash_messages"

if [ -f "$HASH_MESSAGES_FILE" ]; then
  # shellcheck disable=SC1090
  source "$HASH_MESSAGES_FILE"
else
  # Fallback
  error() { echo "Error: $1" >&2; [ "${2:-1}" -ne 0 ] && exit "${2:-1}"; }
  success() { echo "Success: $1"; }
  action() { echo "Action: $1"; }
  warn() { echo "Warning: $1"; }
fi

# show usage
usage() {
  cat <<EOF
Usage: $0 [OPTIONS]
  Options:
    -h, --help        Show this help message
    --download        Download all files
    -l, --list        Show the file list
    --single [number] Get a single file with the given number
    --cat             Reassemble the original file
EOF
  exit 1
}



# Gets the Content-Length of a remote file 
# params
#  1: url - file size of the given url
get_remote_filesize() {
    local url="$1"
    # -s: Silent
    # -I: Header only
    # -L: Follow redirects
    curl -sIL "$url" | grep -i "Content-Length" | tail -n 1 | awk '{print $2}' | tr -d '\r'
}

# get the filesize
# params
#  1: file - the name of the file
get_local_filesize() {
   local file="$1"
   stat -c%s "$file"
}

# create a filesize marker
# params:
#   1: file - the file  to get the size for
mark_filesize() {
  local file="$1"
  if [ ! -f "$file.size" ]
  then
    local file_size
    file_size=$(get_remote_filesize "$BASE_URL/$file")
    echo "$file_size" > "$file.size"
  fi
  cat "$file.size"
}

# calculate md5 hash and compare to expected - creates marker file if ok
# params:
#   1: filename
#   2: expected hash
md5_and_mark() {
  local filename="$1"
  local expected_filesize="$2"
  local expected_hash
  expected_hash=$(awk -v fn="$filename" '$2 == fn {print $1}' "$MD5SUMS")
  local marker_file="${filename}.md5"
  if [ -f "$marker_file" ]
  then
    success "passed $filename size and hash"
    return
  fi
  action "checking md5 for $filename of size $expected_filesize to be $expected_hash"
  local filesize
  filesize=$(get_local_filesize "$filename")
  if [ "$expected_filesize" != "$filesize" ]
  then
     error "size mismatch for $filename: expected $expected_filesize but is $filesize"
  else
     local hash
     hash=$(md5sum "$filename" | awk '{print $1}')
     if [ "$hash" != "$expected_hash" ]
     then
        error "hash mismatch for $filename: expected $expected_hash but is $hash"
     else
        echo "$hash" > "$marker_file"
        success "$filename size and hash"
     fi
  fi
}

# get a single file
# params:
#   1: file - the file to download
get_file() {
  local file="$1"
  if [ ! -f "$file" ]
  then
    action "downloading $file"
    wget "$BASE_URL/$file"
  else
    success "$file exists"
  fi
}

# download a single file and verify its size and md5 hash
# params:
#   1: file - the file to download
get_single_file() {
  local file="$1"
  local expected_filesize
  expected_filesize=$(mark_filesize "$file")
  get_file "$file"
  md5_and_mark "$file" "$expected_filesize"
}


# get the complete file list
file_list() {
  # part names start at column 35 of the md5sums file;
  # append "| sort -r" to process the list in reverse order
  grep "\.0" "$MD5SUMS" | cut -c35-
}

# download all files from the file list
download() {
  for file in $(file_list)
  do
    get_single_file "$file"
  done
} 

# get the target filename from the first file in the list
get_target_filename() {
  file_list | head -n 1 | sed 's/\.[0-9]\+$//'
}

# reassemble
recat() {
  local target=$(get_target_filename)
  action "reassembling downloads into $target"
  local expected_filesize=$(mark_filesize "$target")

  # verify all parts are downloaded and verified
  local missing=0
  for file in $(file_list)
  do
    if [ ! -f "$file.md5" ]
    then
      warn "$file not verified yet"
      missing=1
    fi
  done

  if [ $missing -eq 1 ]
  then
    error "not all parts verified - run --download first"
  fi

  # concatenate parts in order
  if [ -f "$target" ]
  then
    warn "$target exists will not override"
  else
    pv -petrab $(file_list) > "$target"
  fi

  if [ ! -s "$target" ]
  then
    error "reassembly failed - target file empty"
  fi

  local assembled_size=$(get_local_filesize "$target")
  success "reassembled $target ($assembled_size bytes)"
  md5_and_mark "$target" $expected_filesize
}

# Parse command-line arguments 
if [[ $# -eq 0 ]]; then
  usage
fi

# main command-line-interface loop
while [[ $# -gt 0 ]]; do
  case $1 in
    -h|--help) usage ;;
    -l|--list) 
      get_file $MD5SUMS
      file_list
      ;;
    --download) 
      get_file $MD5SUMS
      download
      ;;
    --single)
      shift
      if [[ $# -eq 0 ]]; then
	error "single needs file number"
      fi
      number=$1
      file=$PREFIX$1
      get_single_file $file
      ;;
    --cat)
      shift
      get_file $MD5SUMS
      recat
      ;;
  esac
  shift
done

usage

./jnlget -h
Usage: ./jnlget [OPTIONS]
  Options:
    -h, --help        Show this help message
    --download        Download all files
    -l, --list        Show the file list
    --single [number] Get a single file with the given number
    --cat             Reassemble the original file
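
If you do not need parallelism, a plain background run fetches and verifies all parts sequentially (a minimal sketch following the same nohup pattern as the parallel loops below):

nohup ./jnlget --download > jnlget.out 2>&1 &
tail -f jnlget.out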

stepwise download

You might want to experiment with how many parallel downloads work for you. We had mixed results, ranging from less than 1 MB/s to 35 MB/s, on the 10 Gbit line of the RWTH Aachen server.

for i in {00..19}; do nohup ./jnlget --single $i > jnlget_$i.out 2>&1 & done
for i in {20..39}; do nohup ./jnlget --single $i > jnlget_$i.out 2>&1 & done
for i in {40..59}; do nohup ./jnlget --single $i > jnlget_$i.out 2>&1 & done
for i in {60..81}; do nohup ./jnlget --single $i > jnlget_$i.out 2>&1 & done
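
Each verified part leaves a .md5 marker file next to it, so a quick way to watch overall progress is to count the markers (a sketch, assuming the part naming above):

# count verified parts (82 expected for 00..81)
ls 2025-12-07-wikidata-data.jnl.gz.0000*.md5 2>/dev/null | wc -l
# follow the log of an individual worker
tail -f jnlget_00.out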

reassemble

./jnlget --cat
✓ md5sums.txt exists
➜ reassembling downloads into 2025-12-07-wikidata-data.jnl.gz
⚠ 2025-12-07-wikidata-data.jnl.gz exists - will not overwrite
✓ reassembled 2025-12-07-wikidata-data.jnl.gz (438103694405 bytes)
➜ checking md5 for 2025-12-07-wikidata-data.jnl.gz of size 438103694405 to be ad0006a38103efd715c782a539a6f482
✓ 2025-12-07-wikidata-data.jnl.gz size and hash
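
The verified archive still has to be unpacked into the data.jnl journal mentioned in the import comment above. A minimal sketch, using pv for a progress display (a plain gunzip works just as well):

pv 2025-12-07-wikidata-data.jnl.gz | gunzip > data.jnl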