WikiData Import 2020-08-15

From BITPlan Wiki
Jump to navigation Jump to search

Environment

  1. Mac Pro Mid 2010
  2. 12 core 3.46 GHz
  3. 64 GB RAM
  4. macOS High Sierra 10.13.6
  5. Source Disk: 4 TB 7200 rpm hard disk WD Gold WDC WD4002FYYZ Blackmagic speed rating: 175 MB/s write 175 MB/s read
  6. Target Disk: 4 TB SSD Samsung 860 EVO Blackmagic speed rating: 257 MB/s write 270 MB/s read
  7. java -version
    openjdk version "11.0.5" 2019-10-15
    OpenJDK Runtime Environment AdoptOpenJDK (build 11.0.5+10)
    OpenJDK 64-Bit Server VM AdoptOpenJDK (build 11.0.5+10, mixed mode)
    

Summary

Download and unpack

Downloading took some 7 h unpacking took some 12h 30

date;wget https://dumps.wikimedia.org/wikidatawiki/entities/latest-all.nt.bz2;date
Sat Aug 15 17:47:23 CEST 2020
--2020-08-15 17:47:24--  https://dumps.wikimedia.org/wikidatawiki/entities/latest-all.nt.bz2
Resolving dumps.wikimedia.org (dumps.wikimedia.org)... 2620::861:1:208:80:154:7, 208.80.154.7
Connecting to dumps.wikimedia.org (dumps.wikimedia.org)|2620::861:1:208:80:154:7|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 119725690213 (112G) [application/octet-stream]
Saving to: ‘latest-all.nt.bz2’
latest-all.nt.bz2           0%[                                    ]  34.15M  4.61MB/s    eta 6h 58m 
...
latest-all.nt.bz2         100%[===================================>] 111.50G  4.88MB/s    in  6h 49m  
2020-08-16 00:37:06 (4.64 MB/s) - ‘latest-all.nt.bz2’ saved [119725690213/119725690213]
Sun Aug 16 00:37:06 CEST 2020
date;bzip2 -dk latest-all.nt.bz2;date
Sun Aug 16 07:02:30 CEST 2020
Sun Aug 16 19:37:34 CEST 2020
ls -l latest-all.nt
-rw-r--r--  1 wf  admin  2180458524345 Aug 13 20:22 latest-all.nt

Start and progress

nohup ./wikidata2jena&
tail -f tdb2-err.log 
19:54:45 INFO  loader          :: Loader = LoaderPhased
19:54:45 INFO  loader          :: Start: latest-all.nt
19:54:50 INFO  loader          :: Add: 500.000 latest-all.nt (Batch: 95.474 / Avg: 95.474)
19:54:53 INFO  loader          :: Add: 1.000.000 latest-all.nt (Batch: 186.846 / Avg: 126.374)
19:54:55 INFO  loader          :: Add: 1.500.000 latest-all.nt (Batch: 207.986 / Avg: 145.391)
19:54:58 INFO  loader          :: Add: 2.000.000 latest-all.nt (Batch: 166.500 / Avg: 150.150)
19:55:03 INFO  loader          :: Add: 2.500.000 latest-all.nt (Batch: 100.603 / Avg: 136.686)
19:55:08 INFO  loader          :: Add: 3.000.000 latest-all.nt (Batch: 100.745 / Avg: 129.015)
19:55:13 INFO  loader          :: Add: 3.500.000 latest-all.nt (Batch: 99.226 / Avg: 123.709)
19:55:18 INFO  loader          :: Add: 4.000.000 latest-all.nt (Batch: 99.383 / Avg: 120.037)
19:55:24 INFO  loader          :: Add: 4.500.000 latest-all.nt (Batch: 94.589 / Avg: 116.553)
19:55:29 INFO  loader          :: Add: 5.000.000 latest-all.nt (Batch: 89.269 / Avg: 113.096)
19:55:29 INFO  loader          ::   Elapsed: 44,21 seconds [2020/08/16 19:55:29 MESZ]
19:55:35 INFO  loader          :: Add: 5.500.000 latest-all.nt (Batch: 93.861 / Avg: 111.028)
...
22:02:30 INFO  loader          :: Add: 1.000.000.000 latest-all.nt (Batch: 103.199 / Avg: 130.468)
22:02:30 INFO  loader          ::   Elapsed: 7.664,71 seconds [2020/08/16 22:02:30 MESZ]
...
00:30:49 INFO  loader          :: Add: 2.000.000.000 latest-all.nt (Batch: 117.647 / Avg: 120.741)
00:30:49 INFO  loader          ::   Elapsed: 16.564,27 seconds [2020/08/17 00:30:49 MESZ]
--
03:00:51 INFO  loader          :: Add: 3.000.000.000 latest-all.nt (Batch: 112.688 / Avg: 117.341)
03:00:51 INFO  loader          ::   Elapsed: 25.566,39 seconds [2020/08/17 03:00:51 MESZ]
--
06:33:58 INFO  loader          :: Add: 4.000.000.000 latest-all.nt (Batch: 41.569 / Avg: 104.293)
06:33:58 INFO  loader          ::   Elapsed: 38.353,15 seconds [2020/08/17 06:33:58 MESZ]

Scripts

wikidat2jena

#!/bin/bash
# WF 2020-05-10

# global settings
jena=apache-jena-3.16.0
tgz=$jena.tar.gz
jenaurl=http://mirror.easyname.ch/apache/jena/binaries/$tgz
base=/Volumes/Torterra/wikidata2020-08-15
data=$base/data
tdbloader=$jena/bin/tdb2.tdbloader

getjena() {
# download
if [ ! -f $tgz ]
then
  echo "downloading $tgz from $jenaurl"
	wget $jenaurl
else
  echo "$tgz already downloaded"
fi
# unpack
if [ ! -d $jena ]
then
  echo "unpacking $jena from $tgz"
	tar xvzf $tgz
else
  echo "$jena already unpacked"
fi
# create data directory
if [ ! -d $data ]
then
  echo "creating $data directory"
  mkdir -p $data
else
  echo "$data directory already created"
fi
}

#
# show the given timestamp
#
timestamp() {
 local msg="$1"
 local ts=$(date -u +"%Y-%m-%dT%H:%M:%SZ")
 echo "$msg at $ts"
}

#
# load data for the given data dir and input
#
loaddata() {
	local data="$1"
	local input="$2"
  timestamp "start loading $input to $data"
  $tdbloader --loc "$data" "$input" > tdb2-out.log 2> tdb2-err.log
	timestamp "finished loading $input to $data"
}

getjena
export TMPDIR=$base/tmp
if [ ! -d $TMPDIR ]
then
  echo "creating temporary directory $TMPDIR"
  mkdir $TMPDIR
else
  echo "using temporary directory $TMPDIR"
fi
loaddata $data latest-all.nt

check extract speed

./speed 
149 MB in 3 s =  49 MB/s = 172 GB/h
315/2014 GB (15 %) todo: 1699 GB
ETA 9.9 h

speed

#!/bin/bash
# WF 2020-08-16
# check extract speed

#
# check the speed of extraction
#
checkSpeed() {
  local l_secs="$1"
	local l_total="$2"
	totalgb=$(expr $l_total / 1024 / 1024 / 1024)
  first=$(du -sm latest-all.nt | cut -f1)
  sleep $l_secs 
  second=$(du -sm latest-all.nt | cut -f1)
  mbytes=$(expr $second - $first)
	mpersec=$(expr $mbytes / $l_secs)
	gperh=$(expr $mpersec \* 3600 / 1024)
	echo "$mbytes MB in $l_secs s =  $mpersec MB/s = $gperh GB/h"
	donegb=$(expr $second / 1024)
	todogb=$(expr $totalgb - $donegb)
	percent=$(expr $donegb \* 100 / $totalgb)
	eta=$(expr $todogb / $gperh)
	echo "$donegb/$totalgb GB ($percent %) todo: $todogb GB"
	awk -v todo=$todogb -v gperh=$gperh 'BEGIN { printf("ETA %.1f h\n",todo/gperh) }'
}

checkSpeed 3 2162713035569