WikiData Import 2020-08-15

From BITPlan Wiki
Revision as of 06:51, 17 August 2020 by Wf (talk | contribs) (→‎Summary)
Jump to navigation Jump to search

Environment

  1. Mac Pro Mid 2010
  2. 12 core 3.46 GHz
  3. 64 GB RAM
  4. macOS High Sierra 10.13.6
  5. Source Disk: 4 TB 7200 rpm hard disk WD Gold WDC WD4002FYYZ Blackmagic speed rating: 175 MB/s write 175 MB/s read
  6. Target Disk: 4 TB SSD Samsung 860 EVO Blackmagic speed rating: 257 MB/s write 270 MB/s read
  7. java -version
    openjdk version "11.0.5" 2019-10-15
    OpenJDK Runtime Environment AdoptOpenJDK (build 11.0.5+10)
    OpenJDK 64-Bit Server VM AdoptOpenJDK (build 11.0.5+10, mixed mode)
    

Summary

  1. trying to replicate success story of https://issues.apache.org/jira/projects/JENA/issues/JENA-1909
  2. download of 111 GB took some 7 h
  3. unzipping to some 2030 GB took some 12 h 30 min
  4. phased import of some 13.8 billion lines is at 4.2 billion after half a day

Download and unpack

Downloading took some 7 h; unpacking took some 12 h 30 min.

date;wget https://dumps.wikimedia.org/wikidatawiki/entities/latest-all.nt.bz2;date
Sat Aug 15 17:47:23 CEST 2020
--2020-08-15 17:47:24--  https://dumps.wikimedia.org/wikidatawiki/entities/latest-all.nt.bz2
Resolving dumps.wikimedia.org (dumps.wikimedia.org)... 2620::861:1:208:80:154:7, 208.80.154.7
Connecting to dumps.wikimedia.org (dumps.wikimedia.org)|2620::861:1:208:80:154:7|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 119725690213 (112G) [application/octet-stream]
Saving to: ‘latest-all.nt.bz2’
latest-all.nt.bz2           0%[                                    ]  34.15M  4.61MB/s    eta 6h 58m 
...
latest-all.nt.bz2         100%[===================================>] 111.50G  4.88MB/s    in  6h 49m  
2020-08-16 00:37:06 (4.64 MB/s) - ‘latest-all.nt.bz2’ saved [119725690213/119725690213]
Sun Aug 16 00:37:06 CEST 2020
date;bzip2 -dk latest-all.nt.bz2;date
Sun Aug 16 07:02:30 CEST 2020
Sun Aug 16 19:37:34 CEST 2020
ls -l latest-all.nt
-rw-r--r--  1 wf  admin  2180458524345 Aug 13 20:22 latest-all.nt

Start and progress

egrep "000\.000\.000" -A1  tdb2-err.log
nohup ./wikidata2jena&
tail -f tdb2-err.log 
19:54:45 INFO  loader          :: Loader = LoaderPhased
19:54:45 INFO  loader          :: Start: latest-all.nt
19:54:50 INFO  loader          :: Add: 500.000 latest-all.nt (Batch: 95.474 / Avg: 95.474)
19:54:53 INFO  loader          :: Add: 1.000.000 latest-all.nt (Batch: 186.846 / Avg: 126.374)
19:54:55 INFO  loader          :: Add: 1.500.000 latest-all.nt (Batch: 207.986 / Avg: 145.391)
19:54:58 INFO  loader          :: Add: 2.000.000 latest-all.nt (Batch: 166.500 / Avg: 150.150)
19:55:03 INFO  loader          :: Add: 2.500.000 latest-all.nt (Batch: 100.603 / Avg: 136.686)
19:55:08 INFO  loader          :: Add: 3.000.000 latest-all.nt (Batch: 100.745 / Avg: 129.015)
19:55:13 INFO  loader          :: Add: 3.500.000 latest-all.nt (Batch: 99.226 / Avg: 123.709)
19:55:18 INFO  loader          :: Add: 4.000.000 latest-all.nt (Batch: 99.383 / Avg: 120.037)
19:55:24 INFO  loader          :: Add: 4.500.000 latest-all.nt (Batch: 94.589 / Avg: 116.553)
19:55:29 INFO  loader          :: Add: 5.000.000 latest-all.nt (Batch: 89.269 / Avg: 113.096)
19:55:29 INFO  loader          ::   Elapsed: 44,21 seconds [2020/08/16 19:55:29 MESZ]
19:55:35 INFO  loader          :: Add: 5.500.000 latest-all.nt (Batch: 93.861 / Avg: 111.028)
...
22:02:30 INFO  loader          :: Add: 1.000.000.000 latest-all.nt (Batch: 103.199 / Avg: 130.468)
22:02:30 INFO  loader          ::   Elapsed: 7.664,71 seconds [2020/08/16 22:02:30 MESZ]
...
00:30:49 INFO  loader          :: Add: 2.000.000.000 latest-all.nt (Batch: 117.647 / Avg: 120.741)
00:30:49 INFO  loader          ::   Elapsed: 16.564,27 seconds [2020/08/17 00:30:49 MESZ]
--
03:00:51 INFO  loader          :: Add: 3.000.000.000 latest-all.nt (Batch: 112.688 / Avg: 117.341)
03:00:51 INFO  loader          ::   Elapsed: 25.566,39 seconds [2020/08/17 03:00:51 MESZ]
--
06:33:58 INFO  loader          :: Add: 4.000.000.000 latest-all.nt (Batch: 41.569 / Avg: 104.293)
06:33:58 INFO  loader          ::   Elapsed: 38.353,15 seconds [2020/08/17 06:33:58 MESZ]
...
07:45:46 INFO  loader          :: Add: 4.120.000.000 latest-all.nt (Batch: 17.885 / Avg: 96.575)
07:45:46 INFO  loader          ::   Elapsed: 42.661,10 seconds [2020/08/17 07:45:46 MESZ]

Scripts

wikidata2jena

#!/bin/bash
# WF 2020-05-10

# global settings
# Jena release, its archive name and the download location of that archive
jena="apache-jena-3.16.0"
tgz="${jena}.tar.gz"
jenaurl="http://mirror.easyname.ch/apache/jena/binaries/${tgz}"
# working area: the data directory lives below ${base}
base="/Volumes/Torterra/wikidata2020-08-15"
data="${base}/data"
# loader command inside the unpacked Jena distribution (relative path)
tdbloader="${jena}/bin/tdb2.tdbloader"

#
# make sure the Jena distribution is downloaded and unpacked and that the
# data directory exists; steps that are already done are skipped
#
# Globals read: tgz (archive file name), jenaurl (download URL),
#               jena (directory the archive unpacks to), data (data directory)
# Returns: 0 on success, 1 if the download, the unpacking or the creation
#          of the data directory fails
#
getjena() {
  # download
  if [ ! -f "$tgz" ]
  then
    echo "downloading $tgz from $jenaurl"
    if ! wget "$jenaurl"
    then
      echo "download of $jenaurl failed" >&2
      return 1
    fi
  else
    echo "$tgz already downloaded"
  fi
  # unpack
  if [ ! -d "$jena" ]
  then
    echo "unpacking $jena from $tgz"
    if ! tar xvzf "$tgz"
    then
      echo "unpacking $tgz failed" >&2
      return 1
    fi
  else
    echo "$jena already unpacked"
  fi
  # create data directory
  if [ ! -d "$data" ]
  then
    echo "creating $data directory"
    if ! mkdir -p "$data"
    then
      echo "creating $data directory failed" >&2
      return 1
    fi
  else
    echo "$data directory already created"
  fi
  return 0
}

#
# print the given message followed by " at " and the current UTC time
# formatted as YYYY-MM-DDTHH:MM:SSZ
#
timestamp() {
  local text="$1"
  local now
  now=$(date -u +"%Y-%m-%dT%H:%M:%SZ")
  printf '%s at %s\n' "$text" "$now"
}

#
# load data for the given data dir and input
#
# Arguments: $1 - data directory handed to the loader via --loc
#            $2 - input file to load
# Globals read: tdbloader (loader command)
# Outputs: the loader's stdout is written to tdb2-out.log and its stderr to
#          tdb2-err.log in the current directory; start and end are reported
#          via timestamp
# Returns: the exit status of the loader
#
loaddata() {
  local data="$1"
  local input="$2"
  local rc=0
  timestamp "start loading $input to $data"
  "$tdbloader" --loc "$data" "$input" > tdb2-out.log 2> tdb2-err.log || rc=$?
  # only claim success if the loader actually succeeded
  if [ "$rc" -eq 0 ]
  then
    timestamp "finished loading $input to $data"
  else
    timestamp "loading $input to $data failed with exit status $rc" >&2
  fi
  return "$rc"
}

# prepare Jena and the data directory; without them the load cannot work
getjena || exit 1
# temporary directory below $base, exported so that child processes see it
export TMPDIR="$base/tmp"
if [ ! -d "$TMPDIR" ]
then
  echo "creating temporary directory $TMPDIR"
  mkdir -p "$TMPDIR" || exit 1
else
  echo "using temporary directory $TMPDIR"
fi
loaddata "$data" latest-all.nt

check extract speed

./speed 
149 MB in 3 s =  49 MB/s = 172 GB/h
315/2014 GB (15 %) todo: 1699 GB
ETA 9.9 h

speed

#!/bin/bash
# WF 2020-08-16
# check extract speed

#
# check the speed of extraction
#
# Measures the size of the extracted file twice, l_secs seconds apart, and
# prints throughput, progress and an estimated time to completion.
#
# Arguments:
#   $1 - seconds to wait between the two size measurements (positive integer)
#   $2 - total size in bytes that counts as 100 % (positive integer)
#   $3 - file to watch (optional, default: latest-all.nt)
# Outputs: three lines on stdout (throughput, progress, ETA)
# Returns: 0 on success, 1 if the file size cannot be determined,
#          2 on invalid arguments
#
checkSpeed() {
  local l_secs="$1"
  local l_total="$2"
  local l_file="${3:-latest-all.nt}"
  local totalgb first second mbytes mpersec gperh donegb todogb percent

  # plain positive decimals only: avoids division by zero and octal parsing
  if ! [[ "$l_secs" =~ ^[1-9][0-9]*$ && "$l_total" =~ ^[1-9][0-9]*$ ]]
  then
    echo "usage: checkSpeed seconds totalbytes [file]" >&2
    return 2
  fi
  totalgb=$(( l_total / 1024 / 1024 / 1024 ))

  first=$(du -sm "$l_file" | cut -f1)
  if ! [[ "$first" =~ ^[0-9]+$ ]]
  then
    echo "cannot determine size of $l_file" >&2
    return 1
  fi
  sleep "$l_secs"
  second=$(du -sm "$l_file" | cut -f1)
  if ! [[ "$second" =~ ^[0-9]+$ ]]
  then
    echo "cannot determine size of $l_file" >&2
    return 1
  fi

  mbytes=$(( second - first ))
  mpersec=$(( mbytes / l_secs ))
  # derived from the raw sample so a rate below 1 MB/s is not truncated to 0
  gperh=$(( mbytes * 3600 / (l_secs * 1024) ))
  echo "$mbytes MB in $l_secs s =  $mpersec MB/s = $gperh GB/h"

  donegb=$(( second / 1024 ))
  todogb=$(( totalgb - donegb ))
  # the file may end up larger than the given total
  if (( todogb < 0 ))
  then
    todogb=0
  fi
  percent=0
  if (( totalgb > 0 ))
  then
    percent=$(( donegb * 100 / totalgb ))
  fi
  echo "$donegb/$totalgb GB ($percent %) todo: $todogb GB"

  # no growth during the sample means no rate to extrapolate from
  if (( mbytes > 0 ))
  then
    awk -v todo="$todogb" -v mb="$mbytes" -v secs="$l_secs" \
      'BEGIN { printf("ETA %.1f h\n", todo * 1024 * secs / mb / 3600) }'
  else
    echo "ETA unknown (no progress within $l_secs s)"
  fi
}

# usage: speed [seconds [totalbytes]]
# without arguments: measure for 3 s against a total of 2162713035569 bytes
checkSpeed "${1:-3}" "${2:-2162713035569}"