WikiData Import 2020-08-15

From BITPlan Wiki
Jump to navigation Jump to search

see also Get_your_own_copy_of_WikiData - this was the seventh attempt, after five failures and one success with a truthy dataset. The main success factor was to use a 4 TB SSD disk, which was kindly supplied by the ConfIDent project.

Environment

  1. Mac Pro Mid 2010
  2. 12 core 3.46 GHz
  3. 64 GB RAM
  4. macOS High Sierra 10.13.6
  5. Source Disk: 4 TB 7200 rpm hard disk WD Gold WDC WD4002FYYZ Blackmagic speed rating: 175 MB/s write 175 MB/s read
  6. Target Disk: 4 TB SSD Samsung 860 EVO Blackmagic speed rating: 257 MB/s write 270 MB/s read
  7. java -version
    openjdk version "11.0.5" 2019-10-15
    OpenJDK Runtime Environment AdoptOpenJDK (build 11.0.5+10)
    OpenJDK 64-Bit Server VM AdoptOpenJDK (build 11.0.5+10, mixed mode)
    

Summary

  1. trying to replicate success story of https://issues.apache.org/jira/projects/JENA/issues/JENA-1909
  2. download of 111 GB took some 7 h
  3. unzipping to some 2030 GB took some 12 h 30
  4. counting 13.9 billion lines took some 3 h 20
  5. phased import of some 13.9 billion lines is in loading phase 2 (index) at some 13.6 billion lines after some 10 days: phase 1 (loading) took 4.5 days, phase 2 (indexing) is at day 5.5

Download and unpack

Downloading took some 7 h; unpacking took some 12 h 30.

date;wget https://dumps.wikimedia.org/wikidatawiki/entities/latest-all.nt.bz2;date
Sat Aug 15 17:47:23 CEST 2020
--2020-08-15 17:47:24--  https://dumps.wikimedia.org/wikidatawiki/entities/latest-all.nt.bz2
Resolving dumps.wikimedia.org (dumps.wikimedia.org)... 2620::861:1:208:80:154:7, 208.80.154.7
Connecting to dumps.wikimedia.org (dumps.wikimedia.org)|2620::861:1:208:80:154:7|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 119725690213 (112G) [application/octet-stream]
Saving to: ‘latest-all.nt.bz2’
latest-all.nt.bz2           0%[                                    ]  34.15M  4.61MB/s    eta 6h 58m 
...
latest-all.nt.bz2         100%[===================================>] 111.50G  4.88MB/s    in  6h 49m  
2020-08-16 00:37:06 (4.64 MB/s) - ‘latest-all.nt.bz2’ saved [119725690213/119725690213]
Sun Aug 16 00:37:06 CEST 2020
date;bzip2 -dk latest-all.nt.bz2;date
Sun Aug 16 07:02:30 CEST 2020
Sun Aug 16 19:37:34 CEST 2020
ls -l latest-all.nt
-rw-r--r--  1 wf  admin  2180458524345 Aug 13 20:22 latest-all.nt

Count lines

date;wc -l latest-all.nt;date
Mon Aug 17 07:49:38 CEST 2020
 13854100336 latest-all.nt
Mon Aug 17 11:07:07 CEST 2020

Start and progress

egrep "000\.000\.000" -A1  tdb2-err.log
nohup ./wikidata2jena&
tail -f tdb2-err.log 
19:54:45 INFO  loader          :: Loader = LoaderPhased
19:54:45 INFO  loader          :: Start: latest-all.nt
19:54:50 INFO  loader          :: Add: 500.000 latest-all.nt (Batch: 95.474 / Avg: 95.474)
19:54:53 INFO  loader          :: Add: 1.000.000 latest-all.nt (Batch: 186.846 / Avg: 126.374)
19:54:55 INFO  loader          :: Add: 1.500.000 latest-all.nt (Batch: 207.986 / Avg: 145.391)
19:54:58 INFO  loader          :: Add: 2.000.000 latest-all.nt (Batch: 166.500 / Avg: 150.150)
19:55:03 INFO  loader          :: Add: 2.500.000 latest-all.nt (Batch: 100.603 / Avg: 136.686)
19:55:08 INFO  loader          :: Add: 3.000.000 latest-all.nt (Batch: 100.745 / Avg: 129.015)
19:55:13 INFO  loader          :: Add: 3.500.000 latest-all.nt (Batch: 99.226 / Avg: 123.709)
19:55:18 INFO  loader          :: Add: 4.000.000 latest-all.nt (Batch: 99.383 / Avg: 120.037)
19:55:24 INFO  loader          :: Add: 4.500.000 latest-all.nt (Batch: 94.589 / Avg: 116.553)
19:55:29 INFO  loader          :: Add: 5.000.000 latest-all.nt (Batch: 89.269 / Avg: 113.096)
19:55:29 INFO  loader          ::   Elapsed: 44,21 seconds [2020/08/16 19:55:29 MESZ]
19:55:35 INFO  loader          :: Add: 5.500.000 latest-all.nt (Batch: 93.861 / Avg: 111.028)
...
22:02:30 INFO  loader          :: Add: 1.000.000.000 latest-all.nt (Batch: 103.199 / Avg: 130.468)
22:02:30 INFO  loader          ::   Elapsed: 7.664,71 seconds [2020/08/16 22:02:30 MESZ]
...
00:30:49 INFO  loader          :: Add: 2.000.000.000 latest-all.nt (Batch: 117.647 / Avg: 120.741)
00:30:49 INFO  loader          ::   Elapsed: 16.564,27 seconds [2020/08/17 00:30:49 MESZ]
--
03:00:51 INFO  loader          :: Add: 3.000.000.000 latest-all.nt (Batch: 112.688 / Avg: 117.341)
03:00:51 INFO  loader          ::   Elapsed: 25.566,39 seconds [2020/08/17 03:00:51 MESZ]
--
06:33:58 INFO  loader          :: Add: 4.000.000.000 latest-all.nt (Batch: 41.569 / Avg: 104.293)
06:33:58 INFO  loader          ::   Elapsed: 38.353,15 seconds [2020/08/17 06:33:58 MESZ]
...
18:15:56 INFO  loader          :: Add: 5.000.000.000 latest-all.nt (Batch: 26.435 / Avg: 62.134)
18:15:56 INFO  loader          ::   Elapsed: 80.471,07 seconds [2020/08/17 18:15:56 MESZ]
...
01:36:06 INFO  loader          :: Add: 6.000.000.000 latest-all.nt (Batch: 46.027 / Avg: 56.136)
01:36:06 INFO  loader          ::   Elapsed: 106.881,45 seconds [2020/08/18 01:36:06 MESZ]
...
07:45:01 INFO  loader          :: Add: 7.000.000.000 latest-all.nt (Batch: 40.769 / Avg: 54.257)
07:45:01 INFO  loader          ::   Elapsed: 129.015,49 seconds [2020/08/18 07:45:01 MESZ]
...
14:49:08 INFO  loader          :: Add: 8.000.000.000 latest-all.nt (Batch: 39.607 / Avg: 51.792)
14:49:08 INFO  loader          ::   Elapsed: 154.462,80 seconds [2020/08/18 14:49:08 MESZ]
...
23:42:25 INFO  loader          :: Add: 9.000.000.000 latest-all.nt (Batch: 30.543 / Avg: 48.267)
23:42:25 INFO  loader          ::   Elapsed: 186.460,13 seconds [2020/08/18 23:42:25 MESZ]
...
09:28:31 INFO  loader          :: Add: 10.000.000.000 latest-all.nt (Batch: 33.464 / Avg: 45.121)
09:28:31 INFO  loader          ::   Elapsed: 221.625,86 seconds [2020/08/19 09:28:31 MESZ]
...
20:20:57 INFO  loader          :: Add: 11.000.000.000 latest-all.nt (Batch: 26.867 / Avg: 42.182)
20:20:57 INFO  loader          ::   Elapsed: 260.772,17 seconds [2020/08/19 20:20:57 MESZ]
...
08:01:14 INFO  loader          :: Add: 12.000.000.000 latest-all.nt (Batch: 25.588 / Avg: 39.631)
08:01:14 INFO  loader          ::   Elapsed: 302.788,94 seconds [2020/08/20 08:01:14 MESZ]
...
20:38:09 INFO  loader          :: Add: 13.000.000.000 latest-all.nt (Batch: 24.601 / Avg: 37.334)
20:38:09 INFO  loader          ::   Elapsed: 348.203,59 seconds [2020/08/20 20:38:09 MESZ]
...
07:58:23 INFO  loader          :: Add: 13.850.000.000 latest-all.nt (Batch: 18.268 / Avg: 35.602)
07:58:23 INFO  loader          ::   Elapsed: 389.017,56 seconds [2020/08/21 07:58:23 MESZ]
--
08:12:49 INFO  loader          :: Add: 10.000.000 Index (Batch: 175.284 / Avg: 199.668)
08:12:49 INFO  loader          ::   Elapsed: 50,08 seconds [2020/08/21 08:12:49 MESZ]
--
08:22:19 INFO  loader          :: Add: 110.000.000 Index (Batch: 190.114 / Avg: 177.441)
08:22:19 INFO  loader          ::   Elapsed: 619,92 seconds [2020/08/21 08:22:19 MESZ]
...
11:32:39 INFO  loader          :: Add: 1.000.000.000 Index (Batch: 45.949 / Avg: 83.060)
11:32:39 INFO  loader          ::   Elapsed: 12.039,42 seconds [2020/08/21 11:32:39 MESZ]
...
06:36:51 INFO  loader          :: Add: 2.000.000.000 Index (Batch: 11.018 / Avg: 24.785)
06:36:51 INFO  loader          ::   Elapsed: 80.691,88 seconds [2020/08/22 06:36:51 MESZ]
...
19:31:22 INFO  loader          :: Add: 3.000.000.000 Index (Batch: 47.892 / Avg: 23.591)
19:31:22 INFO  loader          ::   Elapsed: 127.162,89 seconds [2020/08/22 19:31:22 MESZ]
...
06:57:00 INFO  loader          :: Add: 4.000.000.000 Index (Batch: 22.606 / Avg: 23.767)
06:57:00 INFO  loader          ::   Elapsed: 168.300,31 seconds [2020/08/23 06:57:00 MESZ]
...
15:02:37 INFO  loader          :: Add: 5.000.000.000 Index (Batch: 35.418 / Avg: 25.324)
15:02:37 INFO  loader          ::   Elapsed: 197.437,88 seconds [2020/08/23 15:02:37 MESZ]
...
23:55:44 INFO  loader          :: Add: 6.000.000.000 Index (Batch: 62.790 / Avg: 26.152)
23:55:44 INFO  loader          ::   Elapsed: 229.424,58 seconds [2020/08/23 23:55:44 MESZ]
...
07:56:25 INFO  loader          :: Add: 7.000.000.000 Index (Batch: 36.004 / Avg: 27.103)
07:56:25 INFO  loader          ::   Elapsed: 258.265,50 seconds [2020/08/24 07:56:25 MESZ]
...
15:16:24 INFO  loader          :: Add: 8.000.000.000 Index (Batch: 33.847 / Avg: 28.103)
15:16:24 INFO  loader          ::   Elapsed: 284.664,53 seconds [2020/08/24 15:16:24 MESZ]
...
23:47:53 INFO  loader          :: Add: 9.000.000.000 Index (Batch: 56.129 / Avg: 28.539)
23:47:53 INFO  loader          ::   Elapsed: 315.353,50 seconds [2020/08/24 23:47:53 MESZ]
...
07:46:36 INFO  loader          :: Add: 10.000.000.000 Index (Batch: 30.033 / Avg: 29.063)
07:46:36 INFO  loader          ::   Elapsed: 344.076,53 seconds [2020/08/25 07:46:36 MESZ]
...
16:00:04 INFO  loader          :: Add: 11.000.000.000 Index (Batch: 55.282 / Avg: 29.436)
16:00:04 INFO  loader          ::   Elapsed: 373.684,63 seconds [2020/08/25 16:00:04 MESZ]
...
01:35:49 INFO  loader          :: Add: 12.000.000.000 Index (Batch: 36.424 / Avg: 29.395)
01:35:50 INFO  loader          ::   Elapsed: 408.230,25 seconds [2020/08/26 01:35:49 MESZ]
...
11:17:08 INFO  loader          :: Add: 13.000.000.000 Index (Batch: 26.727 / Avg: 29.338)
11:17:08 INFO  loader          ::   Elapsed: 443.108,84 seconds [2020/08/26 11:17:08 MESZ]
...
16:55:00 INFO  loader          :: Add: 13.620.000.000 Index (Batch: 37.603 / Avg: 29.392)
16:55:00 INFO  loader          ::   Elapsed: 463.380,28 seconds [2020/08/26 16:55:00 MESZ]
...
17:25:28 INFO  loader          :: Time = 855.042,638 seconds : Triples = 13.854.100.336 : Rate = 16.203 /s

disk usage

At 13.6 billion indexed triples:

du -sm *
2002139	data
0	tmp

When finished:

du -sm data/
2003251	data/

Scripts

wikidata2jena

#!/bin/bash
# WF 2020-05-10

# global settings
# Apache Jena release to use and the name of its binary archive
jena="apache-jena-3.16.0"
tgz="${jena}.tar.gz"
# URL the archive is downloaded from
jenaurl="http://mirror.easyname.ch/apache/jena/binaries/${tgz}"
# base directory of this import and the data directory handed to the loader
base="/Volumes/Torterra/wikidata2020-08-15"
data="${base}/data"
# loader executable inside the unpacked Jena directory
tdbloader="${jena}/bin/tdb2.tdbloader"

#
# make sure Apache Jena is downloaded and unpacked and that the data
# directory exists - each step is skipped when its result is already there
#
# Globals (read): tgz, jenaurl, jena, data
# Returns: 0 on success, 1 if download, unpacking or directory creation fails
#
getjena() {
  # download - fetch to a temporary name first so that an interrupted
  # download is not mistaken for a complete archive on the next run
  if [ ! -f "$tgz" ]
  then
    echo "downloading $tgz from $jenaurl"
    if ! wget -O "$tgz.part" "$jenaurl"
    then
      echo "download of $jenaurl failed" >&2
      rm -f "$tgz.part"
      return 1
    fi
    mv "$tgz.part" "$tgz" || return 1
  else
    echo "$tgz already downloaded"
  fi
  # unpack
  if [ ! -d "$jena" ]
  then
    echo "unpacking $jena from $tgz"
    if ! tar xvzf "$tgz"
    then
      echo "unpacking $tgz failed" >&2
      return 1
    fi
  else
    echo "$jena already unpacked"
  fi
  # create data directory
  if [ ! -d "$data" ]
  then
    echo "creating $data directory"
    mkdir -p "$data" || return 1
  else
    echo "$data directory already created"
  fi
}

#
# print the given message followed by the current UTC time
# in the form YYYY-MM-DDThh:mm:ssZ
#
# param 1: the message to show
#
timestamp() {
  local text="$1"
  local now
  now=$(date -u +"%Y-%m-%dT%H:%M:%SZ")
  printf '%s at %s\n' "$text" "$now"
}

#
# load data for the given data dir and input
#
# param 1: data  - directory passed to the loader via --loc
# param 2: input - file to load
# Globals (read): tdbloader - the loader command to run
# Output: the loader's stdout goes to tdb2-out.log and its stderr to
#         tdb2-err.log in the current directory (overwritten on each call)
# Returns: the exit status of the loader
#
loaddata() {
  local data="$1"
  local input="$2"
  local rc=0
  timestamp "start loading $input to $data"
  # remember the loader's status so that a failed load is not reported as finished
  "$tdbloader" --loc "$data" "$input" > tdb2-out.log 2> tdb2-err.log || rc=$?
  if [ "$rc" -eq 0 ]
  then
    timestamp "finished loading $input to $data"
  else
    timestamp "failed loading $input to $data (exit status $rc)" >&2
  fi
  return "$rc"
}

# main sequence: get Jena, prepare the temporary directory, run the import
# do not start the load when Jena or the data directory could not be prepared
getjena || { echo "getjena failed - aborting" >&2; exit 1; }
# exported so that the processes started below inherit it
export TMPDIR="$base/tmp"
if [ ! -d "$TMPDIR" ]
then
  echo "creating temporary directory $TMPDIR"
  mkdir -p "$TMPDIR" || { echo "cannot create $TMPDIR - aborting" >&2; exit 1; }
else
  echo "using temporary directory $TMPDIR"
fi
loaddata "$data" latest-all.nt

check extract speed

./speed 
149 MB in 3 s =  49 MB/s = 172 GB/h
315/2014 GB (15 %) todo: 1699 GB
ETA 9.9 h

speed

#!/bin/bash
# WF 2020-08-16
# check extract speed

#
# check the speed of extraction
#
# Samples the disk usage of the file twice, l_secs seconds apart, and prints
# the throughput, the progress against the total and the estimated time left.
#
# param 1: l_secs  - seconds to wait between the two samples (positive integer)
# param 2: l_total - total size in bytes to measure progress against (>= 1 GiB)
# param 3: l_file  - file to watch (optional, default: latest-all.nt)
# Returns: 0 on success, 1 if the arguments are unusable or du reports no size
#
checkSpeed() {
  local l_secs="$1"
  local l_total="$2"
  local l_file="${3:-latest-all.nt}"
  local totalgb first second mbytes mpersec gperh donegb todogb percent

  # l_secs and the total in GB are used as divisors below
  if ! [ "$l_secs" -gt 0 ] 2>/dev/null
  then
    echo "checkSpeed: seconds must be a positive integer" >&2
    return 1
  fi
  if ! [ "$l_total" -gt 0 ] 2>/dev/null
  then
    echo "checkSpeed: total size must be a positive integer" >&2
    return 1
  fi
  totalgb=$(( l_total / 1024 / 1024 / 1024 ))
  if [ "$totalgb" -le 0 ]
  then
    echo "checkSpeed: total size must be at least 1 GiB" >&2
    return 1
  fi

  first=$(du -sm "$l_file" | cut -f1)
  if [ -z "$first" ]
  then
    echo "checkSpeed: cannot determine size of $l_file" >&2
    return 1
  fi
  sleep "$l_secs"
  second=$(du -sm "$l_file" | cut -f1)
  if [ -z "$second" ]
  then
    echo "checkSpeed: cannot determine size of $l_file" >&2
    return 1
  fi

  mbytes=$(( second - first ))
  mpersec=$(( mbytes / l_secs ))
  gperh=$(( mpersec * 3600 / 1024 ))
  echo "$mbytes MB in $l_secs s =  $mpersec MB/s = $gperh GB/h"
  donegb=$(( second / 1024 ))
  todogb=$(( totalgb - donegb ))
  percent=$(( donegb * 100 / totalgb ))
  echo "$donegb/$totalgb GB ($percent %) todo: $todogb GB"
  # without measurable progress there is no rate to divide by
  if [ "$gperh" -gt 0 ]
  then
    awk -v todo="$todogb" -v gperh="$gperh" 'BEGIN { printf("ETA %.1f h\n",todo/gperh) }'
  else
    echo "ETA unknown (no measurable progress)"
  fi
}

# sample for 3 seconds; progress is measured against a total of 2162713035569 bytes
# NOTE(review): presumably the expected size of the unpacked latest-all.nt - confirm
checkSpeed 3 2162713035569