WikiData Import 2020-08-15
Environment
- Mac Pro Mid 2010
- 12 core 3.46 GHz
- 64 GB RAM
- macOS High Sierra 10.13.6
- Source Disk: 4 TB 7200 rpm hard disk WD Gold WDC WD4002FYYZ Blackmagic speed rating: 175 MB/s write 175 MB/s read
- Target Disk: 4 TB SSD Samsung 860 EVO Blackmagic speed rating: 257 MB/s write 270 MB/s read
java -version openjdk version "11.0.5" 2019-10-15 OpenJDK Runtime Environment AdoptOpenJDK (build 11.0.5+10) OpenJDK 64-Bit Server VM AdoptOpenJDK (build 11.0.5+10, mixed mode)
Summary
Download and unpack
Downloading took about 7 h; unpacking took about 12 h 30 min.
date;wget https://dumps.wikimedia.org/wikidatawiki/entities/latest-all.nt.bz2;date
Sat Aug 15 17:47:23 CEST 2020
--2020-08-15 17:47:24-- https://dumps.wikimedia.org/wikidatawiki/entities/latest-all.nt.bz2
Resolving dumps.wikimedia.org (dumps.wikimedia.org)... 2620::861:1:208:80:154:7, 208.80.154.7
Connecting to dumps.wikimedia.org (dumps.wikimedia.org)|2620::861:1:208:80:154:7|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 119725690213 (112G) [application/octet-stream]
Saving to: ‘latest-all.nt.bz2’
latest-all.nt.bz2 0%[ ] 34.15M 4.61MB/s eta 6h 58m
...
latest-all.nt.bz2 100%[===================================>] 111.50G 4.88MB/s in 6h 49m
2020-08-16 00:37:06 (4.64 MB/s) - ‘latest-all.nt.bz2’ saved [119725690213/119725690213]
Sun Aug 16 00:37:06 CEST 2020
date;bzip2 -dk latest-all.nt.bz2;date
Sun Aug 16 07:02:30 CEST 2020
Sun Aug 16 19:37:34 CEST 2020
Start and progress
nohup ./wikidata2jena&
tail -f tdb2-err.log
19:54:45 INFO loader :: Loader = LoaderPhased
19:54:45 INFO loader :: Start: latest-all.nt
19:54:50 INFO loader :: Add: 500.000 latest-all.nt (Batch: 95.474 / Avg: 95.474)
19:54:53 INFO loader :: Add: 1.000.000 latest-all.nt (Batch: 186.846 / Avg: 126.374)
19:54:55 INFO loader :: Add: 1.500.000 latest-all.nt (Batch: 207.986 / Avg: 145.391)
19:54:58 INFO loader :: Add: 2.000.000 latest-all.nt (Batch: 166.500 / Avg: 150.150)
19:55:03 INFO loader :: Add: 2.500.000 latest-all.nt (Batch: 100.603 / Avg: 136.686)
19:55:08 INFO loader :: Add: 3.000.000 latest-all.nt (Batch: 100.745 / Avg: 129.015)
19:55:13 INFO loader :: Add: 3.500.000 latest-all.nt (Batch: 99.226 / Avg: 123.709)
19:55:18 INFO loader :: Add: 4.000.000 latest-all.nt (Batch: 99.383 / Avg: 120.037)
19:55:24 INFO loader :: Add: 4.500.000 latest-all.nt (Batch: 94.589 / Avg: 116.553)
19:55:29 INFO loader :: Add: 5.000.000 latest-all.nt (Batch: 89.269 / Avg: 113.096)
19:55:29 INFO loader :: Elapsed: 44,21 seconds [2020/08/16 19:55:29 MESZ]
19:55:35 INFO loader :: Add: 5.500.000 latest-all.nt (Batch: 93.861 / Avg: 111.028)
Scripts
check extract speed
./speed
149 MB in 3 s = 49 MB/s = 172 GB/h
315/2014 GB (15 %) todo: 1699 GB
ETA 9.9 h
speed
#!/bin/bash
# WF 2020-08-16
# check extract speed
#
# check the speed of extraction
#
# Estimate how fast a file is growing and how long it will take to reach
# its expected final size.
#
# The size of the file is measured with `du -sm` twice, l_secs seconds
# apart. From the difference the throughput, the progress against the
# expected total and an ETA are printed.
#
# Arguments:
#   $1 - sampling interval in seconds (positive integer)
#   $2 - expected final size of the file in bytes (at least 1 GiB)
#   $3 - file to watch (optional, default: latest-all.nt)
# Outputs:
#   three lines on stdout: speed, progress and ETA
# Returns:
#   0 on success, 1 on invalid arguments or if the file cannot be measured
checkSpeed() {
  local l_secs="${1:-}"
  local l_total="${2:-}"
  local l_file="${3:-latest-all.nt}"
  local totalgb first second mbytes mpersec gperh donegb todogb percent

  if ! [[ "$l_secs" =~ ^[0-9]+$ && "$l_total" =~ ^[0-9]+$ ]]; then
    echo "checkSpeed: seconds and total bytes must be non-negative integers" >&2
    return 1
  fi
  # force base 10 so that values with leading zeros are not read as octal
  l_secs=$(( 10#$l_secs ))
  l_total=$(( 10#$l_total ))
  if (( l_secs == 0 )); then
    echo "checkSpeed: sampling interval must be at least 1 second" >&2
    return 1
  fi

  totalgb=$(( l_total / 1024 / 1024 / 1024 ))
  if (( totalgb == 0 )); then
    # the percentage below divides by totalgb
    echo "checkSpeed: expected total size must be at least 1 GiB" >&2
    return 1
  fi

  first=$(du -sm "$l_file" | cut -f1)
  if ! [[ "$first" =~ ^[0-9]+$ ]]; then
    echo "checkSpeed: cannot determine size of $l_file" >&2
    return 1
  fi
  sleep "$l_secs"
  second=$(du -sm "$l_file" | cut -f1)
  if ! [[ "$second" =~ ^[0-9]+$ ]]; then
    echo "checkSpeed: cannot determine size of $l_file" >&2
    return 1
  fi

  mbytes=$(( second - first ))
  mpersec=$(( mbytes / l_secs ))
  gperh=$(( mpersec * 3600 / 1024 ))
  echo "$mbytes MB in $l_secs s = $mpersec MB/s = $gperh GB/h"

  donegb=$(( second / 1024 ))
  todogb=$(( totalgb - donegb ))
  percent=$(( donegb * 100 / totalgb ))
  echo "$donegb/$totalgb GB ($percent %) todo: $todogb GB"

  if (( gperh <= 0 )); then
    # no measurable growth during the sample: an ETA would need a division
    # by zero, so report it as unknown instead of failing
    echo "ETA unknown (no growth measured)"
    return 0
  fi
  # awk is used for the fractional result; the C locale keeps the decimal
  # point independent of the user's locale settings
  LC_ALL=C awk -v todo="$todogb" -v gperh="$gperh" \
    'BEGIN { printf("ETA %.1f h\n",todo/gperh) }'
}
# Sample the growth of latest-all.nt over 3 seconds. The second argument is
# the total size in bytes that checkSpeed measures progress against
# (presumably the expected uncompressed size of this dump - confirm per dump).
checkSpeed 3 2162713035569