Difference between revisions of "WikiData Import 2020-08-15"
Jump to navigation
Jump to search
(51 intermediate revisions by the same user not shown) | |||
Line 1: | Line 1: | ||
+ | see also {{Link|target=Get_your_own_copy_of_WikiData}} - this was the seventh attempt after five failures and one {{Link|target=WikiData_Import_2020-07-15|title=success}} with a truthy dataset. The main success factor was to use a 4 TB SSD disk which | ||
+ | was kindly supplied by the [https://projects.tib.eu/en/confident/ ConfIDent project] | ||
= Environment = | = Environment = | ||
# Mac Pro Mid 2010 | # Mac Pro Mid 2010 | ||
Line 13: | Line 15: | ||
</source> | </source> | ||
= Summary = | = Summary = | ||
+ | # trying to replicate success story of https://issues.apache.org/jira/projects/JENA/issues/JENA-1909 | ||
+ | # download of 111 GB took some 7 h | ||
+ | # unzipping to some 2030 GB took some 12 h 30 | ||
+ | # counting 13.9 billion lines took some 3 h 20 | ||
+ | # phased import of some 13.9 billion lines is in loading phase 2 (index) at some 13.6 billion lines after some 10 days; phase 1 (loading) took 4.5 days, phase 2 (indexing) is at day 5.5 | ||
+ | |||
= Download and unpack = | = Download and unpack = | ||
− | <source lang='bash' highlight='1,14'> | + | Downloading took some 7 h |
+ | unpacking took some 12h 30 | ||
+ | <source lang='bash' highlight='1,14,17'> | ||
date;wget https://dumps.wikimedia.org/wikidatawiki/entities/latest-all.nt.bz2;date | date;wget https://dumps.wikimedia.org/wikidatawiki/entities/latest-all.nt.bz2;date | ||
Sat Aug 15 17:47:23 CEST 2020 | Sat Aug 15 17:47:23 CEST 2020 | ||
Line 30: | Line 40: | ||
date;bzip2 -dk latest-all.nt.bz2;date | date;bzip2 -dk latest-all.nt.bz2;date | ||
Sun Aug 16 07:02:30 CEST 2020 | Sun Aug 16 07:02:30 CEST 2020 | ||
+ | Sun Aug 16 19:37:34 CEST 2020 | ||
+ | ls -l latest-all.nt | ||
+ | -rw-r--r-- 1 wf admin 2180458524345 Aug 13 20:22 latest-all.nt | ||
+ | </source> | ||
+ | |||
+ | = Count lines = | ||
+ | <source lang='bash'> | ||
+ | date;wc -l latest-all.nt;date | ||
+ | Mon Aug 17 07:49:38 CEST 2020 | ||
+ | 13854100336 latest-all.nt | ||
+ | Mon Aug 17 11:07:07 CEST 2020 | ||
+ | </source> | ||
+ | = Start and progress = | ||
+ | <source lang='bash' highlight='1'> | ||
+ | egrep "000\.000\.000" -A1 tdb2-err.log | ||
+ | </source> | ||
+ | |||
+ | <source lang='bash' highlight='1-2'> | ||
+ | nohup ./wikidata2jena& | ||
+ | tail -f tdb2-err.log | ||
+ | 19:54:45 INFO loader :: Loader = LoaderPhased | ||
+ | 19:54:45 INFO loader :: Start: latest-all.nt | ||
+ | 19:54:50 INFO loader :: Add: 500.000 latest-all.nt (Batch: 95.474 / Avg: 95.474) | ||
+ | 19:54:53 INFO loader :: Add: 1.000.000 latest-all.nt (Batch: 186.846 / Avg: 126.374) | ||
+ | 19:54:55 INFO loader :: Add: 1.500.000 latest-all.nt (Batch: 207.986 / Avg: 145.391) | ||
+ | 19:54:58 INFO loader :: Add: 2.000.000 latest-all.nt (Batch: 166.500 / Avg: 150.150) | ||
+ | 19:55:03 INFO loader :: Add: 2.500.000 latest-all.nt (Batch: 100.603 / Avg: 136.686) | ||
+ | 19:55:08 INFO loader :: Add: 3.000.000 latest-all.nt (Batch: 100.745 / Avg: 129.015) | ||
+ | 19:55:13 INFO loader :: Add: 3.500.000 latest-all.nt (Batch: 99.226 / Avg: 123.709) | ||
+ | 19:55:18 INFO loader :: Add: 4.000.000 latest-all.nt (Batch: 99.383 / Avg: 120.037) | ||
+ | 19:55:24 INFO loader :: Add: 4.500.000 latest-all.nt (Batch: 94.589 / Avg: 116.553) | ||
+ | 19:55:29 INFO loader :: Add: 5.000.000 latest-all.nt (Batch: 89.269 / Avg: 113.096) | ||
+ | 19:55:29 INFO loader :: Elapsed: 44,21 seconds [2020/08/16 19:55:29 MESZ] | ||
+ | 19:55:35 INFO loader :: Add: 5.500.000 latest-all.nt (Batch: 93.861 / Avg: 111.028) | ||
+ | ... | ||
+ | 22:02:30 INFO loader :: Add: 1.000.000.000 latest-all.nt (Batch: 103.199 / Avg: 130.468) | ||
+ | 22:02:30 INFO loader :: Elapsed: 7.664,71 seconds [2020/08/16 22:02:30 MESZ] | ||
+ | ... | ||
+ | 00:30:49 INFO loader :: Add: 2.000.000.000 latest-all.nt (Batch: 117.647 / Avg: 120.741) | ||
+ | 00:30:49 INFO loader :: Elapsed: 16.564,27 seconds [2020/08/17 00:30:49 MESZ] | ||
+ | -- | ||
+ | 03:00:51 INFO loader :: Add: 3.000.000.000 latest-all.nt (Batch: 112.688 / Avg: 117.341) | ||
+ | 03:00:51 INFO loader :: Elapsed: 25.566,39 seconds [2020/08/17 03:00:51 MESZ] | ||
+ | -- | ||
+ | 06:33:58 INFO loader :: Add: 4.000.000.000 latest-all.nt (Batch: 41.569 / Avg: 104.293) | ||
+ | 06:33:58 INFO loader :: Elapsed: 38.353,15 seconds [2020/08/17 06:33:58 MESZ] | ||
+ | ... | ||
+ | 18:15:56 INFO loader :: Add: 5.000.000.000 latest-all.nt (Batch: 26.435 / Avg: 62.134) | ||
+ | 18:15:56 INFO loader :: Elapsed: 80.471,07 seconds [2020/08/17 18:15:56 MESZ] | ||
+ | ... | ||
+ | 01:36:06 INFO loader :: Add: 6.000.000.000 latest-all.nt (Batch: 46.027 / Avg: 56.136) | ||
+ | 01:36:06 INFO loader :: Elapsed: 106.881,45 seconds [2020/08/18 01:36:06 MESZ] | ||
+ | ... | ||
+ | 07:45:01 INFO loader :: Add: 7.000.000.000 latest-all.nt (Batch: 40.769 / Avg: 54.257) | ||
+ | 07:45:01 INFO loader :: Elapsed: 129.015,49 seconds [2020/08/18 07:45:01 MESZ] | ||
+ | ... | ||
+ | 14:49:08 INFO loader :: Add: 8.000.000.000 latest-all.nt (Batch: 39.607 / Avg: 51.792) | ||
+ | 14:49:08 INFO loader :: Elapsed: 154.462,80 seconds [2020/08/18 14:49:08 MESZ] | ||
+ | ... | ||
+ | 23:42:25 INFO loader :: Add: 9.000.000.000 latest-all.nt (Batch: 30.543 / Avg: 48.267) | ||
+ | 23:42:25 INFO loader :: Elapsed: 186.460,13 seconds [2020/08/18 23:42:25 MESZ] | ||
+ | ... | ||
+ | 09:28:31 INFO loader :: Add: 10.000.000.000 latest-all.nt (Batch: 33.464 / Avg: 45.121) | ||
+ | 09:28:31 INFO loader :: Elapsed: 221.625,86 seconds [2020/08/19 09:28:31 MESZ] | ||
+ | ... | ||
+ | 20:20:57 INFO loader :: Add: 11.000.000.000 latest-all.nt (Batch: 26.867 / Avg: 42.182) | ||
+ | 20:20:57 INFO loader :: Elapsed: 260.772,17 seconds [2020/08/19 20:20:57 MESZ] | ||
+ | ... | ||
+ | 08:01:14 INFO loader :: Add: 12.000.000.000 latest-all.nt (Batch: 25.588 / Avg: 39.631) | ||
+ | 08:01:14 INFO loader :: Elapsed: 302.788,94 seconds [2020/08/20 08:01:14 MESZ] | ||
+ | ... | ||
+ | 20:38:09 INFO loader :: Add: 13.000.000.000 latest-all.nt (Batch: 24.601 / Avg: 37.334) | ||
+ | 20:38:09 INFO loader :: Elapsed: 348.203,59 seconds [2020/08/20 20:38:09 MESZ] | ||
+ | ... | ||
+ | 07:58:23 INFO loader :: Add: 13.850.000.000 latest-all.nt (Batch: 18.268 / Avg: 35.602) | ||
+ | 07:58:23 INFO loader :: Elapsed: 389.017,56 seconds [2020/08/21 07:58:23 MESZ] | ||
+ | -- | ||
+ | 08:12:49 INFO loader :: Add: 10.000.000 Index (Batch: 175.284 / Avg: 199.668) | ||
+ | 08:12:49 INFO loader :: Elapsed: 50,08 seconds [2020/08/21 08:12:49 MESZ] | ||
+ | -- | ||
+ | 08:22:19 INFO loader :: Add: 110.000.000 Index (Batch: 190.114 / Avg: 177.441) | ||
+ | 08:22:19 INFO loader :: Elapsed: 619,92 seconds [2020/08/21 08:22:19 MESZ] | ||
+ | ... | ||
+ | 11:32:39 INFO loader :: Add: 1.000.000.000 Index (Batch: 45.949 / Avg: 83.060) | ||
+ | 11:32:39 INFO loader :: Elapsed: 12.039,42 seconds [2020/08/21 11:32:39 MESZ] | ||
+ | ... | ||
+ | 06:36:51 INFO loader :: Add: 2.000.000.000 Index (Batch: 11.018 / Avg: 24.785) | ||
+ | 06:36:51 INFO loader :: Elapsed: 80.691,88 seconds [2020/08/22 06:36:51 MESZ] | ||
+ | ... | ||
+ | 19:31:22 INFO loader :: Add: 3.000.000.000 Index (Batch: 47.892 / Avg: 23.591) | ||
+ | 19:31:22 INFO loader :: Elapsed: 127.162,89 seconds [2020/08/22 19:31:22 MESZ] | ||
+ | ... | ||
+ | 06:57:00 INFO loader :: Add: 4.000.000.000 Index (Batch: 22.606 / Avg: 23.767) | ||
+ | 06:57:00 INFO loader :: Elapsed: 168.300,31 seconds [2020/08/23 06:57:00 MESZ] | ||
+ | ... | ||
+ | 15:02:37 INFO loader :: Add: 5.000.000.000 Index (Batch: 35.418 / Avg: 25.324) | ||
+ | 15:02:37 INFO loader :: Elapsed: 197.437,88 seconds [2020/08/23 15:02:37 MESZ] | ||
+ | ... | ||
+ | 23:55:44 INFO loader :: Add: 6.000.000.000 Index (Batch: 62.790 / Avg: 26.152) | ||
+ | 23:55:44 INFO loader :: Elapsed: 229.424,58 seconds [2020/08/23 23:55:44 MESZ] | ||
+ | ... | ||
+ | 07:56:25 INFO loader :: Add: 7.000.000.000 Index (Batch: 36.004 / Avg: 27.103) | ||
+ | 07:56:25 INFO loader :: Elapsed: 258.265,50 seconds [2020/08/24 07:56:25 MESZ] | ||
+ | ... | ||
+ | 15:16:24 INFO loader :: Add: 8.000.000.000 Index (Batch: 33.847 / Avg: 28.103) | ||
+ | 15:16:24 INFO loader :: Elapsed: 284.664,53 seconds [2020/08/24 15:16:24 MESZ] | ||
+ | ... | ||
+ | 23:47:53 INFO loader :: Add: 9.000.000.000 Index (Batch: 56.129 / Avg: 28.539) | ||
+ | 23:47:53 INFO loader :: Elapsed: 315.353,50 seconds [2020/08/24 23:47:53 MESZ] | ||
+ | ... | ||
+ | 07:46:36 INFO loader :: Add: 10.000.000.000 Index (Batch: 30.033 / Avg: 29.063) | ||
+ | 07:46:36 INFO loader :: Elapsed: 344.076,53 seconds [2020/08/25 07:46:36 MESZ] | ||
+ | ... | ||
+ | 16:00:04 INFO loader :: Add: 11.000.000.000 Index (Batch: 55.282 / Avg: 29.436) | ||
+ | 16:00:04 INFO loader :: Elapsed: 373.684,63 seconds [2020/08/25 16:00:04 MESZ] | ||
+ | ... | ||
+ | 01:35:49 INFO loader :: Add: 12.000.000.000 Index (Batch: 36.424 / Avg: 29.395) | ||
+ | 01:35:50 INFO loader :: Elapsed: 408.230,25 seconds [2020/08/26 01:35:49 MESZ] | ||
+ | ... | ||
+ | 11:17:08 INFO loader :: Add: 13.000.000.000 Index (Batch: 26.727 / Avg: 29.338) | ||
+ | 11:17:08 INFO loader :: Elapsed: 443.108,84 seconds [2020/08/26 11:17:08 MESZ] | ||
+ | ... | ||
+ | 16:55:00 INFO loader :: Add: 13.620.000.000 Index (Batch: 37.603 / Avg: 29.392) | ||
+ | 16:55:00 INFO loader :: Elapsed: 463.380,28 seconds [2020/08/26 16:55:00 MESZ] | ||
+ | ... | ||
+ | 17:25:28 INFO loader :: Time = 855.042,638 seconds : Triples = 13.854.100.336 : Rate = 16.203 /s | ||
+ | </source> | ||
+ | |||
+ | == disk usage == | ||
+ | At 13.6 billion indexed triples: | ||
+ | <source lang='bash'> | ||
+ | du -sm * | ||
+ | 2002139 data | ||
+ | 0 tmp | ||
+ | </source> | ||
+ | When finished: | ||
+ | <source lang='bash'> | ||
+ | du -sm data/ | ||
+ | 2003251 data/ | ||
+ | </source> | ||
+ | |||
+ | = Scripts = | ||
+ | == wikidata2jena == | ||
+ | <source lang='bash'> | ||
+ | #!/bin/bash | ||
+ | # WF 2020-05-10 | ||
+ | |||
+ | # global settings | ||
+ | jena=apache-jena-3.16.0 | ||
+ | tgz=$jena.tar.gz | ||
+ | jenaurl=http://mirror.easyname.ch/apache/jena/binaries/$tgz | ||
+ | base=/Volumes/Torterra/wikidata2020-08-15 | ||
+ | data=$base/data | ||
+ | tdbloader=$jena/bin/tdb2.tdbloader | ||
+ | |||
+ | getjena() { | ||
+ | # download | ||
+ | if [ ! -f $tgz ] | ||
+ | then | ||
+ | echo "downloading $tgz from $jenaurl" | ||
+ | wget $jenaurl | ||
+ | else | ||
+ | echo "$tgz already downloaded" | ||
+ | fi | ||
+ | # unpack | ||
+ | if [ ! -d $jena ] | ||
+ | then | ||
+ | echo "unpacking $jena from $tgz" | ||
+ | tar xvzf $tgz | ||
+ | else | ||
+ | echo "$jena already unpacked" | ||
+ | fi | ||
+ | # create data directory | ||
+ | if [ ! -d $data ] | ||
+ | then | ||
+ | echo "creating $data directory" | ||
+ | mkdir -p $data | ||
+ | else | ||
+ | echo "$data directory already created" | ||
+ | fi | ||
+ | } | ||
+ | |||
+ | # | ||
+ | # show the given timestamp | ||
+ | # | ||
+ | timestamp() { | ||
+ | local msg="$1" | ||
+ | local ts=$(date -u +"%Y-%m-%dT%H:%M:%SZ") | ||
+ | echo "$msg at $ts" | ||
+ | } | ||
+ | |||
+ | # | ||
+ | # load data for the given data dir and input | ||
+ | # | ||
+ | loaddata() { | ||
+ | local data="$1" | ||
+ | local input="$2" | ||
+ | timestamp "start loading $input to $data" | ||
+ | $tdbloader --loc "$data" "$input" > tdb2-out.log 2> tdb2-err.log | ||
+ | timestamp "finished loading $input to $data" | ||
+ | } | ||
+ | |||
+ | getjena | ||
+ | export TMPDIR=$base/tmp | ||
+ | if [ ! -d $TMPDIR ] | ||
+ | then | ||
+ | echo "creating temporary directory $TMPDIR" | ||
+ | mkdir $TMPDIR | ||
+ | else | ||
+ | echo "using temporary directory $TMPDIR" | ||
+ | fi | ||
+ | loaddata $data latest-all.nt | ||
+ | </source> | ||
+ | |||
+ | == check extract speed == | ||
+ | <source lang='bash'> | ||
+ | ./speed | ||
+ | 149 MB in 3 s = 49 MB/s = 172 GB/h | ||
+ | 315/2014 GB (15 %) todo: 1699 GB | ||
+ | ETA 9.9 h | ||
+ | </source> | ||
+ | === speed === | ||
+ | <source lang='bash'> | ||
+ | #!/bin/bash | ||
+ | # WF 2020-08-16 | ||
+ | # check extract speed | ||
+ | |||
+ | # | ||
+ | # check the speed of extraction | ||
+ | # | ||
+ | checkSpeed() { | ||
+ | local l_secs="$1" | ||
+ | local l_total="$2" | ||
+ | totalgb=$(expr $l_total / 1024 / 1024 / 1024) | ||
+ | first=$(du -sm latest-all.nt | cut -f1) | ||
+ | sleep $l_secs | ||
+ | second=$(du -sm latest-all.nt | cut -f1) | ||
+ | mbytes=$(expr $second - $first) | ||
+ | mpersec=$(expr $mbytes / $l_secs) | ||
+ | gperh=$(expr $mpersec \* 3600 / 1024) | ||
+ | echo "$mbytes MB in $l_secs s = $mpersec MB/s = $gperh GB/h" | ||
+ | donegb=$(expr $second / 1024) | ||
+ | todogb=$(expr $totalgb - $donegb) | ||
+ | percent=$(expr $donegb \* 100 / $totalgb) | ||
+ | eta=$(expr $todogb / $gperh) | ||
+ | echo "$donegb/$totalgb GB ($percent %) todo: $todogb GB" | ||
+ | awk -v todo=$todogb -v gperh=$gperh 'BEGIN { printf("ETA %.1f h\n",todo/gperh) }' | ||
+ | } | ||
+ | |||
+ | checkSpeed 3 2162713035569 | ||
</source> | </source> | ||
[[Category:WikiData]] | [[Category:WikiData]] |
Latest revision as of 06:26, 11 September 2020
see also Get_your_own_copy_of_WikiData - this was the seventh attempt after five failures and one success with a truthy dataset. The main success factor was to use a 4 TB SSD disk which was kindly supplied by the ConfIDent project
Environment
- Mac Pro Mid 2010
- 12 core 3.46 GHz
- 64 GB RAM
- macOS High Sierra 10.13.6
- Source Disk: 4 TB 7200 rpm hard disk WD Gold WDC WD4002FYYZ Blackmagic speed rating: 175 MB/s write 175 MB/s read
- Target Disk: 4 TB SSD Samsung 860 EVO Blackmagic speed rating: 257 MB/s write 270 MB/s read
java -version openjdk version "11.0.5" 2019-10-15 OpenJDK Runtime Environment AdoptOpenJDK (build 11.0.5+10) OpenJDK 64-Bit Server VM AdoptOpenJDK (build 11.0.5+10, mixed mode)
Summary
- trying to replicate success story of https://issues.apache.org/jira/projects/JENA/issues/JENA-1909
- download of 111 GB took some 7 h
- unzipping to some 2030 GB took some 12 h 30
- counting 13.9 billion lines took some 3 h 20
- phased import of some 13.9 billion lines is in loading phase 2 (index) at some 13.6 billion lines after some 10 days; phase 1 (loading) took 4.5 days, phase 2 (indexing) is at day 5.5
Download and unpack
Downloading took some 7 h; unpacking took some 12 h 30.
date;wget https://dumps.wikimedia.org/wikidatawiki/entities/latest-all.nt.bz2;date
Sat Aug 15 17:47:23 CEST 2020
--2020-08-15 17:47:24-- https://dumps.wikimedia.org/wikidatawiki/entities/latest-all.nt.bz2
Resolving dumps.wikimedia.org (dumps.wikimedia.org)... 2620::861:1:208:80:154:7, 208.80.154.7
Connecting to dumps.wikimedia.org (dumps.wikimedia.org)|2620::861:1:208:80:154:7|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 119725690213 (112G) [application/octet-stream]
Saving to: ‘latest-all.nt.bz2’
latest-all.nt.bz2 0%[ ] 34.15M 4.61MB/s eta 6h 58m
...
latest-all.nt.bz2 100%[===================================>] 111.50G 4.88MB/s in 6h 49m
2020-08-16 00:37:06 (4.64 MB/s) - ‘latest-all.nt.bz2’ saved [119725690213/119725690213]
Sun Aug 16 00:37:06 CEST 2020
date;bzip2 -dk latest-all.nt.bz2;date
Sun Aug 16 07:02:30 CEST 2020
Sun Aug 16 19:37:34 CEST 2020
ls -l latest-all.nt
-rw-r--r-- 1 wf admin 2180458524345 Aug 13 20:22 latest-all.nt
Count lines
date;wc -l latest-all.nt;date
Mon Aug 17 07:49:38 CEST 2020
13854100336 latest-all.nt
Mon Aug 17 11:07:07 CEST 2020
Start and progress
egrep "000\.000\.000" -A1 tdb2-err.log
nohup ./wikidata2jena&
tail -f tdb2-err.log
19:54:45 INFO loader :: Loader = LoaderPhased
19:54:45 INFO loader :: Start: latest-all.nt
19:54:50 INFO loader :: Add: 500.000 latest-all.nt (Batch: 95.474 / Avg: 95.474)
19:54:53 INFO loader :: Add: 1.000.000 latest-all.nt (Batch: 186.846 / Avg: 126.374)
19:54:55 INFO loader :: Add: 1.500.000 latest-all.nt (Batch: 207.986 / Avg: 145.391)
19:54:58 INFO loader :: Add: 2.000.000 latest-all.nt (Batch: 166.500 / Avg: 150.150)
19:55:03 INFO loader :: Add: 2.500.000 latest-all.nt (Batch: 100.603 / Avg: 136.686)
19:55:08 INFO loader :: Add: 3.000.000 latest-all.nt (Batch: 100.745 / Avg: 129.015)
19:55:13 INFO loader :: Add: 3.500.000 latest-all.nt (Batch: 99.226 / Avg: 123.709)
19:55:18 INFO loader :: Add: 4.000.000 latest-all.nt (Batch: 99.383 / Avg: 120.037)
19:55:24 INFO loader :: Add: 4.500.000 latest-all.nt (Batch: 94.589 / Avg: 116.553)
19:55:29 INFO loader :: Add: 5.000.000 latest-all.nt (Batch: 89.269 / Avg: 113.096)
19:55:29 INFO loader :: Elapsed: 44,21 seconds [2020/08/16 19:55:29 MESZ]
19:55:35 INFO loader :: Add: 5.500.000 latest-all.nt (Batch: 93.861 / Avg: 111.028)
...
22:02:30 INFO loader :: Add: 1.000.000.000 latest-all.nt (Batch: 103.199 / Avg: 130.468)
22:02:30 INFO loader :: Elapsed: 7.664,71 seconds [2020/08/16 22:02:30 MESZ]
...
00:30:49 INFO loader :: Add: 2.000.000.000 latest-all.nt (Batch: 117.647 / Avg: 120.741)
00:30:49 INFO loader :: Elapsed: 16.564,27 seconds [2020/08/17 00:30:49 MESZ]
--
03:00:51 INFO loader :: Add: 3.000.000.000 latest-all.nt (Batch: 112.688 / Avg: 117.341)
03:00:51 INFO loader :: Elapsed: 25.566,39 seconds [2020/08/17 03:00:51 MESZ]
--
06:33:58 INFO loader :: Add: 4.000.000.000 latest-all.nt (Batch: 41.569 / Avg: 104.293)
06:33:58 INFO loader :: Elapsed: 38.353,15 seconds [2020/08/17 06:33:58 MESZ]
...
18:15:56 INFO loader :: Add: 5.000.000.000 latest-all.nt (Batch: 26.435 / Avg: 62.134)
18:15:56 INFO loader :: Elapsed: 80.471,07 seconds [2020/08/17 18:15:56 MESZ]
...
01:36:06 INFO loader :: Add: 6.000.000.000 latest-all.nt (Batch: 46.027 / Avg: 56.136)
01:36:06 INFO loader :: Elapsed: 106.881,45 seconds [2020/08/18 01:36:06 MESZ]
...
07:45:01 INFO loader :: Add: 7.000.000.000 latest-all.nt (Batch: 40.769 / Avg: 54.257)
07:45:01 INFO loader :: Elapsed: 129.015,49 seconds [2020/08/18 07:45:01 MESZ]
...
14:49:08 INFO loader :: Add: 8.000.000.000 latest-all.nt (Batch: 39.607 / Avg: 51.792)
14:49:08 INFO loader :: Elapsed: 154.462,80 seconds [2020/08/18 14:49:08 MESZ]
...
23:42:25 INFO loader :: Add: 9.000.000.000 latest-all.nt (Batch: 30.543 / Avg: 48.267)
23:42:25 INFO loader :: Elapsed: 186.460,13 seconds [2020/08/18 23:42:25 MESZ]
...
09:28:31 INFO loader :: Add: 10.000.000.000 latest-all.nt (Batch: 33.464 / Avg: 45.121)
09:28:31 INFO loader :: Elapsed: 221.625,86 seconds [2020/08/19 09:28:31 MESZ]
...
20:20:57 INFO loader :: Add: 11.000.000.000 latest-all.nt (Batch: 26.867 / Avg: 42.182)
20:20:57 INFO loader :: Elapsed: 260.772,17 seconds [2020/08/19 20:20:57 MESZ]
...
08:01:14 INFO loader :: Add: 12.000.000.000 latest-all.nt (Batch: 25.588 / Avg: 39.631)
08:01:14 INFO loader :: Elapsed: 302.788,94 seconds [2020/08/20 08:01:14 MESZ]
...
20:38:09 INFO loader :: Add: 13.000.000.000 latest-all.nt (Batch: 24.601 / Avg: 37.334)
20:38:09 INFO loader :: Elapsed: 348.203,59 seconds [2020/08/20 20:38:09 MESZ]
...
07:58:23 INFO loader :: Add: 13.850.000.000 latest-all.nt (Batch: 18.268 / Avg: 35.602)
07:58:23 INFO loader :: Elapsed: 389.017,56 seconds [2020/08/21 07:58:23 MESZ]
--
08:12:49 INFO loader :: Add: 10.000.000 Index (Batch: 175.284 / Avg: 199.668)
08:12:49 INFO loader :: Elapsed: 50,08 seconds [2020/08/21 08:12:49 MESZ]
--
08:22:19 INFO loader :: Add: 110.000.000 Index (Batch: 190.114 / Avg: 177.441)
08:22:19 INFO loader :: Elapsed: 619,92 seconds [2020/08/21 08:22:19 MESZ]
...
11:32:39 INFO loader :: Add: 1.000.000.000 Index (Batch: 45.949 / Avg: 83.060)
11:32:39 INFO loader :: Elapsed: 12.039,42 seconds [2020/08/21 11:32:39 MESZ]
...
06:36:51 INFO loader :: Add: 2.000.000.000 Index (Batch: 11.018 / Avg: 24.785)
06:36:51 INFO loader :: Elapsed: 80.691,88 seconds [2020/08/22 06:36:51 MESZ]
...
19:31:22 INFO loader :: Add: 3.000.000.000 Index (Batch: 47.892 / Avg: 23.591)
19:31:22 INFO loader :: Elapsed: 127.162,89 seconds [2020/08/22 19:31:22 MESZ]
...
06:57:00 INFO loader :: Add: 4.000.000.000 Index (Batch: 22.606 / Avg: 23.767)
06:57:00 INFO loader :: Elapsed: 168.300,31 seconds [2020/08/23 06:57:00 MESZ]
...
15:02:37 INFO loader :: Add: 5.000.000.000 Index (Batch: 35.418 / Avg: 25.324)
15:02:37 INFO loader :: Elapsed: 197.437,88 seconds [2020/08/23 15:02:37 MESZ]
...
23:55:44 INFO loader :: Add: 6.000.000.000 Index (Batch: 62.790 / Avg: 26.152)
23:55:44 INFO loader :: Elapsed: 229.424,58 seconds [2020/08/23 23:55:44 MESZ]
...
07:56:25 INFO loader :: Add: 7.000.000.000 Index (Batch: 36.004 / Avg: 27.103)
07:56:25 INFO loader :: Elapsed: 258.265,50 seconds [2020/08/24 07:56:25 MESZ]
...
15:16:24 INFO loader :: Add: 8.000.000.000 Index (Batch: 33.847 / Avg: 28.103)
15:16:24 INFO loader :: Elapsed: 284.664,53 seconds [2020/08/24 15:16:24 MESZ]
...
23:47:53 INFO loader :: Add: 9.000.000.000 Index (Batch: 56.129 / Avg: 28.539)
23:47:53 INFO loader :: Elapsed: 315.353,50 seconds [2020/08/24 23:47:53 MESZ]
...
07:46:36 INFO loader :: Add: 10.000.000.000 Index (Batch: 30.033 / Avg: 29.063)
07:46:36 INFO loader :: Elapsed: 344.076,53 seconds [2020/08/25 07:46:36 MESZ]
...
16:00:04 INFO loader :: Add: 11.000.000.000 Index (Batch: 55.282 / Avg: 29.436)
16:00:04 INFO loader :: Elapsed: 373.684,63 seconds [2020/08/25 16:00:04 MESZ]
...
01:35:49 INFO loader :: Add: 12.000.000.000 Index (Batch: 36.424 / Avg: 29.395)
01:35:50 INFO loader :: Elapsed: 408.230,25 seconds [2020/08/26 01:35:49 MESZ]
...
11:17:08 INFO loader :: Add: 13.000.000.000 Index (Batch: 26.727 / Avg: 29.338)
11:17:08 INFO loader :: Elapsed: 443.108,84 seconds [2020/08/26 11:17:08 MESZ]
...
16:55:00 INFO loader :: Add: 13.620.000.000 Index (Batch: 37.603 / Avg: 29.392)
16:55:00 INFO loader :: Elapsed: 463.380,28 seconds [2020/08/26 16:55:00 MESZ]
...
17:25:28 INFO loader :: Time = 855.042,638 seconds : Triples = 13.854.100.336 : Rate = 16.203 /s
disk usage
At 13.6 billion indexed triples:
du -sm *
2002139 data
0 tmp
When finished:
du -sm data/
2003251 data/
Scripts
wikidata2jena
#!/bin/bash
# WF 2020-05-10
# global settings
# Jena release to use and the archive it ships in
jena="apache-jena-3.16.0"
tgz="${jena}.tar.gz"
# loader executable inside the unpacked Jena distribution
tdbloader="${jena}/bin/tdb2.tdbloader"
# mirror location of the release archive
jenaurl="http://mirror.easyname.ch/apache/jena/binaries/${tgz}"
# working area on the target volume and the dataset directory inside it
base="/Volumes/Torterra/wikidata2020-08-15"
data="${base}/data"
getjena() {
  # Make sure the Jena distribution and the data directory are available.
  #
  # Steps (each one is skipped when its result already exists):
  #   1. download the archive $tgz from $jenaurl into the current directory
  #   2. unpack $tgz, which is expected to create the directory $jena
  #   3. create the data directory $data
  #
  # Globals read: tgz, jenaurl, jena, data
  # Returns: 0 when all three steps succeeded or were already done,
  #          1 as soon as download, unpack or directory creation fails.

  # download
  if [ ! -f "$tgz" ]; then
    echo "downloading $tgz from $jenaurl"
    if ! wget "$jenaurl"; then
      echo "download of $jenaurl failed" >&2
      # drop a partial archive so the next run does not report it as
      # "already downloaded"
      rm -f -- "$tgz"
      return 1
    fi
  else
    echo "$tgz already downloaded"
  fi

  # unpack
  if [ ! -d "$jena" ]; then
    echo "unpacking $jena from $tgz"
    if ! tar xvzf "$tgz"; then
      echo "unpacking $tgz failed" >&2
      return 1
    fi
  else
    echo "$jena already unpacked"
  fi

  # create data directory
  if [ ! -d "$data" ]; then
    echo "creating $data directory"
    if ! mkdir -p "$data"; then
      echo "creating $data directory failed" >&2
      return 1
    fi
  else
    echo "$data directory already created"
  fi
}
#
# show the given timestamp
#
timestamp() {
  # Print the message given as $1 followed by " at " and the current UTC
  # time formatted as YYYY-MM-DDTHH:MM:SSZ.
  local now
  now="$(date -u '+%Y-%m-%dT%H:%M:%SZ')"
  printf '%s at %s\n' "$1" "$now"
}
#
# load data for the given data dir and input
#
loaddata() {
  # Run the loader $tdbloader on one input file.
  #
  # Arguments:
  #   $1 - data:  dataset location, passed to the loader as --loc
  #   $2 - input: file to load
  # Globals read: tdbloader
  # Outputs: start/finish (or failure) messages via timestamp; the loader's
  #          stdout goes to tdb2-out.log and its stderr to tdb2-err.log in
  #          the current directory (both are overwritten on every call).
  # Returns: the loader's exit status.
  local data="$1"
  local input="$2"
  local status=0

  timestamp "start loading $input to $data"
  "$tdbloader" --loc "$data" "$input" > tdb2-out.log 2> tdb2-err.log || status=$?
  if [ "$status" -ne 0 ]; then
    # do not claim success for a load that aborted
    timestamp "failed loading $input to $data (exit code $status)" >&2
    return "$status"
  fi
  timestamp "finished loading $input to $data"
}
# Main sequence: prepare Jena and the data directory, point TMPDIR at a
# directory below $base, then start the import of latest-all.nt.
# Setup failures stop the script here instead of starting the load anyway.
getjena || exit 1
# exported so that the loader process started by loaddata inherits it
export TMPDIR="$base/tmp"
if [ ! -d "$TMPDIR" ]; then
  echo "creating temporary directory $TMPDIR"
  mkdir -p "$TMPDIR" || exit 1
else
  echo "using temporary directory $TMPDIR"
fi
loaddata "$data" latest-all.nt
check extract speed
./speed
149 MB in 3 s = 49 MB/s = 172 GB/h
315/2014 GB (15 %) todo: 1699 GB
ETA 9.9 h
speed
#!/bin/bash
# WF 2020-08-16
# check extract speed
#
# check the speed of extraction
#
checkSpeed() {
  # Estimate how fast a growing file is being written and how long is left.
  #
  # The size of the file is sampled twice with "du -sm", l_secs seconds
  # apart, and three lines are printed:
  #   <MB> MB in <s> s = <MB/s> MB/s = <GB/h> GB/h
  #   <done>/<total> GB (<percent> %) todo: <todo> GB
  #   ETA <hours> h
  # All rates use truncating integer arithmetic; only the ETA is fractional.
  # When no growth is measured the last line is "ETA unknown (no progress
  # measured)" instead of dividing by zero.
  #
  # Arguments:
  #   $1 - l_secs:  seconds between the two samples (positive integer)
  #   $2 - l_total: expected final size in bytes (at least 1 GiB)
  #   $3 - file to watch (optional, default: latest-all.nt)
  # Returns: 0 on success, 1 on invalid arguments or if the size of the
  #          file cannot be determined.
  local l_secs="$1"
  local l_total="$2"
  local l_file="${3:-latest-all.nt}"
  local totalgb first second mbytes mpersec gperh donegb todogb percent

  if ! [[ "$l_secs" =~ ^[1-9][0-9]*$ ]]; then
    echo "checkSpeed: seconds must be a positive integer, got '$l_secs'" >&2
    return 1
  fi
  if ! [[ "$l_total" =~ ^[0-9]+$ ]]; then
    echo "checkSpeed: total size must be a byte count, got '$l_total'" >&2
    return 1
  fi
  totalgb=$(( l_total / 1024 / 1024 / 1024 ))
  if (( totalgb <= 0 )); then
    # the percentage below divides by the total in GB
    echo "checkSpeed: total size must be at least 1 GiB" >&2
    return 1
  fi

  first=$(du -sm "$l_file" | cut -f1)
  sleep "$l_secs"
  second=$(du -sm "$l_file" | cut -f1)
  if ! [[ "$first" =~ ^[0-9]+$ && "$second" =~ ^[0-9]+$ ]]; then
    echo "checkSpeed: cannot determine size of $l_file" >&2
    return 1
  fi

  mbytes=$(( second - first ))
  mpersec=$(( mbytes / l_secs ))
  gperh=$(( mpersec * 3600 / 1024 ))
  echo "$mbytes MB in $l_secs s = $mpersec MB/s = $gperh GB/h"

  donegb=$(( second / 1024 ))
  todogb=$(( totalgb - donegb ))
  percent=$(( donegb * 100 / totalgb ))
  echo "$donegb/$totalgb GB ($percent %) todo: $todogb GB"

  if (( gperh <= 0 )); then
    # stalled (or too slow to register in whole GB/h): no meaningful ETA
    echo "ETA unknown (no progress measured)"
    return 0
  fi
  awk -v todo="$todogb" -v gperh="$gperh" \
    'BEGIN { printf("ETA %.1f h\n", todo / gperh) }'
}
# sample the extraction for 3 seconds against an expected final size of 2162713035569 bytes
checkSpeed 3 2162713035569