Difference between revisions of "WikiData Import 2020-07-30"
Jump to navigation
Jump to search
(49 intermediate revisions by the same user not shown) | |||
Line 1: | Line 1: | ||
+ | see also {{Link|target=Get_your_own_copy_of_WikiData}} - this was the sixth attempt after four failures and one success with a truth date set. The main success hope was to use a 4 TB SSD disk which | ||
+ | was kindly supplied by the [https://projects.tib.eu/en/confident/ ConfIDent project] | ||
+ | = Environment = | ||
+ | # Mac Pro Mid 2010 | ||
+ | # 12 core 3.46 GHz | ||
+ | # 64 GB RAM | ||
+ | # macOS High Sierra 10.13.6 | ||
+ | # Source Disk: 4 TB 7200 rpm hard disk WD Gold WDC WD4002FYYZ Blackmagic speed rating: 175 MB/s write 175 MB/s read | ||
+ | # Target Disk: 4 TB SSD Samsung 860 EVO Blackmagic speed rating: 257 MB/s write 270 MB/s read | ||
+ | # <source lang='bash' highlight='1'> | ||
+ | java -version | ||
+ | openjdk version "11.0.5" 2019-10-15 | ||
+ | OpenJDK Runtime Environment AdoptOpenJDK (build 11.0.5+10) | ||
+ | OpenJDK 64-Bit Server VM AdoptOpenJDK (build 11.0.5+10, mixed mode) | ||
+ | </source> | ||
+ | = Summary = | ||
+ | Import failed due to power outage at some 96% of the import after some 14 days | ||
+ | # trying to replicate success story of https://issues.apache.org/jira/projects/JENA/issues/JENA-1909 | ||
+ | # download of 110 GB took some 6 h 30 min | ||
+ | # unzipping to some 2160 GB took more than 1/2 day | ||
+ | # import of some 13 billion triples was almost finished at 13.2 billion triples after some 14 days when there was a thunder storm - a half second power outage - the UPS kicked in but could not prevent the server from halting leaving the import incomplete shortly before it would have been finished ... | ||
+ | |||
= Download and unpack = | = Download and unpack = | ||
This download was done with the "latest-all.nt" dataset. | This download was done with the "latest-all.nt" dataset. | ||
Line 20: | Line 42: | ||
# retry next morning | # retry next morning | ||
nohup bzip2 -ckd latest-all.nt.bz2 > /Volumes/Torterra/wikidata2020-07-31/latest-all.nt& | nohup bzip2 -ckd latest-all.nt.bz2 > /Volumes/Torterra/wikidata2020-07-31/latest-all.nt& | ||
+ | ... | ||
+ | ls -l latest-all.nt | ||
+ | -rw-r--r-- 1 wf admin 2162713035569 Jul 31 22:24 latest-all.nt | ||
</source> | </source> | ||
== bzip2 issue == | == bzip2 issue == | ||
Line 29: | Line 54: | ||
bzip2: Deleting output file latest-all.nt, if it exists. | bzip2: Deleting output file latest-all.nt, if it exists. | ||
</source> | </source> | ||
+ | |||
= Counting an copying = | = Counting an copying = | ||
<source lang='bash'> | <source lang='bash'> | ||
Line 39: | Line 65: | ||
Sat Aug 1 10:52:57 CEST 2020 | Sat Aug 1 10:52:57 CEST 2020 | ||
</source> | </source> | ||
+ | = Start and progress = | ||
+ | |||
+ | <source lang='bash' highlight='1-2'> | ||
+ | nohup ./wikidata2jena& | ||
+ | tail -f tdb2-err.log | ||
+ | 13:54:03 INFO loader :: Add: 25.000.000 latest-all.nt (Batch: 103.369 / Avg: 104.489) | ||
+ | 13:54:03 INFO loader :: Elapsed: 239,26 seconds [2020/08/01 13:54:03 MESZ] | ||
+ | ... | ||
+ | 19:27:43 INFO loader :: Add: 1.000.000.000 latest-all.nt (Batch: 17.564 / Avg: 49.359) | ||
+ | 19:27:43 INFO loader :: Elapsed: 20.259,39 seconds [2020/08/01 19:27:43 MESZ] | ||
+ | ... | ||
+ | 06:53:39 INFO loader :: Add: 2.000.000.000 latest-all.nt (Batch: 27.135 / Avg: 32.564) | ||
+ | 06:53:39 INFO loader :: Elapsed: 61.415,66 seconds [2020/08/02 06:53:39 MESZ] | ||
+ | ... | ||
+ | 16:46:07 INFO loader :: Add: 3.000.000.000 latest-all.nt (Batch: 40.943 / Avg: 30.939) | ||
+ | 16:46:07 INFO loader :: Elapsed: 96.963,08 seconds [2020/08/02 16:46:07 MESZ] | ||
+ | ... | ||
+ | 02:47:19 INFO loader :: Add: 4.000.000.000 latest-all.nt (Batch: 18.551 / Avg: 30.067) | ||
+ | 02:47:19 INFO loader :: Elapsed: 133.034,73 seconds [2020/08/03 02:47:19 MESZ] | ||
+ | ... | ||
+ | 17:43:29 INFO loader :: Add: 5.000.000.000 latest-all.nt (Batch: 11.246 / Avg: 26.765) | ||
+ | 17:43:29 INFO loader :: Elapsed: 186.805,39 seconds [2020/08/03 17:43:29 MESZ] | ||
+ | ... | ||
+ | 13:12:48 INFO loader :: Add: 6.000.000.000 latest-all.nt (Batch: 7.488 / Avg: 23.349) | ||
+ | 13:12:48 INFO loader :: Elapsed: 256.964,05 seconds [2020/08/04 13:12:48 MESZ] | ||
+ | ... | ||
+ | 11:41:39 INFO loader :: Add: 7.000.000.000 latest-all.nt (Batch: 11.798 / Avg: 20.716) | ||
+ | 11:41:39 INFO loader :: Elapsed: 337.895,22 seconds [2020/08/05 11:41:39 MESZ] | ||
+ | ... | ||
+ | 21:02:39 INFO loader :: Add: 8.000.000.000 latest-all.nt (Batch: 7.732 / Avg: 17.468) | ||
+ | 21:02:39 INFO loader :: Elapsed: 457.955,13 seconds [2020/08/06 21:02:39 MESZ] | ||
+ | ... | ||
+ | 21:04:16 INFO loader :: Add: 9.000.000.000 latest-all.nt (Batch: 9.469 / Avg: 16.530) | ||
+ | 21:04:16 INFO loader :: Elapsed: 544.452,69 seconds [2020/08/07 21:04:16 MESZ] | ||
+ | ... | ||
+ | 14:48:25 INFO loader :: Add: 10.000.000.000 latest-all.nt (Batch: 14.865 / Avg: 14.394) | ||
+ | 14:48:25 INFO loader :: Elapsed: 694.700,94 seconds [2020/08/09 14:48:25 MESZ] | ||
+ | ... | ||
+ | 02:34:34 INFO loader :: Add: 11.000.000.000 latest-all.nt (Batch: 6.112 / Avg: 13.358) | ||
+ | 02:34:37 INFO loader :: Elapsed: 823.470,00 seconds [2020/08/11 02:34:34 MESZ] | ||
+ | ... | ||
+ | 22:43:55 INFO loader :: Add: 12.000.000.000 latest-all.nt (Batch: 8.038 / Avg: 12.214) | ||
+ | 22:43:55 INFO loader :: Elapsed: 982.431,06 seconds [2020/08/12 22:43:55 MESZ] | ||
+ | ... | ||
+ | 07:24:11 INFO loader :: Add: 12.695.000.000 latest-all.nt (Batch: 4.670 / Avg: 11.540) | ||
+ | 07:24:11 INFO loader :: Elapsed: 1.100.047,38 seconds [2020/08/14 07:24:11 MESZ] | ||
+ | ... | ||
+ | 16:21:21 INFO loader :: Add: 13.210.000.000 latest-all.nt (Batch: 7.249 / Avg: 10.839) | ||
+ | 16:21:21 INFO loader :: Elapsed: 1.218.677,50 seconds [2020/08/15 16:21:21 MESZ] | ||
+ | ... | ||
+ | </source> | ||
+ | |||
+ | = Scripts = | ||
+ | == wikidata2jena == | ||
+ | <source lang='bash'> | ||
+ | #!/bin/bash | ||
+ | # WF 2020-05-10 | ||
+ | |||
+ | # global settings | ||
+ | jena=apache-jena-3.16.0 | ||
+ | tgz=$jena.tar.gz | ||
+ | jenaurl=http://mirror.easyname.ch/apache/jena/binaries/$tgz | ||
+ | base=/Volumes/Torterra/wikidata2020-08-01 | ||
+ | data=$base/data | ||
+ | tdbloader=$jena/bin/tdb2.tdbloader | ||
+ | getjena() { | ||
+ | # download | ||
+ | if [ ! -f $tgz ] | ||
+ | then | ||
+ | echo "downloading $tgz from $jenaurl" | ||
+ | wget $jenaurl | ||
+ | else | ||
+ | echo "$tgz already downloaded" | ||
+ | fi | ||
+ | # unpack | ||
+ | if [ ! -d $jena ] | ||
+ | then | ||
+ | echo "unpacking $jena from $tgz" | ||
+ | tar xvzf $tgz | ||
+ | else | ||
+ | echo "$jena already unpacked" | ||
+ | fi | ||
+ | # create data directory | ||
+ | if [ ! -d $data ] | ||
+ | then | ||
+ | echo "creating $data directory" | ||
+ | mkdir -p $data | ||
+ | else | ||
+ | echo "$data directory already created" | ||
+ | fi | ||
+ | } | ||
+ | |||
+ | # | ||
+ | # show the given timestamp | ||
+ | # | ||
+ | timestamp() { | ||
+ | local msg="$1" | ||
+ | local ts=$(date -u +"%Y-%m-%dT%H:%M:%SZ") | ||
+ | echo "$msg at $ts" | ||
+ | } | ||
+ | |||
+ | # | ||
+ | # load data for the given data dir and input | ||
+ | # | ||
+ | loaddata() { | ||
+ | local data="$1" | ||
+ | local input="$2" | ||
+ | timestamp "start loading $input to $data" | ||
+ | $tdbloader --loader=parallel --loc "$data" "$input" > tdb2-out.log 2> tdb2-err.log | ||
+ | timestamp "finished loading $input to $data" | ||
+ | } | ||
+ | |||
+ | getjena | ||
+ | export TMPDIR=$base/tmp | ||
+ | if [ ! -d $TMPDIR ] | ||
+ | then | ||
+ | echo "creating temporary directory $TMPDIR" | ||
+ | mkdir $TMPDIR | ||
+ | else | ||
+ | echo "using temporary directory $TMPDIR" | ||
+ | fi | ||
+ | loaddata $data latest-all.nt | ||
+ | </source> | ||
[[Category:WikiData]] | [[Category:WikiData]] |
Latest revision as of 06:52, 17 August 2020
see also Get_your_own_copy_of_WikiData - this was the sixth attempt, after four failures and one success with a truthy data set. The main hope for success was the use of a 4 TB SSD which was kindly supplied by the ConfIDent project
Environment
- Mac Pro Mid 2010
- 12 core 3.46 GHz
- 64 GB RAM
- macOS High Sierra 10.13.6
- Source Disk: 4 TB 7200 rpm hard disk WD Gold WDC WD4002FYYZ Blackmagic speed rating: 175 MB/s write 175 MB/s read
- Target Disk: 4 TB SSD Samsung 860 EVO Blackmagic speed rating: 257 MB/s write 270 MB/s read
java -version openjdk version "11.0.5" 2019-10-15 OpenJDK Runtime Environment AdoptOpenJDK (build 11.0.5+10) OpenJDK 64-Bit Server VM AdoptOpenJDK (build 11.0.5+10, mixed mode)
Summary
The import failed due to a power outage at some 96% of the import after some 14 days
- trying to replicate success story of https://issues.apache.org/jira/projects/JENA/issues/JENA-1909
- download of 110 GB took some 6 h 30 min
- unzipping to some 2160 GB took more than 1/2 day
- the import of some 13 billion triples was almost finished at 13.2 billion triples after some 14 days when a thunderstorm caused a half-second power outage - the UPS kicked in but could not prevent the server from halting, leaving the import incomplete shortly before it would have finished ...
Download and unpack
This download was done with the "latest-all.nt" dataset.
wget https://dumps.wikimedia.org/wikidatawiki/entities/latest-all.nt.bz2
--2020-07-30 06:40:18-- https://dumps.wikimedia.org/wikidatawiki/entities/latest-all.nt.bz2
Resolving dumps.wikimedia.org (dumps.wikimedia.org)... 2620::861:1:208:80:154:7, 208.80.154.7
Connecting to dumps.wikimedia.org (dumps.wikimedia.org)|2620::861:1:208:80:154:7|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 118776910150 (111G) [application/octet-stream]
Saving to: ‘latest-all.nt.bz2’
latest-all.nt.bz2 0%[ ] 635.10M 5.02MB/s eta 6h 19m
...
latest-all.nt.bz2 100%[===================>] 110.62G 4.88MB/s in 6h 31m
2020-07-30 13:11:49 (4.82 MB/s) - ‘latest-all.nt.bz2’ saved [118776910150/118776910150]
bzip2 -dk latest-all.nt.bz2
ls -l latest-all.nt
-rw------- 1 wf admin 1980899328 Jul 30 17:56 latest-all.nt
# failed due to limited disk space
# retry next morning
nohup bzip2 -ckd latest-all.nt.bz2 > /Volumes/Torterra/wikidata2020-07-31/latest-all.nt&
...
ls -l latest-all.nt
-rw-r--r-- 1 wf admin 2162713035569 Jul 31 22:24 latest-all.nt
bzip2 issue
The first bzip2 run failed - see the retry above for how bzip2 is used to extract from the rotating disk to the SSD
bzip2: I/O or other error, bailing out. Possible reason follows.
bzip2: No space left on device
Input file = latest-all.nt.bz2, output file = latest-all.nt
bzip2: Deleting output file latest-all.nt, if it exists.
Counting and copying
date;cp -p /Volumes/Torterra/wikidata2020-07-31/latest-all.nt .;date
Sat Aug 1 07:42:32 CEST 2020
Sat Aug 1 12:21:06 CEST 2020
date;wc -l latest-all.nt;date
Sat Aug 1 07:42:43 CEST 2020
13738317356 latest-all.nt
Sat Aug 1 10:52:57 CEST 2020
Start and progress
nohup ./wikidata2jena&
tail -f tdb2-err.log
13:54:03 INFO loader :: Add: 25.000.000 latest-all.nt (Batch: 103.369 / Avg: 104.489)
13:54:03 INFO loader :: Elapsed: 239,26 seconds [2020/08/01 13:54:03 MESZ]
...
19:27:43 INFO loader :: Add: 1.000.000.000 latest-all.nt (Batch: 17.564 / Avg: 49.359)
19:27:43 INFO loader :: Elapsed: 20.259,39 seconds [2020/08/01 19:27:43 MESZ]
...
06:53:39 INFO loader :: Add: 2.000.000.000 latest-all.nt (Batch: 27.135 / Avg: 32.564)
06:53:39 INFO loader :: Elapsed: 61.415,66 seconds [2020/08/02 06:53:39 MESZ]
...
16:46:07 INFO loader :: Add: 3.000.000.000 latest-all.nt (Batch: 40.943 / Avg: 30.939)
16:46:07 INFO loader :: Elapsed: 96.963,08 seconds [2020/08/02 16:46:07 MESZ]
...
02:47:19 INFO loader :: Add: 4.000.000.000 latest-all.nt (Batch: 18.551 / Avg: 30.067)
02:47:19 INFO loader :: Elapsed: 133.034,73 seconds [2020/08/03 02:47:19 MESZ]
...
17:43:29 INFO loader :: Add: 5.000.000.000 latest-all.nt (Batch: 11.246 / Avg: 26.765)
17:43:29 INFO loader :: Elapsed: 186.805,39 seconds [2020/08/03 17:43:29 MESZ]
...
13:12:48 INFO loader :: Add: 6.000.000.000 latest-all.nt (Batch: 7.488 / Avg: 23.349)
13:12:48 INFO loader :: Elapsed: 256.964,05 seconds [2020/08/04 13:12:48 MESZ]
...
11:41:39 INFO loader :: Add: 7.000.000.000 latest-all.nt (Batch: 11.798 / Avg: 20.716)
11:41:39 INFO loader :: Elapsed: 337.895,22 seconds [2020/08/05 11:41:39 MESZ]
...
21:02:39 INFO loader :: Add: 8.000.000.000 latest-all.nt (Batch: 7.732 / Avg: 17.468)
21:02:39 INFO loader :: Elapsed: 457.955,13 seconds [2020/08/06 21:02:39 MESZ]
...
21:04:16 INFO loader :: Add: 9.000.000.000 latest-all.nt (Batch: 9.469 / Avg: 16.530)
21:04:16 INFO loader :: Elapsed: 544.452,69 seconds [2020/08/07 21:04:16 MESZ]
...
14:48:25 INFO loader :: Add: 10.000.000.000 latest-all.nt (Batch: 14.865 / Avg: 14.394)
14:48:25 INFO loader :: Elapsed: 694.700,94 seconds [2020/08/09 14:48:25 MESZ]
...
02:34:34 INFO loader :: Add: 11.000.000.000 latest-all.nt (Batch: 6.112 / Avg: 13.358)
02:34:37 INFO loader :: Elapsed: 823.470,00 seconds [2020/08/11 02:34:34 MESZ]
...
22:43:55 INFO loader :: Add: 12.000.000.000 latest-all.nt (Batch: 8.038 / Avg: 12.214)
22:43:55 INFO loader :: Elapsed: 982.431,06 seconds [2020/08/12 22:43:55 MESZ]
...
07:24:11 INFO loader :: Add: 12.695.000.000 latest-all.nt (Batch: 4.670 / Avg: 11.540)
07:24:11 INFO loader :: Elapsed: 1.100.047,38 seconds [2020/08/14 07:24:11 MESZ]
...
16:21:21 INFO loader :: Add: 13.210.000.000 latest-all.nt (Batch: 7.249 / Avg: 10.839)
16:21:21 INFO loader :: Elapsed: 1.218.677,50 seconds [2020/08/15 16:21:21 MESZ]
...
Scripts
wikidata2jena
#!/bin/bash
# WF 2020-05-10
#
# global settings for the import of a Wikidata dump with Apache Jena

# name of the Apache Jena release (also the name of the unpacked directory)
jena="apache-jena-3.16.0"
# archive file the release is distributed as
tgz="${jena}.tar.gz"
# URL the archive is downloaded from
jenaurl="http://mirror.easyname.ch/apache/jena/binaries/${tgz}"
# base directory of this import
base="/Volumes/Torterra/wikidata2020-08-01"
# directory below the base directory that receives the loaded data
data="${base}/data"
# the tdb2.tdbloader executable inside the unpacked release
tdbloader="${jena}/bin/tdb2.tdbloader"
#
# make sure the Apache Jena release and the data directory are available
#
# reads the globals tgz, jenaurl, jena and data
# progress messages go to stdout, error messages to stderr
# returns 0 on success and 1 if the download, the unpacking or the
# creation of the data directory fails
#
getjena() {
  # download
  if [ ! -f "$tgz" ]
  then
    echo "downloading $tgz from $jenaurl"
    # download to a temporary name first so that an interrupted transfer
    # is not mistaken for a complete archive on the next run
    if ! wget -O "$tgz.part" "$jenaurl"
    then
      echo "download of $jenaurl failed" >&2
      rm -f -- "$tgz.part"
      return 1
    fi
    mv -- "$tgz.part" "$tgz" || return 1
  else
    echo "$tgz already downloaded"
  fi
  # unpack
  if [ ! -d "$jena" ]
  then
    echo "unpacking $jena from $tgz"
    if ! tar xvzf "$tgz"
    then
      echo "unpacking $tgz failed" >&2
      return 1
    fi
  else
    echo "$jena already unpacked"
  fi
  # create data directory
  if [ ! -d "$data" ]
  then
    echo "creating $data directory"
    if ! mkdir -p -- "$data"
    then
      echo "creating $data directory failed" >&2
      return 1
    fi
  else
    echo "$data directory already created"
  fi
  return 0
}
#
# print the given message followed by the current UTC time
# in the format YYYY-MM-DDTHH:MM:SSZ
#   $1: the message to show
#
timestamp() {
  local text="$1"
  local now
  now=$(date -u +"%Y-%m-%dT%H:%M:%SZ")
  printf '%s at %s\n' "$text" "$now"
}
#
# load data for the given data dir and input
#   $1: data directory (handed to the loader as --loc)
#   $2: input file to load
#
# reads the global tdbloader (the loader executable) and calls timestamp
# the loader's stdout is written to tdb2-out.log and its stderr to
# tdb2-err.log in the current directory
# returns the exit status of the loader
#
loaddata() {
  local data="$1"
  local input="$2"
  local rc=0
  timestamp "start loading $input to $data"
  "$tdbloader" --loader=parallel --loc "$data" "$input" > tdb2-out.log 2> tdb2-err.log || rc=$?
  # only claim success if the loader actually succeeded
  if [ "$rc" -eq 0 ]
  then
    timestamp "finished loading $input to $data"
  else
    timestamp "failed loading $input to $data (exit code $rc)" >&2
  fi
  return "$rc"
}
# make sure Jena and the data directory are available - there is no point
# in starting the load if this step fails
getjena || exit 1
# TMPDIR is exported so that the loader process started below inherits it
export TMPDIR="$base/tmp"
if [ ! -d "$TMPDIR" ]
then
  echo "creating temporary directory $TMPDIR"
  mkdir -p -- "$TMPDIR" || exit 1
else
  echo "using temporary directory $TMPDIR"
fi
# the exit status of the load is the exit status of the script
loaddata "$data" latest-all.nt