Difference between revisions of "WikiData Import 2020-07-30"

From BITPlan Wiki
Jump to navigation Jump to search
 
(49 intermediate revisions by the same user not shown)
Line 1: Line 1:
 +
see also {{Link|target=Get_your_own_copy_of_WikiData}} - this was the sixth attempt after four failures and one success with a truth date set. The main success hope was to use a 4 TB SSD disk which
 +
was kindly supplied by the [https://projects.tib.eu/en/confident/ ConfIDent project]
 +
= Environment =
 +
# Mac Pro Mid 2010
 +
# 12 core 3.46 GHz
 +
# 64 GB RAM
 +
# macOS High Sierra 10.13.6
 +
# Source Disk: 4 TB 7200 rpm hard disk WD Gold WDC WD4002FYYZ Blackmagic speed rating: 175 MB/s write 175 MB/s read
 +
# Target Disk: 4 TB SSD Samsung 860 EVO Blackmagic speed rating: 257 MB/s write 270 MB/s read
 +
# <source lang='bash' highlight='1'>
 +
java -version
 +
openjdk version "11.0.5" 2019-10-15
 +
OpenJDK Runtime Environment AdoptOpenJDK (build 11.0.5+10)
 +
OpenJDK 64-Bit Server VM AdoptOpenJDK (build 11.0.5+10, mixed mode)
 +
</source>
 +
= Summary =
 +
Import failed due to power outage at some 96% of the import after some 14 days
 +
# trying to replicate success story of https://issues.apache.org/jira/projects/JENA/issues/JENA-1909
 +
# download of 110 GB took some 6 h 30 min
 +
# unzipping to some 2160 GB took more than 1/2 day
 +
# import of some 13 billion triples was almost finished at 13.2 billion triples after some 14 days when there was a thunder storm - a half second power outage - the UPS kicked in but could not prevent the server from halting leaving the import incomplete shortly before it would have been finished ...
 +
 
= Download and unpack =
 
= Download and unpack =
 
This download was done with the "latest-all.nt" dataset.
 
This download was done with the "latest-all.nt" dataset.
Line 20: Line 42:
 
# retry next morning
 
# retry next morning
 
nohup bzip2 -ckd latest-all.nt.bz2 > /Volumes/Torterra/wikidata2020-07-31/latest-all.nt&
 
nohup bzip2 -ckd latest-all.nt.bz2 > /Volumes/Torterra/wikidata2020-07-31/latest-all.nt&
 +
...
 +
ls -l latest-all.nt
 +
-rw-r--r--  1 wf  admin  2162713035569 Jul 31 22:24 latest-all.nt
 
</source>
 
</source>
 
== bzip2 issue ==
 
== bzip2 issue ==
Line 29: Line 54:
 
bzip2: Deleting output file latest-all.nt, if it exists.
 
bzip2: Deleting output file latest-all.nt, if it exists.
 
</source>
 
</source>
 +
 
= Counting and copying =
 
= Counting and copying =
 
<source lang='bash'>
 
<source lang='bash'>
Line 39: Line 65:
 
Sat Aug  1 10:52:57 CEST 2020
 
Sat Aug  1 10:52:57 CEST 2020
 
</source>
 
</source>
 +
= Start and progress =
 +
 +
<source  lang='bash' highlight='1-2'>
 +
nohup ./wikidata2jena&
 +
tail -f tdb2-err.log
 +
13:54:03 INFO  loader          :: Add: 25.000.000 latest-all.nt (Batch: 103.369 / Avg: 104.489)
 +
13:54:03 INFO  loader          ::  Elapsed: 239,26 seconds [2020/08/01 13:54:03 MESZ]
 +
...
 +
19:27:43 INFO  loader          :: Add: 1.000.000.000 latest-all.nt (Batch: 17.564 / Avg: 49.359)
 +
19:27:43 INFO  loader          ::  Elapsed: 20.259,39 seconds [2020/08/01 19:27:43 MESZ]
 +
...
 +
06:53:39 INFO  loader          :: Add: 2.000.000.000 latest-all.nt (Batch: 27.135 / Avg: 32.564)
 +
06:53:39 INFO  loader          ::  Elapsed: 61.415,66 seconds [2020/08/02 06:53:39 MESZ]
 +
...
 +
16:46:07 INFO  loader          :: Add: 3.000.000.000 latest-all.nt (Batch: 40.943 / Avg: 30.939)
 +
16:46:07 INFO  loader          ::  Elapsed: 96.963,08 seconds [2020/08/02 16:46:07 MESZ]
 +
...
 +
02:47:19 INFO  loader          :: Add: 4.000.000.000 latest-all.nt (Batch: 18.551 / Avg: 30.067)
 +
02:47:19 INFO  loader          ::  Elapsed: 133.034,73 seconds [2020/08/03 02:47:19 MESZ]
 +
...
 +
17:43:29 INFO  loader          :: Add: 5.000.000.000 latest-all.nt (Batch: 11.246 / Avg: 26.765)
 +
17:43:29 INFO  loader          ::  Elapsed: 186.805,39 seconds [2020/08/03 17:43:29 MESZ]
 +
...
 +
13:12:48 INFO  loader          :: Add: 6.000.000.000 latest-all.nt (Batch: 7.488 / Avg: 23.349)
 +
13:12:48 INFO  loader          ::  Elapsed: 256.964,05 seconds [2020/08/04 13:12:48 MESZ]
 +
...
 +
11:41:39 INFO  loader          :: Add: 7.000.000.000 latest-all.nt (Batch: 11.798 / Avg: 20.716)
 +
11:41:39 INFO  loader          ::  Elapsed: 337.895,22 seconds [2020/08/05 11:41:39 MESZ]
 +
...
 +
21:02:39 INFO  loader          :: Add: 8.000.000.000 latest-all.nt (Batch: 7.732 / Avg: 17.468)
 +
21:02:39 INFO  loader          ::  Elapsed: 457.955,13 seconds [2020/08/06 21:02:39 MESZ]
 +
...
 +
21:04:16 INFO  loader          :: Add: 9.000.000.000 latest-all.nt (Batch: 9.469 / Avg: 16.530)
 +
21:04:16 INFO  loader          ::  Elapsed: 544.452,69 seconds [2020/08/07 21:04:16 MESZ]
 +
...
 +
14:48:25 INFO  loader          :: Add: 10.000.000.000 latest-all.nt (Batch: 14.865 / Avg: 14.394)
 +
14:48:25 INFO  loader          ::  Elapsed: 694.700,94 seconds [2020/08/09 14:48:25 MESZ]
 +
...
 +
02:34:34 INFO  loader          :: Add: 11.000.000.000 latest-all.nt (Batch: 6.112 / Avg: 13.358)
 +
02:34:37 INFO  loader          ::  Elapsed: 823.470,00 seconds [2020/08/11 02:34:34 MESZ]
 +
...
 +
22:43:55 INFO  loader          :: Add: 12.000.000.000 latest-all.nt (Batch: 8.038 / Avg: 12.214)
 +
22:43:55 INFO  loader          ::  Elapsed: 982.431,06 seconds [2020/08/12 22:43:55 MESZ]
 +
...
 +
07:24:11 INFO  loader          :: Add: 12.695.000.000 latest-all.nt (Batch: 4.670 / Avg: 11.540)
 +
07:24:11 INFO  loader          ::  Elapsed: 1.100.047,38 seconds [2020/08/14 07:24:11 MESZ]
 +
...
 +
16:21:21 INFO  loader          :: Add: 13.210.000.000 latest-all.nt (Batch: 7.249 / Avg: 10.839)
 +
16:21:21 INFO  loader          ::  Elapsed: 1.218.677,50 seconds [2020/08/15 16:21:21 MESZ]
 +
...
 +
</source>
 +
 +
= Scripts =
 +
== wikidata2jena ==
 +
<source lang='bash'>
 +
#!/bin/bash
 +
# WF 2020-05-10
 +
 +
# global settings
 +
jena=apache-jena-3.16.0
 +
tgz=$jena.tar.gz
 +
jenaurl=http://mirror.easyname.ch/apache/jena/binaries/$tgz
 +
base=/Volumes/Torterra/wikidata2020-08-01
 +
data=$base/data
 +
tdbloader=$jena/bin/tdb2.tdbloader
  
 +
getjena() {
 +
# download
 +
if [ ! -f $tgz ]
 +
then
 +
  echo "downloading $tgz from $jenaurl"
 +
wget $jenaurl
 +
else
 +
  echo "$tgz already downloaded"
 +
fi
 +
# unpack
 +
if [ ! -d $jena ]
 +
then
 +
  echo "unpacking $jena from $tgz"
 +
tar xvzf $tgz
 +
else
 +
  echo "$jena already unpacked"
 +
fi
 +
# create data directory
 +
if [ ! -d $data ]
 +
then
 +
  echo "creating $data directory"
 +
  mkdir -p $data
 +
else
 +
  echo "$data directory already created"
 +
fi
 +
}
 +
 +
#
 +
# show the given timestamp
 +
#
 +
timestamp() {
 +
local msg="$1"
 +
local ts=$(date -u +"%Y-%m-%dT%H:%M:%SZ")
 +
echo "$msg at $ts"
 +
}
 +
 +
#
 +
# load data for the given data dir and input
 +
#
 +
loaddata() {
 +
local data="$1"
 +
local input="$2"
 +
  timestamp "start loading $input to $data"
 +
  $tdbloader --loader=parallel --loc "$data" "$input" > tdb2-out.log 2> tdb2-err.log
 +
timestamp "finished loading $input to $data"
 +
}
 +
 +
getjena
 +
export TMPDIR=$base/tmp
 +
if [ ! -d $TMPDIR ]
 +
then
 +
  echo "creating temporary directory $TMPDIR"
 +
  mkdir $TMPDIR
 +
else
 +
  echo "using temporary directory $TMPDIR"
 +
fi
 +
loaddata $data latest-all.nt
 +
</source>
 
[[Category:WikiData]]
 
[[Category:WikiData]]

Latest revision as of 06:52, 17 August 2020

see also Get_your_own_copy_of_WikiData - this was the sixth attempt after four failures and one success with a truth date set. The main hope for success was to use a 4 TB SSD disk, which was kindly supplied by the ConfIDent project.

Environment

  1. Mac Pro Mid 2010
  2. 12 core 3.46 GHz
  3. 64 GB RAM
  4. macOS High Sierra 10.13.6
  5. Source Disk: 4 TB 7200 rpm hard disk WD Gold WDC WD4002FYYZ Blackmagic speed rating: 175 MB/s write 175 MB/s read
  6. Target Disk: 4 TB SSD Samsung 860 EVO Blackmagic speed rating: 257 MB/s write 270 MB/s read
  7. java -version
    openjdk version "11.0.5" 2019-10-15
    OpenJDK Runtime Environment AdoptOpenJDK (build 11.0.5+10)
    OpenJDK 64-Bit Server VM AdoptOpenJDK (build 11.0.5+10, mixed mode)
    

Summary

Import failed due to power outage at some 96% of the import after some 14 days

  1. trying to replicate success story of https://issues.apache.org/jira/projects/JENA/issues/JENA-1909
  2. download of 110 GB took some 6 h 30 min
  3. unzipping to some 2160 GB took more than 1/2 day
  4. import of some 13 billion triples was almost finished at 13.2 billion triples after some 14 days when there was a thunder storm - a half second power outage - the UPS kicked in but could not prevent the server from halting leaving the import incomplete shortly before it would have been finished ...

Download and unpack

This download was done with the "latest-all.nt" dataset.

wget https://dumps.wikimedia.org/wikidatawiki/entities/latest-all.nt.bz2

--2020-07-30 06:40:18--  https://dumps.wikimedia.org/wikidatawiki/entities/latest-all.nt.bz2
Resolving dumps.wikimedia.org (dumps.wikimedia.org)... 2620::861:1:208:80:154:7, 208.80.154.7
Connecting to dumps.wikimedia.org (dumps.wikimedia.org)|2620::861:1:208:80:154:7|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 118776910150 (111G) [application/octet-stream]
Saving to: ‘latest-all.nt.bz2’
latest-all.nt.bz2     0%[                    ] 635.10M  5.02MB/s    eta 6h 19m
...
latest-all.nt.bz2   100%[===================>] 110.62G  4.88MB/s    in 6h 31m  
2020-07-30 13:11:49 (4.82 MB/s) - ‘latest-all.nt.bz2’ saved [118776910150/118776910150]
bzip2 -dk latest-all.nt.bz2
ls -l latest-all.nt
-rw-------  1 wf  admin  1980899328 Jul 30 17:56 latest-all.nt
# failed due to limited disk space
# retry next morning
nohup bzip2 -ckd latest-all.nt.bz2 > /Volumes/Torterra/wikidata2020-07-31/latest-all.nt&
...
ls -l latest-all.nt 
-rw-r--r--  1 wf  admin  2162713035569 Jul 31 22:24 latest-all.nt

bzip2 issue

the first bzip failed - see retry for how bzip is used to extract from rotating disk to SSD

bzip2: I/O or other error, bailing out.  Possible reason follows.
bzip2: No space left on device
	Input file = latest-all.nt.bz2, output file = latest-all.nt
bzip2: Deleting output file latest-all.nt, if it exists.

Counting and copying

date;cp -p /Volumes/Torterra/wikidata2020-07-31/latest-all.nt .;date
Sat Aug  1 07:42:32 CEST 2020
Sat Aug  1 12:21:06 CEST 2020
date;wc -l latest-all.nt;date
Sat Aug  1 07:42:43 CEST 2020
 13738317356 latest-all.nt
Sat Aug  1 10:52:57 CEST 2020

Start and progress

nohup ./wikidata2jena&
tail -f tdb2-err.log 
13:54:03 INFO  loader          :: Add: 25.000.000 latest-all.nt (Batch: 103.369 / Avg: 104.489)
13:54:03 INFO  loader          ::   Elapsed: 239,26 seconds [2020/08/01 13:54:03 MESZ]
...
19:27:43 INFO  loader          :: Add: 1.000.000.000 latest-all.nt (Batch: 17.564 / Avg: 49.359)
19:27:43 INFO  loader          ::   Elapsed: 20.259,39 seconds [2020/08/01 19:27:43 MESZ]
...
06:53:39 INFO  loader          :: Add: 2.000.000.000 latest-all.nt (Batch: 27.135 / Avg: 32.564)
06:53:39 INFO  loader          ::   Elapsed: 61.415,66 seconds [2020/08/02 06:53:39 MESZ]
...
16:46:07 INFO  loader          :: Add: 3.000.000.000 latest-all.nt (Batch: 40.943 / Avg: 30.939)
16:46:07 INFO  loader          ::   Elapsed: 96.963,08 seconds [2020/08/02 16:46:07 MESZ]
...
02:47:19 INFO  loader          :: Add: 4.000.000.000 latest-all.nt (Batch: 18.551 / Avg: 30.067)
02:47:19 INFO  loader          ::   Elapsed: 133.034,73 seconds [2020/08/03 02:47:19 MESZ]
...
17:43:29 INFO  loader          :: Add: 5.000.000.000 latest-all.nt (Batch: 11.246 / Avg: 26.765)
17:43:29 INFO  loader          ::   Elapsed: 186.805,39 seconds [2020/08/03 17:43:29 MESZ]
...
13:12:48 INFO  loader          :: Add: 6.000.000.000 latest-all.nt (Batch: 7.488 / Avg: 23.349)
13:12:48 INFO  loader          ::   Elapsed: 256.964,05 seconds [2020/08/04 13:12:48 MESZ]
...
11:41:39 INFO  loader          :: Add: 7.000.000.000 latest-all.nt (Batch: 11.798 / Avg: 20.716)
11:41:39 INFO  loader          ::   Elapsed: 337.895,22 seconds [2020/08/05 11:41:39 MESZ]
...
21:02:39 INFO  loader          :: Add: 8.000.000.000 latest-all.nt (Batch: 7.732 / Avg: 17.468)
21:02:39 INFO  loader          ::   Elapsed: 457.955,13 seconds [2020/08/06 21:02:39 MESZ]
...
21:04:16 INFO  loader          :: Add: 9.000.000.000 latest-all.nt (Batch: 9.469 / Avg: 16.530)
21:04:16 INFO  loader          ::   Elapsed: 544.452,69 seconds [2020/08/07 21:04:16 MESZ]
...
14:48:25 INFO  loader          :: Add: 10.000.000.000 latest-all.nt (Batch: 14.865 / Avg: 14.394)
14:48:25 INFO  loader          ::   Elapsed: 694.700,94 seconds [2020/08/09 14:48:25 MESZ]
...
02:34:34 INFO  loader          :: Add: 11.000.000.000 latest-all.nt (Batch: 6.112 / Avg: 13.358)
02:34:37 INFO  loader          ::   Elapsed: 823.470,00 seconds [2020/08/11 02:34:34 MESZ]
...
22:43:55 INFO  loader          :: Add: 12.000.000.000 latest-all.nt (Batch: 8.038 / Avg: 12.214)
22:43:55 INFO  loader          ::   Elapsed: 982.431,06 seconds [2020/08/12 22:43:55 MESZ]
...
07:24:11 INFO  loader          :: Add: 12.695.000.000 latest-all.nt (Batch: 4.670 / Avg: 11.540)
07:24:11 INFO  loader          ::   Elapsed: 1.100.047,38 seconds [2020/08/14 07:24:11 MESZ]
...
16:21:21 INFO  loader          :: Add: 13.210.000.000 latest-all.nt (Batch: 7.249 / Avg: 10.839)
16:21:21 INFO  loader          ::   Elapsed: 1.218.677,50 seconds [2020/08/15 16:21:21 MESZ]
...

Scripts

wikidata2jena

#!/bin/bash
# WF 2020-05-10

# global settings
# Jena release; the archive name, the unpack directory check and the loader
# path below are all derived from this one value
jena="apache-jena-3.16.0"
tgz="${jena}.tar.gz"
# URL the archive is fetched from when it is not present locally
jenaurl="http://mirror.easyname.ch/apache/jena/binaries/${tgz}"
# base directory of this import; data (loader --loc target) lives below it
base="/Volumes/Torterra/wikidata2020-08-01"
data="${base}/data"
# loader executable, relative to the current directory
tdbloader="${jena}/bin/tdb2.tdbloader"

#
# make sure the Jena archive is downloaded and unpacked and that the
# data directory exists
#
# Globals read: tgz, jenaurl, jena, data
# Outputs:      one progress line per step on stdout
# Returns:      0 on success; the failing command's status as soon as the
#               download, the unpacking or the directory creation fails
#
getjena() {
  # download
  if [ ! -f "$tgz" ]
  then
    echo "downloading $tgz from $jenaurl"
    # without the archive there is nothing to unpack - stop right here
    wget "$jenaurl" || return
  else
    echo "$tgz already downloaded"
  fi
  # unpack
  if [ ! -d "$jena" ]
  then
    echo "unpacking $jena from $tgz"
    tar xvzf "$tgz" || return
  else
    echo "$jena already unpacked"
  fi
  # create data directory
  if [ ! -d "$data" ]
  then
    echo "creating $data directory"
    mkdir -p "$data" || return
  else
    echo "$data directory already created"
  fi
}

#
# print the given message followed by " at " and the current UTC time
# in the form YYYY-MM-DDTHH:MM:SSZ
#
# params:
#   $1 - the message to show
#
timestamp() {
  printf '%s at %s\n' "$1" "$(date -u +"%Y-%m-%dT%H:%M:%SZ")"
}

#
# load data for the given data dir and input
#
# params:
#   $1 - dataset location handed to the loader via --loc
#   $2 - input file to load
# Globals read: tdbloader
# Outputs:      start/finish messages on stdout (failure message on stderr);
#               loader stdout goes to tdb2-out.log and loader stderr to
#               tdb2-err.log in the current directory
# Returns:      the exit status of the loader
#
loaddata() {
  local data="$1"
  local input="$2"
  local status=0
  timestamp "start loading $input to $data"
  "$tdbloader" --loader=parallel --loc "$data" "$input" > tdb2-out.log 2> tdb2-err.log || status=$?
  # only claim success when the loader really succeeded
  if [ "$status" -eq 0 ]
  then
    timestamp "finished loading $input to $data"
  else
    timestamp "failed loading $input to $data (exit status $status)" >&2
  fi
  return "$status"
}

# main flow: prepare Jena and the data directory, set up the temporary
# directory, then run the import
# do not start the load when the preparation step reports a failure
getjena || exit 1
# NOTE(review): presumably TMPDIR is exported so that temporary files of the
# loader end up below $base - confirm
export TMPDIR="$base/tmp"
if [ ! -d "$TMPDIR" ]
then
  echo "creating temporary directory $TMPDIR"
  mkdir -p "$TMPDIR" || exit 1
else
  echo "using temporary directory $TMPDIR"
fi
# last command: its exit status becomes the exit status of the script
loaddata "$data" latest-all.nt