Difference between revisions of "WikiData Import 2020-07-15"

From BITPlan Wiki
Jump to navigation Jump to search
Line 60: Line 60:
 
11:08:59 INFO  loader          :: Finish - index OSP
 
11:08:59 INFO  loader          :: Finish - index OSP
 
11:08:59 INFO  loader          :: Time = 395.522,378 seconds : Triples = 5.253.753.313 : Rate = 13.283 /s
 
11:08:59 INFO  loader          :: Time = 395.522,378 seconds : Triples = 5.253.753.313 : Rate = 13.283 /s
 +
</source>
 +
= Scripts =
 +
== wikidata2jena ==
 +
<source lang='bash'>
 +
#!/bin/bash
 +
# WF 2020-05-10
 +
 +
# global settings
 +
jena=apache-jena-3.16.0
 +
tgz=$jena.tar.gz
 +
jenaurl=http://mirror.easyname.ch/apache/jena/binaries/$tgz
 +
base=/Volumes/Torterra/wikidata
 +
data=$base/data
 +
tdbloader=$jena/bin/tdb2.tdbloader
 +
 +
getjena() {
 +
# download
 +
if [ ! -f $tgz ]
 +
then
 +
  echo "downloading $tgz from $jenaurl"
 +
wget $jenaurl
 +
else
 +
  echo "$tgz already downloaded"
 +
fi
 +
# unpack
 +
if [ ! -d $jena ]
 +
then
 +
  echo "unpacking $jena from $tgz"
 +
tar xvzf $tgz
 +
else
 +
  echo "$jena already unpacked"
 +
fi
 +
# create data directory
 +
if [ ! -d $data ]
 +
then
 +
  echo "creating $data directory"
 +
  mkdir -p $data
 +
else
 +
  echo "$data directory already created"
 +
fi
 +
}
 +
 +
#
 +
# show the given timestamp
 +
#
 +
timestamp() {
 +
local msg="$1"
 +
local ts=$(date -u +"%Y-%m-%dT%H:%M:%SZ")
 +
echo "$msg at $ts"
 +
}
 +
 +
#
 +
# load data for the given data dir and input
 +
#
 +
loaddata() {
 +
local data="$1"
 +
local input="$2"
 +
  timestamp "start loading $input to $data"
 +
  $tdbloader --loader=parallel --loc "$data" "$input" > tdb2-out.log 2> tdb2-err.log
 +
timestamp "finished loading $input to $data"
 +
}
 +
 +
getjena
 +
export TMPDIR=$base/tmp
 +
if [ ! -d $TMPDIR ]
 +
then
 +
  echo "creating temporary directory $TMPDIR"
 +
  mkdir $TMPDIR
 +
else
 +
  echo "using temporary directory $TMPDIR"
 +
fi
 +
loaddata $data latest-truthy.nt
 
</source>
 
</source>

Revision as of 07:19, 26 July 2020

  1. added 4 TB SSD Samsung 860 EVO
  2. trying to replicate success story of https://issues.apache.org/jira/projects/JENA/issues/JENA-1909
  3. again trying truthy only but might retry later with "all"
  4. download of 24 G took some 90 min
  5. unzipping to some 670 G took some 4 h 30 min

Download and unpack

This download was done with the "truthy" dataset.

wget https://dumps.wikimedia.org/wikidatawiki/entities/latest-truthy.nt.bz2
--2020-07-15 15:24:25--  https://dumps.wikimedia.org/wikidatawiki/entities/latest-truthy.nt.bz2
Resolving dumps.wikimedia.org (dumps.wikimedia.org)... 2620::861:1:208:80:154:7, 208.80.154.7
Connecting to dumps.wikimedia.org (dumps.wikimedia.org)|2620::861:1:208:80:154:7|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 25868964531 (24G) [application/octet-stream]
Saving to: ‘latest-truthy.nt.bz2’
latest-truthy.nt.bz   2%[                    ] 546.01M  4.66MB/s    eta 85m 57s
ls -l latest-truthy.nt.bz2 
-rw-r--r--  1 wf  admin  25868964531 Jul 11 23:33 latest-truthy.nt.bz2
bzip2 -dk latest-truthy.nt.bz2
ls -l latest-truthy.nt
-rw-------  1 wf  admin  671598919680 Jul 15 21:15 latest-truthy.nt
zeus:wikidata wf$ls -l latest-truthy.nt
-rw-r--r--  1 wf  admin  671749317281 Jul 11 23:33 latest-truthy.nt
start and progress
nohup ./wikidata2jena&
tail -f tdb2-err.log 
21:16:57 INFO  loader          :: Loader = LoaderParallel
21:16:57 INFO  loader          :: Start: latest-truthy.nt
21:17:00 INFO  loader          :: Add: 500.000 latest-truthy.nt (Batch: 151.883 / Avg: 151.883)
21:17:02 INFO  loader          :: Add: 1.000.000 latest-truthy.nt (Batch: 243.309 / Avg: 187.020)
...
21:33:21 INFO  loader          :: Add: 100.000.000 latest-truthy.nt (Batch: 209.292 / Avg: 101.592)
21:33:21 INFO  loader          ::   Elapsed: 984,33 seconds [2020/07/15 21:33:21 MESZ]
...
22:41:36 INFO  loader          :: Add: 500.000.000 latest-truthy.nt (Batch: 54.153 / Avg: 98.446)
22:41:36 INFO  loader          ::   Elapsed: 5.078,89 seconds [2020/07/15 22:41:36 MESZ]
...
02:55:36 INFO  loader          :: Add: 1.000.000.000 latest-truthy.nt (Batch: 21.504 / Avg: 49.215)
02:55:36 INFO  loader          ::   Elapsed: 20.318,94 seconds [2020/07/16 02:55:36 MESZ]
...
13:47:17 INFO  loader          :: Add: 2.000.000.000 latest-truthy.nt (Batch: 32.036 / Avg: 33.658)
13:47:17 INFO  loader          ::   Elapsed: 59.420,03 seconds [2020/07/16 13:47:17 MESZ]
...
06:10:12 INFO  loader          :: Add: 3.000.000.000 latest-truthy.nt (Batch: 10.900 / Avg: 25.338)
06:10:13 INFO  loader          ::   Elapsed: 118.395,31 seconds [2020/07/17 06:10:12 MESZ]
...
09:33:14 INFO  loader          :: Add: 4.000.000.000 latest-truthy.nt (Batch: 11.790 / Avg: 18.435)
09:33:14 INFO  loader          ::   Elapsed: 216.976,77 seconds [2020/07/18 09:33:14 MESZ]
...
00:55:21 INFO  loader          :: Add: 5.000.000.000 latest-truthy.nt (Batch: 4.551 / Avg: 13.939)
00:55:21 INFO  loader          ::   Elapsed: 358.703,75 seconds [2020/07/20 00:55:21 MESZ]
...
11:02:06 INFO  loader          :: Add: 5.253.500.000 latest-truthy.nt (Batch: 10.555 / Avg: 13.296)
11:02:38 INFO  loader          :: Finished: latest-truthy.nt: 5.253.753.313 tuples in 395140,91s (Avg: 13.295)
11:05:27 INFO  loader          :: Finish - index SPO
11:08:24 INFO  loader          :: Finish - index POS
11:08:59 INFO  loader          :: Finish - index OSP
11:08:59 INFO  loader          :: Time = 395.522,378 seconds : Triples = 5.253.753.313 : Rate = 13.283 /s

Scripts

wikidata2jena

#!/bin/bash
# WF 2020-05-10

# global settings
# Jena release to use; the archive name and download URL are derived from it
jena="apache-jena-3.16.0"
tgz="${jena}.tar.gz"
jenaurl="http://mirror.easyname.ch/apache/jena/binaries/${tgz}"
# volume that holds the dump; the data directory lives below it
base="/Volumes/Torterra/wikidata"
data="${base}/data"
# loader executable inside the unpacked Jena distribution (relative path)
tdbloader="${jena}/bin/tdb2.tdbloader"

#
# make sure the Jena archive, the unpacked distribution and the data
# directory exist, creating whatever is missing
#
# Globals read: tgz, jenaurl, jena, data
# Returns: 0 when all three are in place,
#          1 as soon as the download, the unpacking or the mkdir fails
#
getjena() {
  # download
  if [ ! -f "$tgz" ]; then
    echo "downloading $tgz from $jenaurl"
    if ! wget "$jenaurl"; then
      echo "download of $jenaurl failed" >&2
      # the archive did not exist before this call, so anything present now is
      # a partial download; remove it so the next run does not report it as
      # "already downloaded"
      rm -f "$tgz"
      return 1
    fi
  else
    echo "$tgz already downloaded"
  fi
  # unpack
  if [ ! -d "$jena" ]; then
    echo "unpacking $jena from $tgz"
    if ! tar xvzf "$tgz"; then
      echo "unpacking $tgz failed" >&2
      return 1
    fi
  else
    echo "$jena already unpacked"
  fi
  # create data directory
  if [ ! -d "$data" ]; then
    echo "creating $data directory"
    if ! mkdir -p "$data"; then
      echo "could not create $data directory" >&2
      return 1
    fi
  else
    echo "$data directory already created"
  fi
}

#
# print the given message followed by the current UTC time
# (ISO 8601 format, e.g. 2020-07-15T19:16:57Z)
#
timestamp() {
  local label="$1"
  local now
  now=$(date -u +"%Y-%m-%dT%H:%M:%SZ")
  printf '%s at %s\n' "$label" "$now"
}

#
# load data for the given data dir and input
#
# Arguments: $1 - database location handed to the loader via --loc
#            $2 - input file to load
# Globals read: tdbloader (loader executable)
# Outputs: loader stdout goes to tdb2-out.log and stderr to tdb2-err.log in
#          the current directory; both files are truncated on every call
# Returns: 0 on success, otherwise the exit status of the loader
#
loaddata() {
  local data="$1"
  local input="$2"
  local rc=0
  timestamp "start loading $input to $data"
  "$tdbloader" --loader=parallel --loc "$data" "$input" > tdb2-out.log 2> tdb2-err.log || rc=$?
  if [ "$rc" -ne 0 ]; then
    # do not claim success after a failed load; hand the status to the caller
    timestamp "failed loading $input to $data (exit status $rc, see tdb2-err.log)" >&2
    return "$rc"
  fi
  timestamp "finished loading $input to $data"
}

# main flow: set up Jena and the data directory, prepare a temporary
# directory below $base, then run the load
if ! getjena; then
  echo "setting up jena failed" >&2
  exit 1
fi
# exported so that child processes inherit it
# NOTE(review): presumably the loader uses TMPDIR for scratch files - confirm
export TMPDIR="$base/tmp"
if [ ! -d "$TMPDIR" ]; then
  echo "creating temporary directory $TMPDIR"
  if ! mkdir -p "$TMPDIR"; then
    echo "could not create temporary directory $TMPDIR" >&2
    exit 1
  fi
else
  echo "using temporary directory $TMPDIR"
fi
loaddata "$data" latest-truthy.nt