Difference between revisions of "WikiData Import 2020-07-30"

From BITPlan Wiki
Jump to navigation Jump to search
Line 39: Line 39:
 
Sat Aug  1 10:52:57 CEST 2020
 
Sat Aug  1 10:52:57 CEST 2020
 
</source>
 
</source>
 +
= Scripts =
 +
== wikidata2jena ==
 +
<source lang='bash'>
 +
#!/bin/bash
 +
# WF 2020-05-10
  
 +
# global settings
 +
jena=apache-jena-3.16.0
 +
tgz=$jena.tar.gz
 +
jenaurl=http://mirror.easyname.ch/apache/jena/binaries/$tgz
 +
base=/Volumes/Torterra/wikidata2020-08-01
 +
data=$base/data
 +
tdbloader=$jena/bin/tdb2.tdbloader
 +
 +
getjena() {
 +
# download
 +
if [ ! -f $tgz ]
 +
then
 +
  echo "downloading $tgz from $jenaurl"
 +
wget $jenaurl
 +
else
 +
  echo "$tgz already downloaded"
 +
fi
 +
# unpack
 +
if [ ! -d $jena ]
 +
then
 +
  echo "unpacking $jena from $tgz"
 +
tar xvzf $tgz
 +
else
 +
  echo "$jena already unpacked"
 +
fi
 +
# create data directory
 +
if [ ! -d $data ]
 +
then
 +
  echo "creating $data directory"
 +
  mkdir -p $data
 +
else
 +
  echo "$data directory already created"
 +
fi
 +
}
 +
 +
#
 +
# show the given timestamp
 +
#
 +
timestamp() {
 +
local msg="$1"
 +
local ts=$(date -u +"%Y-%m-%dT%H:%M:%SZ")
 +
echo "$msg at $ts"
 +
}
 +
 +
#
 +
# load data for the given data dir and input
 +
#
 +
loaddata() {
 +
local data="$1"
 +
local input="$2"
 +
  timestamp "start loading $input to $data"
 +
  $tdbloader --loader=parallel --loc "$data" "$input" > tdb2-out.log 2> tdb2-err.log
 +
timestamp "finished loading $input to $data"
 +
}
 +
 +
getjena
 +
export TMPDIR=$base/tmp
 +
if [ ! -d $TMPDIR ]
 +
then
 +
  echo "creating temporary directory $TMPDIR"
 +
  mkdir $TMPDIR
 +
else
 +
  echo "using temporary directory $TMPDIR"
 +
fi
 +
loaddata $data latest-all.nt
 +
</source>
 
[[Category:WikiData]]
 
[[Category:WikiData]]

Revision as of 13:44, 1 August 2020

Download and unpack

This download was done with the "latest-all.nt" dataset.

wget https://dumps.wikimedia.org/wikidatawiki/entities/latest-all.nt.bz2

--2020-07-30 06:40:18--  https://dumps.wikimedia.org/wikidatawiki/entities/latest-all.nt.bz2
Resolving dumps.wikimedia.org (dumps.wikimedia.org)... 2620::861:1:208:80:154:7, 208.80.154.7
Connecting to dumps.wikimedia.org (dumps.wikimedia.org)|2620::861:1:208:80:154:7|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 118776910150 (111G) [application/octet-stream]
Saving to: ‘latest-all.nt.bz2’
latest-all.nt.bz2     0%[                    ] 635.10M  5.02MB/s    eta 6h 19m
...
latest-all.nt.bz2   100%[===================>] 110.62G  4.88MB/s    in 6h 31m  
2020-07-30 13:11:49 (4.82 MB/s) - ‘latest-all.nt.bz2’ saved [118776910150/118776910150]
bzip2 -dk latest-all.nt.bz2
ls -l latest-all.nt
-rw-------  1 wf  admin  1980899328 Jul 30 17:56 latest-all.nt
# failed due to limited disk space
# retry next morning
nohup bzip2 -ckd latest-all.nt.bz2 > /Volumes/Torterra/wikidata2020-07-31/latest-all.nt&

bzip2 issue

The first bzip2 run failed - see the retry for how bzip2 is used to extract from rotating disk to SSD.

bzip2: I/O or other error, bailing out.  Possible reason follows.
bzip2: No space left on device
	Input file = latest-all.nt.bz2, output file = latest-all.nt
bzip2: Deleting output file latest-all.nt, if it exists.

Counting and copying

date;cp -p /Volumes/Torterra/wikidata2020-07-31/latest-all.nt .;date
Sat Aug  1 07:42:32 CEST 2020
Sat Aug  1 12:21:06 CEST 2020
date;wc -l latest-all.nt;date
Sat Aug  1 07:42:43 CEST 2020
 13738317356 latest-all.nt
Sat Aug  1 10:52:57 CEST 2020

Scripts

wikidata2jena

#!/bin/bash
# WF 2020-05-10

# global settings: Jena release, download URL and storage layout
jena="apache-jena-3.16.0"
tgz="${jena}.tar.gz"
jenaurl="http://mirror.easyname.ch/apache/jena/binaries/${tgz}"
base="/Volumes/Torterra/wikidata2020-08-01"
data="${base}/data"
tdbloader="${jena}/bin/tdb2.tdbloader"

# Make sure the Jena distribution and the data directory are available:
# download the archive if it is missing, unpack it if the distribution
# directory is missing and create the data directory if it is missing.
# Globals read: tgz, jenaurl, jena, data
# Returns:      0 on success, non-zero if download, unpack or mkdir fails
getjena() {
  # download
  if [ ! -f "$tgz" ]
  then
    echo "downloading $tgz from $jenaurl"
    # fetch into a temporary name so that an interrupted download is not
    # mistaken for a complete archive on the next run
    if ! wget -O "$tgz.part" "$jenaurl"
    then
      echo "download of $jenaurl failed" >&2
      rm -f -- "$tgz.part"
      return 1
    fi
    mv -- "$tgz.part" "$tgz" || return 1
  else
    echo "$tgz already downloaded"
  fi
  # unpack
  if [ ! -d "$jena" ]
  then
    echo "unpacking $jena from $tgz"
    if ! tar xvzf "$tgz"
    then
      echo "unpacking $tgz failed" >&2
      return 1
    fi
  else
    echo "$jena already unpacked"
  fi
  # create data directory
  if [ ! -d "$data" ]
  then
    echo "creating $data directory"
    # quoted so that a path containing spaces yields exactly one directory
    mkdir -p -- "$data" || return 1
  else
    echo "$data directory already created"
  fi
}

#
# print the given message followed by the current UTC time
# (ISO 8601 format, e.g. 2020-08-01T08:52:57Z)
#   $1 - message to print in front of the time
#
timestamp() {
  local text="$1"
  local now
  now=$(date -u +"%Y-%m-%dT%H:%M:%SZ")
  printf '%s at %s\n' "$text" "$now"
}

#
# load data for the given data dir and input
#   $1 - directory passed to the loader via --loc
#   $2 - input file to load
# Globals read: tdbloader (the loader command to run)
# Output: loader stdout goes to tdb2-out.log, loader stderr to tdb2-err.log
#         (both in the current directory, overwritten on each call)
# Returns: the exit status of the loader
#
loaddata() {
  local data="$1"
  local input="$2"
  local rc=0
  timestamp "start loading $input to $data"
  # keep the loader's exit status instead of letting the following
  # timestamp call overwrite it
  "$tdbloader" --loader=parallel --loc "$data" "$input" > tdb2-out.log 2> tdb2-err.log || rc=$?
  if [ "$rc" -eq 0 ]
  then
    timestamp "finished loading $input to $data"
  else
    timestamp "failed loading $input to $data with exit status $rc - see tdb2-err.log" >&2
  fi
  return "$rc"
}

# main sequence: prepare Jena and the data directory, set up the
# temporary directory below $base and start the import

# do not start a load when the preparation failed
getjena || exit 1
export TMPDIR="$base/tmp"
if [ ! -d "$TMPDIR" ]
then
  echo "creating temporary directory $TMPDIR"
  mkdir -- "$TMPDIR" || exit 1
else
  echo "using temporary directory $TMPDIR"
fi
# make the script's exit status reflect the result of the load
loaddata "$data" latest-all.nt || exit $?