Difference between revisions of "WikiData Import 2020-07-30"
Jump to navigation
Jump to search
Line 39: | Line 39: | ||
Sat Aug 1 10:52:57 CEST 2020 | Sat Aug 1 10:52:57 CEST 2020 | ||
</source> | </source> | ||
+ | = Scripts = | ||
+ | == wikidata2jena == | ||
+ | <source lang='bash'> | ||
+ | #!/bin/bash | ||
+ | # WF 2020-05-10 | ||
+ | # global settings | ||
+ | jena=apache-jena-3.16.0 | ||
+ | tgz=$jena.tar.gz | ||
+ | jenaurl=http://mirror.easyname.ch/apache/jena/binaries/$tgz | ||
+ | base=/Volumes/Torterra/wikidata2020-08-01 | ||
+ | data=$base/data | ||
+ | tdbloader=$jena/bin/tdb2.tdbloader | ||
+ | |||
+ | getjena() { | ||
+ | # download | ||
+ | if [ ! -f $tgz ] | ||
+ | then | ||
+ | echo "downloading $tgz from $jenaurl" | ||
+ | wget $jenaurl | ||
+ | else | ||
+ | echo "$tgz already downloaded" | ||
+ | fi | ||
+ | # unpack | ||
+ | if [ ! -d $jena ] | ||
+ | then | ||
+ | echo "unpacking $jena from $tgz" | ||
+ | tar xvzf $tgz | ||
+ | else | ||
+ | echo "$jena already unpacked" | ||
+ | fi | ||
+ | # create data directory | ||
+ | if [ ! -d $data ] | ||
+ | then | ||
+ | echo "creating $data directory" | ||
+ | mkdir -p $data | ||
+ | else | ||
+ | echo "$data directory already created" | ||
+ | fi | ||
+ | } | ||
+ | |||
+ | # | ||
+ | # show the given timestamp | ||
+ | # | ||
+ | timestamp() { | ||
+ | local msg="$1" | ||
+ | local ts=$(date -u +"%Y-%m-%dT%H:%M:%SZ") | ||
+ | echo "$msg at $ts" | ||
+ | } | ||
+ | |||
+ | # | ||
+ | # load data for the given data dir and input | ||
+ | # | ||
+ | loaddata() { | ||
+ | local data="$1" | ||
+ | local input="$2" | ||
+ | timestamp "start loading $input to $data" | ||
+ | $tdbloader --loader=parallel --loc "$data" "$input" > tdb2-out.log 2> tdb2-err.log | ||
+ | timestamp "finished loading $input to $data" | ||
+ | } | ||
+ | |||
+ | getjena | ||
+ | export TMPDIR=$base/tmp | ||
+ | if [ ! -d $TMPDIR ] | ||
+ | then | ||
+ | echo "creating temporary directory $TMPDIR" | ||
+ | mkdir $TMPDIR | ||
+ | else | ||
+ | echo "using temporary directory $TMPDIR" | ||
+ | fi | ||
+ | loaddata $data latest-all.nt | ||
+ | </source> | ||
[[Category:WikiData]] | [[Category:WikiData]] |
Revision as of 12:44, 1 August 2020
Download and unpack
This download was done with the "latest-all.nt" dataset.
wget https://dumps.wikimedia.org/wikidatawiki/entities/latest-all.nt.bz2
--2020-07-30 06:40:18-- https://dumps.wikimedia.org/wikidatawiki/entities/latest-all.nt.bz2
Resolving dumps.wikimedia.org (dumps.wikimedia.org)... 2620::861:1:208:80:154:7, 208.80.154.7
Connecting to dumps.wikimedia.org (dumps.wikimedia.org)|2620::861:1:208:80:154:7|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 118776910150 (111G) [application/octet-stream]
Saving to: ‘latest-all.nt.bz2’
latest-all.nt.bz2 0%[ ] 635.10M 5.02MB/s eta 6h 19m
...
latest-all.nt.bz2 100%[===================>] 110.62G 4.88MB/s in 6h 31m
2020-07-30 13:11:49 (4.82 MB/s) - ‘latest-all.nt.bz2’ saved [118776910150/118776910150]
bzip2 -dk latest-all.nt.bz2
ls -l latest-all.nt
-rw------- 1 wf admin 1980899328 Jul 30 17:56 latest-all.nt
# failed due to limited disk space
# retry next morning
nohup bzip2 -ckd latest-all.nt.bz2 > /Volumes/Torterra/wikidata2020-07-31/latest-all.nt&
bzip2 issue
The first bzip2 run failed (error output below) - see the retry command above for how bzip2 is used to extract from the rotating disk to the SSD
bzip2: I/O or other error, bailing out. Possible reason follows.
bzip2: No space left on device
Input file = latest-all.nt.bz2, output file = latest-all.nt
bzip2: Deleting output file latest-all.nt, if it exists.
Counting and copying
date;cp -p /Volumes/Torterra/wikidata2020-07-31/latest-all.nt .;date
Sat Aug 1 07:42:32 CEST 2020
Sat Aug 1 12:21:06 CEST 2020
date;wc -l latest-all.nt;date
Sat Aug 1 07:42:43 CEST 2020
13738317356 latest-all.nt
Sat Aug 1 10:52:57 CEST 2020
Scripts
wikidata2jena
#!/bin/bash
# WF 2020-05-10
#
# global settings - read by getjena, loaddata and the main part of the script
#
# name of the Apache Jena release; also the directory the archive unpacks to
jena="apache-jena-3.16.0"
# file name of the release archive
tgz="${jena}.tar.gz"
# url the release archive is downloaded from
jenaurl="http://mirror.easyname.ch/apache/jena/binaries/${tgz}"
# base directory of this import
base="/Volumes/Torterra/wikidata2020-08-01"
# directory handed to the loader as database location
data="${base}/data"
# path of the loader command (relative to the current directory)
tdbloader="${jena}/bin/tdb2.tdbloader"
#
# download and unpack Apache Jena and create the data directory
#
# globals read: tgz, jenaurl, jena, data
# each step is skipped when its result (archive file, unpacked
# directory, data directory) already exists
# returns: 0 on success, 1 as soon as one of the steps fails
#
getjena() {
  # download
  if [ ! -f "$tgz" ]
  then
    echo "downloading $tgz from $jenaurl"
    # stop here on failure - otherwise the unpack step below would run
    # on a missing archive and the function would still report success
    if ! wget "$jenaurl"
    then
      echo "downloading $tgz from $jenaurl failed" >&2
      return 1
    fi
  else
    echo "$tgz already downloaded"
  fi
  # unpack
  if [ ! -d "$jena" ]
  then
    echo "unpacking $jena from $tgz"
    if ! tar xvzf "$tgz"
    then
      echo "unpacking $jena from $tgz failed" >&2
      return 1
    fi
  else
    echo "$jena already unpacked"
  fi
  # create data directory
  if [ ! -d "$data" ]
  then
    echo "creating $data directory"
    # quoted so that a path containing spaces creates one directory only
    if ! mkdir -p "$data"
    then
      echo "creating $data directory failed" >&2
      return 1
    fi
  else
    echo "$data directory already created"
  fi
  return 0
}
#
# print the given message followed by " at " and the current
# UTC time formatted as YYYY-MM-DDThh:mm:ssZ
#
# params:
#   1: the message to show
#
timestamp() {
  local message="$1"
  local now
  now=$(date -u +"%Y-%m-%dT%H:%M:%SZ")
  printf '%s at %s\n' "$message" "$now"
}
#
# load data for the given data dir and input
#
# params:
#   1: data  - directory passed to the loader as --loc
#   2: input - file to load
# globals read: tdbloader (path of the loader command)
# output: start/finish messages via timestamp; the loader's stdout goes to
#   tdb2-out.log and its stderr to tdb2-err.log in the current directory
#   (both files are overwritten on every call)
# returns: the exit status of the loader
#
loaddata() {
  local data="$1"
  local input="$2"
  local rc=0
  timestamp "start loading $input to $data"
  # keep the loader's exit status - without this the function would return
  # the status of the final timestamp call and hide a failed load
  "$tdbloader" --loader=parallel --loc "$data" "$input" > tdb2-out.log 2> tdb2-err.log || rc=$?
  if [ "$rc" -eq 0 ]
  then
    timestamp "finished loading $input to $data"
  else
    timestamp "loading $input to $data failed with exit code $rc - see tdb2-err.log" >&2
  fi
  return "$rc"
}
#
# main: set up Jena, prepare the temporary directory and start the import
# of latest-all.nt (expected in the current directory) into $data
#
# do not start the load when the setup failed
getjena || exit 1
# NOTE(review): presumably TMPDIR is moved below $base so that temporary
# files of the loader end up on the same large volume - confirm
export TMPDIR="$base/tmp"
if [ ! -d "$TMPDIR" ]
then
  echo "creating temporary directory $TMPDIR"
  # -p: also works when $base itself does not exist yet
  if ! mkdir -p "$TMPDIR"
  then
    echo "creating temporary directory $TMPDIR failed" >&2
    exit 1
  fi
else
  echo "using temporary directory $TMPDIR"
fi
# last command: its status becomes the exit status of the script
loaddata "$data" latest-all.nt