Difference between revisions of "WikiData Import 2022-06-25"

From BITPlan Wiki
Jump to navigation Jump to search
 
(14 intermediate revisions by the same user not shown)
Line 1: Line 1:
{{PageSequence|prev=WikiData Import 2022-06-24|next=|category=WikiData|categoryIcon=cloud-download}}
+
{{PageSequence|prev=WikiData Import 2022-06-24|next=WikiData Import 2022-07-20|category=WikiData|categoryIcon=cloud-download}}
 
retry with more disk space
 
retry with more disk space
 +
{{Import
 +
|target=QLever
 +
|url=https://wiki.bitplan.com/index.php/WikiData_Import_2022-06-25
 +
|start=2022-06-25
 +
|end=2022-06-27
 +
|days=1.1
 +
|state=✅
 +
|cpu=Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
 +
|ram=128
 +
}}
 +
= Context =
 +
see {{Link|target=QLever/script}} as discussed in [https://github.com/ad-freiburg/qlever/issues/562 QLever Issue #562] for the script which makes reproducing this attempt easier.
 +
 +
see [https://github.com/ad-freiburg/qlever/discussions/668 QLever Discussions] for more details on this attempt series.
 +
 +
since https://github.com/ad-freiburg/qlever-control now has an official "qlever" script we have renamed the script
 +
that has the purpose to make the import attempts reproducible to '''qleverauto'''.
 +
 +
Beware of https://github.com/ad-freiburg/qlever-control/issues/4 - make sure ulimit -n is set! This attempt had to be restarted since setting the value within a script did not
 +
work.
 +
 
= Preparations =
 
= Preparations =
 
see {{Link|target=WikiData_Import_2022-06-24#Preparations}}
 
see {{Link|target=WikiData_Import_2022-06-24#Preparations}}
Line 44: Line 65:
 
wikidata dump download finished at So 26. Jun 01:24:35 CEST 2022 after 20332 seconds
 
wikidata dump download finished at So 26. Jun 01:24:35 CEST 2022 after 20332 seconds
 
</source>
 
</source>
 +
== qleverauto environment checks ==
 +
<source lang='bash' highlight='1,3'>
 +
./qleverauto -v
 +
qleverauto version : 1.29 $ : 2022/05/23 06:15:28 $
 +
./qleverauto -e
 +
needed software
 +
docker → /usr/bin/docker ✅
 +
top → /usr/bin/top ✅
 +
df → /usr/bin/df ✅
 +
jq → /usr/bin/jq ✅
 +
lsb_release → /usr/bin/lsb_release ✅
 +
free → /usr/bin/free ✅
 +
operating system
 +
No LSB modules are available.
 +
Distributor ID: Ubuntu
 +
Description: Ubuntu 20.04.4 LTS
 +
Release: 20.04
 +
Codename: focal
 +
docker version
 +
Docker version 20.10.16, build aa7e414
 +
memory
 +
              total        used        free      shared  buff/cache  available
 +
Mem:          125Gi      1,8Gi        30Gi        27Mi        93Gi      122Gi
 +
Swap:        2,0Gi        57Mi      1,9Gi
 +
diskspace
 +
/dev/sdb5      116G  25G  86G  23% /
 +
tmpfs            63G  16K  63G  1% /dev/shm
 +
/dev/sda1      3,6T  183G  3,3T  6% /hd/seel
 +
/dev/sdb1      511M  4,0K  511M  1% /boot/efi
 +
soft ulimit for files
 +
1048576
 +
</source>
 +
== wikidata files and index settings ==
 +
<source lang='bash' highlight='1,3'>
 +
wikidata$ ls -l
 +
total 93512520
 +
-rw-rw-r-- 1 wf wf 95427655992 Jun 23 10:50 latest-all.ttl.bz2
 +
-rw-rw-r-- 1 wf wf  329141448 Jun 25 01:28 latest-lexemes.ttl.bz2
 +
-rw-rw-r-- 1 wf wf        1188 Jun 26 08:08 Qleverfile
 +
drwxrwxr-x 2 wf wf        4096 Jun 24 10:09 RCS
 +
wf@sun:/hd/seel/qlever/wikidata$ vi wikidata.settings.json
 +
wf@sun:/hd/seel/qlever/wikidata$ rcsdiff ./Qleverfile
 +
===================================================================
 +
RCS file: ./RCS/Qleverfile,v
 +
retrieving revision 1.3
 +
diff -r1.3 ./Qleverfile
 +
wf@sun:/hd/seel/qlever/wikidata$ rcsdiff ./wikidata.settings.json
 +
===================================================================
 +
RCS file: ./RCS/wikidata.settings.json,v
 +
retrieving revision 1.2
 +
diff -r1.2 ./wikidata.settings.json
 +
</source>
 +
=== Qleverfile ===
 +
<source lang='bash' highlight='1'>
 +
cat Qleverfile
 +
# Qleverfile for folder /hd/seel/qlever
 +
# Automatically created on Sa 21. Mai 09:09:41 CEST 2022.
 +
# Modify or expand as you see fit.
 +
 +
# Indexer settings
 +
DB              = wikidata
 +
RDF_FILES        = "latest-all.ttl.bz2 latest-lexemes.ttl.bz2"
 +
CAT_FILES        = "bzcat ${RDF_FILES}"
 +
WITH_TEXT        = false
 +
SETTINGS_JSON    = '{ "languages-internal": ["en"], "prefixes-external": [ "<http://www.wikidata.org/entity/statement", "<http://www.wikidata.org/value", "<http://www.wikidata.org/reference" ], "locale": { "language": "en", "country": "US", "ignore-punctuation": true }, "ascii-prefixes-only": true, "num-triples-per-batch": 10000000 }'
 +
# Server settings
 +
HOSTNAME                      = sun.bitplan.com
 +
SERVER_PORT                    = 7001
 +
MEMORY_FOR_QUERIES            = 10
 +
CACHE_MAX_SIZE_GB              = 5
 +
CACHE_MAX_SIZE_GB_SINGLE_ENTRY = 1
 +
CACHE_MAX_NUM_ENTRIES          = 100
 +
 +
# QLever binaries
 +
QLEVER_BIN_DIR          = /hd/seel/qlever/qlever-code/build/
 +
USE_DOCKER              = false
 +
QLEVER_DOCKER_IMAGE    = adfreiburg/qlever
 +
QLEVER_DOCKER_CONTAINER = qlever.must_specify
 +
 +
# QLever UI
 +
QLEVERUI_PORT  = 7000
 +
QLEVERUI_DIR    = qlever-ui
 +
QLEVERUI_CONFIG = default
 +
</source>
 +
=== wikidata.settings.json ===
 +
<source lang='bash' highlight='1'>
 +
cat wikidata.settings.json
 +
{
 +
  "languages-internal": ["en"],
 +
  "prefixes-external": [
 +
    "<http://www.wikidata.org/entity/statement",
 +
    "<http://www.wikidata.org/value",
 +
    "<http://www.wikidata.org/reference"
 +
  ],
 +
  "locale": {
 +
  "language": "en",
 +
  "country": "US",
 +
  "ignore-punctuation": true
 +
  },
 +
  "ascii-prefixes-only": true,
 +
  "num-triples-per-batch" : 10000000
 +
}
 +
</source>
 +
= Indexer =
 +
== Relevant part of qleverauto script ==
 +
<source lang='bash'>
 +
#
 +
# build the wikidata index
 +
#
 +
wikidata_index() {
 +
  cd $QLEVER_HOME/wikidata
 +
  chmod o+w .
 +
  show_timing "creating wikidata index" "started"
 +
#  docker run -i --rm -v $QLEVER_HOME/qlever-indices/wikidata:/index --entrypoint bash $dockerimage  -c "cd /index && bzcat latest-all.ttl.bz2 latest-lexemes.ttl.bz2 | IndexBuilderMain -F ttl -f - -l -i wikidata -s wikidata.settings.json | tee wikidata.index-log.txt"
 +
  . ../qlever-control/qlever
 +
  check_installed IndexBuilderMain
 +
  qlever index
 +
  show_timing "creating wikidata index" "finished"
 +
}
 +
</source>
 +
=== Symbolic Link for IndexBuilder Main ===
 +
<source lang='bash' highlight='1,3'>
 +
~/bin$ ls -l IndexBuilderMain
 +
lrwxrwxrwx 1 wf wf 50 Mai 22 17:47 IndexBuilderMain -> /hd/seel/qlever/qlever-code/build/IndexBuilderMain
 +
which IndexBuilderMain
 +
/home/wf/bin/IndexBuilderMain
 +
</source>
 +
 +
== start ==
 +
 +
<source lang='bash' highlight='1,2,4-5'>
 +
ulimit -n 1000000 
 +
ulimit -a | grep '(-n)'
 +
open files                      (-n) 1000000
 +
nohup ./qleverauto -wi&
 +
tail -f nohup.out
 +
bzcat latest-all.ttl.bz2 latest-lexemes.ttl.bz2 | IndexBuilderMain -F ttl -K wikidata -f - -i wikidata -s wikidata.settings.json | tee wikidata.index-log.txt
 +
 +
2022-06-26 08:14:31.955 - INFO:  QLever IndexBuilder, compiled on Jun 24 2022 09:48:21
 +
2022-06-26 08:14:31.955 - INFO:  You specified the input format: TTL
 +
2022-06-26 08:14:31.956 - INFO:  You specified "locale = en_US" and "ignore-punctuation = 1"
 +
2022-06-26 08:14:31.956 - INFO:  You specified "ascii-prefixes-only = true", which enables faster parsing for well-behaved TTL files
 +
2022-06-26 08:14:31.956 - INFO:  You specified "num-triples-per-batch = 10,000,000", choose a lower value if the index builder runs out of memory
 +
2022-06-26 08:14:31.956 - INFO:  Integers that cannot be represented by QLever will throw an exception (this is the default behavior)
 +
2022-06-26 08:14:31.956 - INFO:  Processing input triples from /dev/stdin ...
 +
2022-06-26 08:16:54.205 - INFO:  Input triples processed: 100,000,000
 +
2022-06-26 08:19:03.563 - INFO:  Input triples processed: 200,000,000
 +
...
 +
2022-06-27 04:05:48.950 - INFO:  Creating a pair of index permutations ...
 +
2022-06-27 06:00:17.967 - INFO:  Statistics for SPO: #relations = 2,808,258,381, #blocks = 30,045, #triples = 23,623,846,556
 +
2022-06-27 06:00:17.967 - INFO:  Exchanging multiplicities for SPO and SOP ...
 +
2022-06-27 06:42:31.181 - INFO:  Writing meta data for SPO and SOP ...
 +
2022-06-27 06:42:47.139 - INFO:  Number of distinct patterns: 7,168,422
 +
2022-06-27 06:42:47.143 - INFO:  Number of subjects with pattern: 2,808,258,381 [all]
 +
2022-06-27 06:42:47.143 - INFO:  Total number of distinct subject-predicate pairs: 11,082,607,100
 +
2022-06-27 06:42:47.143 - INFO:  Average number of predicates per subject: 3.9
 +
2022-06-27 06:42:47.145 - INFO:  Average number of subjects per predicate: 236,545
 +
2022-06-27 08:07:24.260 - INFO:  Creating a pair of index permutations ...
 +
2022-06-27 09:52:23.085 - INFO:  Statistics for OSP: #relations = 3,179,186,092, #blocks = 39,688, #triples = 23,623,846,556
 +
2022-06-27 09:52:23.101 - INFO:  Statistics for OPS: #relations = 3,179,186,092, #blocks = 39,688, #triples = 23,623,846,556
 +
2022-06-27 09:52:23.101 - INFO:  Exchanging multiplicities for OSP and OPS ...
 +
2022-06-27 10:39:15.273 - INFO:  Writing meta data for OSP and OPS ...
 +
2022-06-27 10:39:20.510 - INFO:  Index build completed
 +
</source>
 +
 +
= Progress =
 +
[[File:WikidataImportQLever2022-06-25.jpg|800px]]

Latest revision as of 19:23, 15 May 2023

retry with more disk space

Import
state  ✅
url  https://wiki.bitplan.com/index.php/WikiData_Import_2022-06-25
target  QLever
start  2022-06-25
end  2022-06-27
days  1.1
os  
cpu  Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
ram  128
triples  
comment  

Context

see QLever/script as discussed in QLever Issue #562 for the script which makes reproducing this attempt easier.

see QLever Discussions for more details on this attempt series.

since https://github.com/ad-freiburg/qlever-control now has an official "qlever" script, we have renamed the script whose purpose is to make the import attempts reproducible to qleverauto.

Beware of https://github.com/ad-freiburg/qlever-control/issues/4 - make sure ulimit -n is set! This attempt had to be restarted since setting the value within a script did not work.

Preparations

see WikiData_Import_2022-06-24#Preparations

Wikidata data download

df
Filesystem      1K-blocks      Used  Available Use% Mounted on
/dev/sda1      3844660232 97397140 3551895876   3% /hd/seel
 ./qleverauto -wd
downloading wikidata lexemes:latest-lexemes.ttl.bz2 ... please wait typically 3min ...
wikidata lexemes download started at Sa 25. Jun 19:44:32 CEST 2022
--2022-06-25 19:44:32--  https://dumps.wikimedia.org/wikidatawiki/entities//latest-lexemes.ttl.bz2
Resolving dumps.wikimedia.org (dumps.wikimedia.org)... 2620:0:861:1:208:80:154:7, 208.80.154.7
Connecting to dumps.wikimedia.org (dumps.wikimedia.org)|2620:0:861:1:208:80:154:7|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 329141448 (314M) [application/octet-stream]
Saving to: ‘latest-lexemes.ttl.bz2’

latest-lexemes.ttl.bz2       10%[===>                                        ]  34,29M  4,60MB/s    eta 64s 
...
latest-lexemes.ttl.bz2      100%[===========================================>] 313,89M  4,56MB/s    in 70s     

2022-06-25 19:45:43 (4,49 MB/s) - ‘latest-lexemes.ttl.bz2’ saved [329141448/329141448]

wikidata lexemes download finished at Sa 25. Jun 19:45:43 CEST 2022 after 71 seconds
downloading wikidata dump:latest-all.ttl.bz2 ... please wait typically 6hours ...
wikidata dump download started at Sa 25. Jun 19:45:43 CEST 2022
--2022-06-25 19:45:43--  https://dumps.wikimedia.org/wikidatawiki/entities//latest-all.ttl.bz2
Resolving dumps.wikimedia.org (dumps.wikimedia.org)... 2620:0:861:1:208:80:154:7, 208.80.154.7
Connecting to dumps.wikimedia.org (dumps.wikimedia.org)|2620:0:861:1:208:80:154:7|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 95427655992 (89G) [application/octet-stream]
Saving to: ‘latest-all.ttl.bz2’

latest-all.ttl.bz2            0%[                                            ]  50,66M  4,42MB/s    eta 5h 38m
...
latest-all.ttl.bz2          100%[===========================================>]  88,87G  5,01MB/s    in 5h 38m  

2022-06-26 01:24:35 (4,48 MB/s) - ‘latest-all.ttl.bz2’ saved [95427655992/95427655992]

wikidata dump download finished at So 26. Jun 01:24:35 CEST 2022 after 20332 seconds

qleverauto environment checks

./qleverauto -v
qleverauto version : 1.29 $ : 2022/05/23 06:15:28 $
./qleverauto -e
needed software
docker → /usr/bin/docker ✅
top → /usr/bin/top ✅
df → /usr/bin/df ✅
jq → /usr/bin/jq ✅
lsb_release → /usr/bin/lsb_release ✅
free → /usr/bin/free ✅
operating system
No LSB modules are available.
Distributor ID:	Ubuntu
Description:	Ubuntu 20.04.4 LTS
Release:	20.04
Codename:	focal
docker version
Docker version 20.10.16, build aa7e414
memory
              total        used        free      shared  buff/cache   available
Mem:          125Gi       1,8Gi        30Gi        27Mi        93Gi       122Gi
Swap:         2,0Gi        57Mi       1,9Gi
diskspace
/dev/sdb5       116G   25G   86G  23% /
tmpfs            63G   16K   63G   1% /dev/shm
/dev/sda1       3,6T  183G  3,3T   6% /hd/seel
/dev/sdb1       511M  4,0K  511M   1% /boot/efi
soft ulimit for files
1048576

wikidata files and index settings

wikidata$ ls -l
total 93512520
-rw-rw-r-- 1 wf wf 95427655992 Jun 23 10:50 latest-all.ttl.bz2
-rw-rw-r-- 1 wf wf   329141448 Jun 25 01:28 latest-lexemes.ttl.bz2
-rw-rw-r-- 1 wf wf        1188 Jun 26 08:08 Qleverfile
drwxrwxr-x 2 wf wf        4096 Jun 24 10:09 RCS
wf@sun:/hd/seel/qlever/wikidata$ vi wikidata.settings.json
wf@sun:/hd/seel/qlever/wikidata$ rcsdiff ./Qleverfile 
===================================================================
RCS file: ./RCS/Qleverfile,v
retrieving revision 1.3
diff -r1.3 ./Qleverfile
wf@sun:/hd/seel/qlever/wikidata$ rcsdiff ./wikidata.settings.json 
===================================================================
RCS file: ./RCS/wikidata.settings.json,v
retrieving revision 1.2
diff -r1.2 ./wikidata.settings.json

Qleverfile

cat Qleverfile 
# Qleverfile for folder /hd/seel/qlever
# Automatically created on Sa 21. Mai 09:09:41 CEST 2022.
# Modify or expand as you see fit.

# Indexer settings
DB               = wikidata 
RDF_FILES        = "latest-all.ttl.bz2 latest-lexemes.ttl.bz2"
CAT_FILES        = "bzcat ${RDF_FILES}"
WITH_TEXT        = false
SETTINGS_JSON    = '{ "languages-internal": ["en"], "prefixes-external": [ "<http://www.wikidata.org/entity/statement", "<http://www.wikidata.org/value", "<http://www.wikidata.org/reference" ], "locale": { "language": "en", "country": "US", "ignore-punctuation": true }, "ascii-prefixes-only": true, "num-triples-per-batch": 10000000 }'
# Server settings
HOSTNAME                       = sun.bitplan.com
SERVER_PORT                    = 7001
MEMORY_FOR_QUERIES             = 10
CACHE_MAX_SIZE_GB              = 5
CACHE_MAX_SIZE_GB_SINGLE_ENTRY = 1
CACHE_MAX_NUM_ENTRIES          = 100

# QLever binaries
QLEVER_BIN_DIR          = /hd/seel/qlever/qlever-code/build/ 
USE_DOCKER              = false
QLEVER_DOCKER_IMAGE     = adfreiburg/qlever
QLEVER_DOCKER_CONTAINER = qlever.must_specify

# QLever UI
QLEVERUI_PORT   = 7000
QLEVERUI_DIR    = qlever-ui
QLEVERUI_CONFIG = default

wikidata.settings.json

cat wikidata.settings.json 
{
  "languages-internal": ["en"],
  "prefixes-external": [
    "<http://www.wikidata.org/entity/statement",
    "<http://www.wikidata.org/value",
    "<http://www.wikidata.org/reference"
  ],
  "locale": {
	  "language": "en",
	  "country": "US",
	  "ignore-punctuation": true
  },
  "ascii-prefixes-only": true,
  "num-triples-per-batch" : 10000000
}

Indexer

Relevant part of qleverauto script

#
# build the wikidata index
#
# Globals:  QLEVER_HOME (read) - directory that holds the wikidata/ and
#           qlever-control/ subdirectories used below
# Returns:  1 if $QLEVER_HOME/wikidata cannot be entered, otherwise the
#           status of the final show_timing call
#
wikidata_index() {
   # Quote the path (it may contain spaces) and stop if it cannot be entered:
   # otherwise chmod and the index build would run in the caller's directory.
   cd "$QLEVER_HOME/wikidata" || return 1
   # NOTE(review): presumably lets a non-owner process (e.g. the docker
   # variant kept below for reference) write index files here - confirm
   chmod o+w .
   show_timing "creating wikidata index" "started"
#   docker run -i --rm -v $QLEVER_HOME/qlever-indices/wikidata:/index --entrypoint bash $dockerimage  -c "cd /index && bzcat latest-all.ttl.bz2 latest-lexemes.ttl.bz2 | IndexBuilderMain -F ttl -f - -l -i wikidata -s wikidata.settings.json | tee wikidata.index-log.txt"
   # source the qlever-control script so that the "qlever" command is
   # available in this shell
   . ../qlever-control/qlever
   check_installed IndexBuilderMain
   qlever index
   show_timing "creating wikidata index" "finished"
}

Symbolic Link for IndexBuilder Main

~/bin$ ls -l IndexBuilderMain 
lrwxrwxrwx 1 wf wf 50 Mai 22 17:47 IndexBuilderMain -> /hd/seel/qlever/qlever-code/build/IndexBuilderMain
which IndexBuilderMain 
/home/wf/bin/IndexBuilderMain

start

ulimit -n 1000000  
ulimit -a | grep '(-n)'
open files                      (-n) 1000000
nohup ./qleverauto -wi&
tail -f nohup.out
bzcat latest-all.ttl.bz2 latest-lexemes.ttl.bz2 | IndexBuilderMain -F ttl -K wikidata -f - -i wikidata -s wikidata.settings.json | tee wikidata.index-log.txt

2022-06-26 08:14:31.955	- INFO:  QLever IndexBuilder, compiled on Jun 24 2022 09:48:21
2022-06-26 08:14:31.955	- INFO:  You specified the input format: TTL
2022-06-26 08:14:31.956	- INFO:  You specified "locale = en_US" and "ignore-punctuation = 1"
2022-06-26 08:14:31.956	- INFO:  You specified "ascii-prefixes-only = true", which enables faster parsing for well-behaved TTL files
2022-06-26 08:14:31.956	- INFO:  You specified "num-triples-per-batch = 10,000,000", choose a lower value if the index builder runs out of memory
2022-06-26 08:14:31.956	- INFO:  Integers that cannot be represented by QLever will throw an exception (this is the default behavior)
2022-06-26 08:14:31.956	- INFO:  Processing input triples from /dev/stdin ...
2022-06-26 08:16:54.205	- INFO:  Input triples processed: 100,000,000
2022-06-26 08:19:03.563	- INFO:  Input triples processed: 200,000,000
...
2022-06-27 04:05:48.950	- INFO:  Creating a pair of index permutations ... 
2022-06-27 06:00:17.967	- INFO:  Statistics for SPO: #relations = 2,808,258,381, #blocks = 30,045, #triples = 23,623,846,556
2022-06-27 06:00:17.967	- INFO:  Exchanging multiplicities for SPO and SOP ...
2022-06-27 06:42:31.181	- INFO:  Writing meta data for SPO and SOP ...
2022-06-27 06:42:47.139	- INFO:  Number of distinct patterns: 7,168,422
2022-06-27 06:42:47.143	- INFO:  Number of subjects with pattern: 2,808,258,381 [all]
2022-06-27 06:42:47.143	- INFO:  Total number of distinct subject-predicate pairs: 11,082,607,100
2022-06-27 06:42:47.143	- INFO:  Average number of predicates per subject: 3.9
2022-06-27 06:42:47.145	- INFO:  Average number of subjects per predicate: 236,545
2022-06-27 08:07:24.260	- INFO:  Creating a pair of index permutations ... 
2022-06-27 09:52:23.085	- INFO:  Statistics for OSP: #relations = 3,179,186,092, #blocks = 39,688, #triples = 23,623,846,556
2022-06-27 09:52:23.101	- INFO:  Statistics for OPS: #relations = 3,179,186,092, #blocks = 39,688, #triples = 23,623,846,556
2022-06-27 09:52:23.101	- INFO:  Exchanging multiplicities for OSP and OPS ...
2022-06-27 10:39:15.273	- INFO:  Writing meta data for OSP and OPS ...
2022-06-27 10:39:20.510	- INFO:  Index build completed

Progress

WikidataImportQLever2022-06-25.jpg