Difference between revisions of "Wikidata Import 2023-04-26"
(→Import) |
|||
(112 intermediate revisions by the same user not shown) | |||
Line 1: | Line 1: | ||
− | {{PageSequence|prev=Wikidata Import 2023-04-18|next=Wikidata Import 2023- | + | {{PageSequence|prev=Wikidata Import 2023-04-18|next=Wikidata Import 2023-05-03|category=Wikidata|categoryIcon=cloud-download}} |
+ | |||
+ | =Import= | ||
+ | |||
+ | {{Import | ||
+ | |target=blazegraph | ||
+ | |state=❌ | ||
+ | |start=2023-04-26 | ||
+ | |os=Ubuntu 22.04.2 LTS | ||
+ | |cpu=Intel(R) Xeon(R) Gold 6326 CPU @ 2.90GHz | ||
+ | |url=https://wiki.bitplan.com/index.php/Wikidata_Import_2023-04-26 | ||
+ | |ram=256 | ||
+ | |triples=14.7 | ||
+ | |storemode=property | ||
+ | |comment=target disk is a rotating disk which is an order of magnitude slower see https://github.com/blazegraph/database/wiki/IOOptimization | ||
+ | }} | ||
+ | |||
+ | = Download ~9 hours = | ||
== Download Options == | == Download Options == | ||
https://dumps.wikimedia.org/wikidatawiki/entities | https://dumps.wikimedia.org/wikidatawiki/entities | ||
Line 18: | Line 35: | ||
latest-truthy.nt.gz 21-Apr-2023 14:24 59704444949 | latest-truthy.nt.gz 21-Apr-2023 14:24 59704444949 | ||
</pre> | </pre> | ||
+ | == download result == | ||
+ | <source lang='bash' highlight='1'> | ||
+ | ls -l latest*.gz | ||
+ | -rw-rw-r-- 1 wf wf 123942927864 Apr 26 10:18 latest-all.ttl.gz | ||
+ | -rw-rw-r-- 1 wf wf 540610049 Apr 22 01:25 latest-lexemes.ttl.gz | ||
+ | </source> | ||
== download script == | == download script == | ||
<source lang='bash'> | <source lang='bash'> | ||
Line 92: | Line 115: | ||
2023-04-26 15:40:39 (4.27 MB/s) - ‘latest-lexemes.ttl.gz’ saved [540610049/540610049] | 2023-04-26 15:40:39 (4.27 MB/s) - ‘latest-lexemes.ttl.gz’ saved [540610049/540610049] | ||
</source> | </source> | ||
− | ==== latest-lexemes.ttl.bz2 {{Done}} ==== | + | ==== latest-lexemes.ttl.bz2 433401231 1m45s {{Done}} ==== |
===== attempt by script ❌ ===== | ===== attempt by script ❌ ===== | ||
<source lang='bash'> | <source lang='bash'> | ||
Line 113: | Line 136: | ||
latest-lexemes.ttl. 9%[> ] 38.66M 4.19MB/s eta 1m 53 | latest-lexemes.ttl. 9%[> ] 38.66M 4.19MB/s eta 1m 53 | ||
+ | latest-lexemes.ttl. 100%[===================>] 413.32M 4.25MB/s in 1m 45s | ||
+ | |||
+ | 2023-04-28 13:45:57 (3.93 MB/s) - ‘latest-lexemes.ttl.bz2’ saved [433401231/433401231] | ||
+ | </source> | ||
+ | |||
+ | = Munging ~29 h = | ||
+ | |||
+ | * https://github.com/wikimedia/wikidata-query-rdf/blob/master/docs/getting-started.md | ||
+ | * https://github.com/wikimedia/wikidata-query-deploy/blob/master/munge.sh | ||
+ | == Preparation ~20-30 min == | ||
+ | === Needed installs and settings === | ||
+ | <source lang='bash' highlight='1-4'> | ||
+ | sudo apt install openjdk-11-jdk-headless | ||
+ | sudo apt install maven | ||
+ | export JAVA_HOME=$(update-alternatives --query javadoc | grep Value: | head -n1 | sed 's/Value: //' | sed 's@bin/javadoc$@@') | ||
+ | echo $JAVA_HOME | ||
+ | /usr/lib/jvm/java-11-openjdk-amd64/ | ||
+ | </source> | ||
+ | |||
+ | === clone and package === | ||
+ | mvn package had to be started twice: on the first try JAVA_HOME was not set, so javadoc was not available | ||
+ | <source lang='bash' highlight='1,7-8'> | ||
+ | git clone https://gerrit.wikimedia.org/r/wikidata/query/rdf wikidata-query-rdf | ||
+ | Cloning into 'wikidata-query-rdf'... | ||
+ | remote: Counting objects: 111, done | ||
+ | remote: Total 26684 (delta 0), reused 26684 (delta 0) | ||
+ | Receiving objects: 100% (26684/26684), 4.84 MiB | 3.23 MiB/s, done. | ||
+ | Resolving deltas: 100% (13928/13928), done. | ||
+ | cd wikidata-query-rdf/ | ||
+ | mvn package | ||
+ | [INFO] Building jar: /home/wf/wikidata-query-rdf/common/target/wikidata-query-common-0.3.124-SNAPSHOT.jar | ||
+ | [INFO] | ||
+ | [INFO] --- maven-javadoc-plugin:3.2.0:jar (attach-javadocs) @ common --- | ||
+ | [INFO] ------------------------------------------------------------------------ | ||
+ | [INFO] Reactor Summary for Wikidata Query Service 0.3.124-SNAPSHOT: | ||
+ | [INFO] | ||
+ | [INFO] Wikidata Query Service ............................. SUCCESS [ 19.204 s] | ||
+ | ... | ||
+ | [INFO] --- maven-assembly-plugin:3.3.0:single (default) @ service --- | ||
+ | [INFO] Reading assembly descriptor: src/assembly/dist.xml | ||
+ | [INFO] Building tar: /home/wf/wikidata-query-rdf/dist/target/service-0.3.124-SNAPSHOT-dist.tar.gz | ||
+ | [INFO] ------------------------------------------------------------------------ | ||
+ | [INFO] Reactor Summary for Wikidata Query Service 0.3.124-SNAPSHOT: | ||
+ | [INFO] | ||
+ | [INFO] Wikidata Query Service ............................. SUCCESS [ 1.904 s] | ||
+ | [INFO] Shared code ........................................ SUCCESS [ 4.798 s] | ||
+ | [INFO] Wikidata Query RDF Testing Tools ................... SUCCESS [ 15.411 s] | ||
+ | [INFO] Jetty logging dependencies ......................... SUCCESS [ 37.577 s] | ||
+ | [INFO] Blazegraph extension to improve performance for Wikibase SUCCESS [03:14 min] | ||
+ | [INFO] Blazegraph Service Package ......................... SUCCESS [ 33.788 s] | ||
+ | [INFO] Wikidata Query RDF Tools ........................... SUCCESS [01:13 min] | ||
+ | [INFO] Wikidata Query Service Streaming Updater - Common .. SUCCESS [ 4.758 s] | ||
+ | [INFO] Wikidata Query Service Streaming Updater - Producer SUCCESS [04:34 min] | ||
+ | [INFO] Wikidata Query Service Streaming Updater - Consumer SUCCESS [ 7.982 s] | ||
+ | [INFO] MediaWiki OAuth 1.0a Proxy Service ................. SUCCESS [ 37.591 s] | ||
+ | [INFO] rdf-spark-tools .................................... SUCCESS [09:44 min] | ||
+ | [INFO] Wikibase RDF Query Service ......................... SUCCESS [ 10.378 s] | ||
+ | [INFO] ------------------------------------------------------------------------ | ||
+ | [INFO] BUILD SUCCESS | ||
+ | [INFO] ------------------------------------------------------------------------ | ||
+ | [INFO] Total time: 21:21 min | ||
+ | [INFO] Finished at: 2023-04-29T14:05:59+02:00 | ||
+ | [INFO] ------------------------------------------------------------------------ | ||
+ | |||
+ | </source> | ||
+ | === check dist === | ||
+ | <source lang='bash' highlight='1-2'> | ||
+ | cd dist/target | ||
+ | ~/wikidata-query-rdf/dist/target$ tar tvfz service-0.3.124-SNAPSHOT-dist.tar.gz | ||
+ | drwxrwxr-x wf/wf 0 2023-04-29 13:32 service-0.3.124-SNAPSHOT/ | ||
+ | -rw-rw-r-- wf/wf 1170 2023-04-29 13:32 service-0.3.124-SNAPSHOT/prefixes-sdc.conf | ||
+ | -rwxrwxr-x wf/wf 1277 2023-04-29 13:32 service-0.3.124-SNAPSHOT/wcqs-data-reload.sh | ||
+ | -rwxrwxr-x wf/wf 2656 2023-04-29 13:32 service-0.3.124-SNAPSHOT/runUpdate.sh | ||
+ | -rwxrwxr-x wf/wf 599 2023-04-29 13:32 service-0.3.124-SNAPSHOT/createNamespace.sh | ||
+ | -rw-rw-r-- wf/wf 1483 2023-04-29 13:32 service-0.3.124-SNAPSHOT/default.properties | ||
+ | -rwxrwxr-x wf/wf 6470 2023-04-29 13:32 service-0.3.124-SNAPSHOT/runBlazegraph.sh | ||
+ | -rwxrwxr-x wf/wf 1345 2023-04-29 13:32 service-0.3.124-SNAPSHOT/loadRestAPI.sh | ||
+ | -rwxrwxr-x wf/wf 490 2023-04-29 13:32 service-0.3.124-SNAPSHOT/forAllCategoryWikis.sh | ||
+ | -rwxrwxr-x wf/wf 857 2023-04-29 13:32 service-0.3.124-SNAPSHOT/munge.sh | ||
+ | -rwxrwxr-x wf/wf 1133 2023-04-29 13:32 service-0.3.124-SNAPSHOT/loadCategoryDaily.sh | ||
+ | -rwxrwxr-x wf/wf 882 2023-04-29 13:32 service-0.3.124-SNAPSHOT/loadData.sh | ||
+ | -rw-rw-r-- wf/wf 3412 2023-04-29 13:32 service-0.3.124-SNAPSHOT/RWStore.properties | ||
+ | -rw-rw-r-- wf/wf 315 2023-04-29 13:32 service-0.3.124-SNAPSHOT/prefixes.conf | ||
+ | -rw-rw-r-- wf/wf 2202 2023-04-29 13:32 service-0.3.124-SNAPSHOT/ldf-config.json | ||
+ | -rwxrwxr-x wf/wf 949 2023-04-29 13:32 service-0.3.124-SNAPSHOT/loadCategoryDump.sh | ||
+ | -rwxrwxr-x wf/wf 181 2023-04-29 13:32 service-0.3.124-SNAPSHOT/summarizeEvents.sh | ||
+ | -rw-rw-r-- wf/wf 2307 2023-04-29 13:32 service-0.3.124-SNAPSHOT/mwservices.json | ||
+ | -rwxrwxr-x wf/wf 2167 2023-04-29 13:32 service-0.3.124-SNAPSHOT/runStreamingUpdater.sh | ||
+ | -rw-rw-r-- wf/wf 20669767 2023-04-29 13:50 service-0.3.124-SNAPSHOT/lib/wikidata-query-tools-0.3.124-SNAPSHOT-jar-with-dependencies.jar | ||
+ | -rw-rw-r-- wf/wf 20713599 2023-04-29 13:55 service-0.3.124-SNAPSHOT/lib/streaming-updater-consumer-0.3.124-SNAPSHOT-jar-with-dependencies.jar | ||
+ | -rw-rw-r-- wf/wf 34014443 2023-04-29 13:55 service-0.3.124-SNAPSHOT/lib/streaming-updater-producer-0.3.124-SNAPSHOT-jar-with-dependencies.jar | ||
+ | -rw-rw-r-- wf/wf 6143989 2023-04-29 13:45 service-0.3.124-SNAPSHOT/lib/logging/jetty-logging-0.3.124-SNAPSHOT-jar-with-dependencies.jar | ||
+ | drwxrwxr-x wf/wf 0 2023-04-29 13:32 service-0.3.124-SNAPSHOT/docs/ | ||
+ | drwxrwxr-x wf/wf 0 2023-04-29 13:32 service-0.3.124-SNAPSHOT/docs/diagrams/ | ||
+ | -rw-rw-r-- wf/wf 11435 2023-04-29 13:32 service-0.3.124-SNAPSHOT/docs/sparql-query-examples.md | ||
+ | -rw-rw-r-- wf/wf 9803 2023-04-29 13:32 service-0.3.124-SNAPSHOT/docs/exploring-linked-data.md | ||
+ | -rw-rw-r-- wf/wf 11358 2023-04-29 13:32 service-0.3.124-SNAPSHOT/docs/LICENSE.Apache | ||
+ | -rw-rw-r-- wf/wf 17986 2023-04-29 13:32 service-0.3.124-SNAPSHOT/docs/LICENSE.GPL | ||
+ | -rw-rw-r-- wf/wf 877 2023-04-29 13:32 service-0.3.124-SNAPSHOT/docs/diagrams/streaming-updater-components.puml | ||
+ | -rw-rw-r-- wf/wf 740 2023-04-29 13:32 service-0.3.124-SNAPSHOT/docs/diagrams/streaming-updater-sequence.puml | ||
+ | -rw-rw-r-- wf/wf 1086 2023-04-29 13:32 service-0.3.124-SNAPSHOT/docs/diagrams/wdqs-high-level.puml | ||
+ | -rw-rw-r-- wf/wf 1556 2023-04-29 13:32 service-0.3.124-SNAPSHOT/docs/diagrams/streaming-updater-deployment.puml | ||
+ | -rw-rw-r-- wf/wf 3014 2023-04-29 13:32 service-0.3.124-SNAPSHOT/docs/getting-started.md | ||
+ | -rw-rw-r-- wf/wf 341 2023-04-29 13:32 service-0.3.124-SNAPSHOT/docs/TODO.md | ||
+ | -rw-rw-r-- wf/wf 1546 2023-04-29 13:32 service-0.3.124-SNAPSHOT/docs/configs.md | ||
+ | -rw-rw-r-- wf/wf 476 2023-04-29 13:32 service-0.3.124-SNAPSHOT/docs/Categories.md | ||
+ | -rw-rw-r-- wf/wf 79416954 2023-04-29 13:49 service-0.3.124-SNAPSHOT/blazegraph-service-0.3.124-SNAPSHOT.war | ||
+ | -rw-rw-r-- wf/wf 9809699 2023-04-29 13:56 service-0.3.124-SNAPSHOT/mw-oauth-proxy-0.3.124-SNAPSHOT.war | ||
+ | -rw-rw-r-- wf/wf 7074499 2023-04-29 13:40 service-0.3.124-SNAPSHOT/jetty-runner-9.4.12.v20180830.jar | ||
+ | </source> | ||
+ | |||
+ | === Unpack and make available via symlink === | ||
+ | <source lang='bash' highlight='1,3'> | ||
+ | tar xvfz service-0.3.124-SNAPSHOT-dist.tar.gz | ||
+ | # in target directory | ||
+ | ln -s /home/wf/wikidata-query-rdf/dist/target/service-0.3.124-SNAPSHOT service | ||
+ | </source> | ||
+ | |||
+ | == calling munge.sh == | ||
+ | === domunge.sh === | ||
+ | <source lang='bash'> | ||
+ | #!/bin/bash | ||
+ | # WF 2023-04-29 | ||
+ | # start munge in background | ||
+ | bzcat latest-all.ttl.bz2 | service/munge.sh -f - -d data -- --skolemize | ||
+ | </source> | ||
+ | === start domunge.sh and show nohup.out log === | ||
+ | <source lang='bash' highlight='1-3'> | ||
+ | nohup ./domunge.sh & | ||
+ | tail -f nohup.out | ||
+ | #logback.classic pattern: %d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n | ||
+ | 14:23:31.529 [org.wikidata.query.rdf.tool.rdf.AsyncRDFHandler$RDFActionsReplayer] INFO org.wikidata.query.rdf.tool.Munge - Switching to data/wikidump-000000001.ttl.gz | ||
+ | 14:24:17.795 [org.wikidata.query.rdf.tool.rdf.AsyncRDFHandler$RDFActionsReplayer] INFO o.w.q.r.t.r.EntityMungingRdfHandler - Processed 10000 entities at (154, 93, 79) | ||
+ | ... | ||
+ | 17:10:05.507 [org.wikidata.query.rdf.tool.rdf.AsyncRDFHandler$RDFActionsReplayer] INFO o.w.q.r.t.r.EntityMungingRdfHandler - Processed 10000000 entities at (936, 919, 1011) | ||
+ | ... | ||
+ | 19:56:14.353 [org.wikidata.query.rdf.tool.rdf.AsyncRDFHandler$RDFActionsReplayer] INFO o.w.q.r.t.r.EntityMungingRdfHandler - Processed 20000000 entities at (1191, 1097, 979) | ||
+ | ... | ||
+ | 22:21:33.096 [org.wikidata.query.rdf.tool.rdf.AsyncRDFHandler$RDFActionsReplayer] INFO o.w.q.r.t.r.EntityMungingRdfHandler - Processed 30000000 entities at (1228, 1136, 1221) | ||
+ | ... | ||
+ | 01:10:14.808 [org.wikidata.query.rdf.tool.rdf.AsyncRDFHandler$RDFActionsReplayer] INFO o.w.q.r.t.r.EntityMungingRdfHandler - Processed 40000000 entities at (766, 764, 988) | ||
+ | ... | ||
+ | 03:55:33.809 [org.wikidata.query.rdf.tool.rdf.AsyncRDFHandler$RDFActionsReplayer] INFO o.w.q.r.t.r.EntityMungingRdfHandler - Processed 50000000 entities at (786, 779, 893) | ||
+ | ... | ||
+ | 06:38:44.146 [org.wikidata.query.rdf.tool.rdf.AsyncRDFHandler$RDFActionsReplayer] INFO o.w.q.r.t.r.EntityMungingRdfHandler - Processed 60000000 entities at (1343, 1272, 1081) | ||
+ | ... | ||
+ | 09:07:55.295 [org.wikidata.query.rdf.tool.rdf.AsyncRDFHandler$RDFActionsReplayer] INFO o.w.q.r.t.r.EntityMungingRdfHandler - Processed 70000000 entities at (527, 804, 1053) | ||
+ | ... | ||
+ | 11:56:15.113 [org.wikidata.query.rdf.tool.rdf.AsyncRDFHandler$RDFActionsReplayer] INFO o.w.q.r.t.r.EntityMungingRdfHandler - Processed 80000000 entities at (1103, 944, 993) | ||
+ | ... | ||
+ | 14:40:23.875 [org.wikidata.query.rdf.tool.rdf.AsyncRDFHandler$RDFActionsReplayer] INFO o.w.q.r.t.r.EntityMungingRdfHandler - Processed 90000000 entities at (903, 850, 889) | ||
+ | ... | ||
+ | 17:19:18.387 [org.wikidata.query.rdf.tool.rdf.AsyncRDFHandler$RDFActionsReplayer] INFO o.w.q.r.t.r.EntityMungingRdfHandler - Processed 100000000 entities at (1416, 1392, 1182) | ||
+ | ... | ||
+ | 18:42:24.907 [org.wikidata.query.rdf.tool.rdf.AsyncRDFHandler$RDFActionsReplayer] INFO o.w.q.r.t.r.EntityMungingRdfHandler - Processed 105730000 entities at (2206, 1688, 1409) | ||
+ | </source> | ||
+ | |||
+ | === check created munge files === | ||
+ | <source lang='bash' highlight='1,7'> | ||
+ | grep gz nohup.out | cut -f9 -d" " | tail -5 | ||
+ | data/wikidump-000001054.ttl.gz | ||
+ | data/wikidump-000001055.ttl.gz | ||
+ | data/wikidump-000001056.ttl.gz | ||
+ | data/wikidump-000001057.ttl.gz | ||
+ | data/wikidump-000001058.ttl.gz | ||
+ | du -sm data/split/ | ||
+ | 109978 data/split/ | ||
+ | </source> | ||
+ | |||
+ | === inspect a sample file === | ||
+ | <source lang='bash' highlight='1'> | ||
+ | zcat data/wikidump-000000702.ttl.gz | head -40 | ||
+ | @prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> . | ||
+ | @prefix xsd: <http://www.w3.org/2001/XMLSchema#> . | ||
+ | @prefix ontolex: <http://www.w3.org/ns/lemon/ontolex#> . | ||
+ | @prefix dct: <http://purl.org/dc/terms/> . | ||
+ | @prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> . | ||
+ | @prefix owl: <http://www.w3.org/2002/07/owl#> . | ||
+ | @prefix wikibase: <http://wikiba.se/ontology#> . | ||
+ | @prefix skos: <http://www.w3.org/2004/02/skos/core#> . | ||
+ | @prefix schema: <http://schema.org/> . | ||
+ | @prefix cc: <http://creativecommons.org/ns#> . | ||
+ | @prefix geo: <http://www.opengis.net/ont/geosparql#> . | ||
+ | @prefix prov: <http://www.w3.org/ns/prov#> . | ||
+ | @prefix wd: <http://www.wikidata.org/entity/> . | ||
+ | @prefix data: <https://www.wikidata.org/wiki/Special:EntityData/> . | ||
+ | @prefix s: <http://www.wikidata.org/entity/statement/> . | ||
+ | @prefix ref: <http://www.wikidata.org/reference/> . | ||
+ | @prefix v: <http://www.wikidata.org/value/> . | ||
+ | @prefix wdt: <http://www.wikidata.org/prop/direct/> . | ||
+ | @prefix wdtn: <http://www.wikidata.org/prop/direct-normalized/> . | ||
+ | @prefix p: <http://www.wikidata.org/prop/> . | ||
+ | @prefix ps: <http://www.wikidata.org/prop/statement/> . | ||
+ | @prefix psv: <http://www.wikidata.org/prop/statement/value/> . | ||
+ | @prefix psn: <http://www.wikidata.org/prop/statement/value-normalized/> . | ||
+ | @prefix pq: <http://www.wikidata.org/prop/qualifier/> . | ||
+ | @prefix pqv: <http://www.wikidata.org/prop/qualifier/value/> . | ||
+ | @prefix pqn: <http://www.wikidata.org/prop/qualifier/value-normalized/> . | ||
+ | @prefix pr: <http://www.wikidata.org/prop/reference/> . | ||
+ | @prefix prv: <http://www.wikidata.org/prop/reference/value/> . | ||
+ | @prefix prn: <http://www.wikidata.org/prop/reference/value-normalized/> . | ||
+ | @prefix wdno: <http://www.wikidata.org/prop/novalue/> . | ||
+ | |||
+ | <https://ceb.wikipedia.org/wiki/R%C3%ADo_Pitara> a schema:Article ; | ||
+ | schema:about wd:Q35416151 ; | ||
+ | schema:inLanguage "ceb" ; | ||
+ | schema:isPartOf <https://ceb.wikipedia.org/> ; | ||
+ | schema:name "Río Pitara"@ceb . | ||
+ | |||
+ | wd:Q35416151 wdt:P625 "Point(-67.2225 9.3544444444444)"^^geo:wktLiteral ; | ||
+ | wdt:P31 wd:Q47521 ; | ||
+ | wdt:P1566 "3630072" ; | ||
+ | </source> | ||
+ | see https://www.wikidata.org/wiki/Q35416151 | ||
+ | |||
+ | = Loading = | ||
+ | == move files to split directory == | ||
+ | We didn't quite follow the getting-started guide, so we need to fix the location of the munged files | ||
+ | <source lang='bash'> | ||
+ | mkdir split | ||
+ | wikidata/data$ mv wiki* split | ||
+ | </source> | ||
+ | == prepare log directory == | ||
+ | <source lang='bash'> | ||
+ | sudo mkdir -p /var/log/wdqs/ | ||
+ | sudo chown $(id -un) /var/log/wdqs/ | ||
+ | </source> | ||
+ | == mv service to TB harddisk == | ||
+ | <source lang='bash'> | ||
+ | mv service s | ||
+ | mkdir service | ||
+ | mv s/* service | ||
+ | rm s | ||
+ | </source> | ||
+ | |||
+ | == start blazegraph == | ||
+ | <source lang='bash'> | ||
+ | nohup service/runBlazegraph.sh 2>&1 > blazegraph.log& | ||
+ | </source> | ||
+ | == loadall.sh == | ||
+ | This is the script I used in 2018: | ||
+ | <source lang='bash'> | ||
+ | #!/usr/bin/env bash | ||
+ | # load all data | ||
+ | START=1 | ||
+ | END=100000 | ||
+ | FORMAT=wikidump-%09d.ttl.gz | ||
+ | LOCATION=$(pwd)/data/split | ||
+ | BASE=$(dirname $0) | ||
+ | cd $BASE | ||
+ | |||
+ | |||
+ | while getopts s:e:d:h option | ||
+ | do | ||
+ | case "${option}" | ||
+ | in | ||
+ | s) START=${OPTARG};; | ||
+ | e) END=${OPTARG};; | ||
+ | d) LOCATION=${OPTARG};; | ||
+ | h) | ||
+ | echo "Usage: $0 [-s <start>] [-e <end>] [-d <directory>] [-h]" | ||
+ | exit 1 | ||
+ | ;; | ||
+ | esac | ||
+ | done | ||
+ | |||
+ | i=$START | ||
+ | while [ $i -le $END ]; do | ||
+ | printf -v f $FORMAT $i | ||
+ | if [ -f "$LOCATION/$f.good" ] | ||
+ | then | ||
+ | echo File $LOCATION/$f already imported | ||
+ | else | ||
+ | if [ ! -f "$LOCATION/$f" ] | ||
+ | then | ||
+ | echo File $LOCATION/$f not found, terminating | ||
+ | exit 0 | ||
+ | else | ||
+ | ts=$(date -Iseconds) | ||
+ | echo Processing $f at $ts | ||
+ | ./loadRestAPI.sh -n wdq -d "$LOCATION/$f" | ||
+ | fi | ||
+ | fi | ||
+ | let i++ | ||
+ | done | ||
+ | </source> | ||
+ | |||
+ | == load == | ||
+ | <source lang='bash'> | ||
+ | nohup service/loadRestAPI.sh -n wdq -d `pwd`/data/split& | ||
+ | </source> | ||
+ | = Logfile issue = | ||
+ | The logfile size grows far too quickly with the default log settings. | ||
+ | |||
+ | For testing I reactivated the 128 GB RAM machine I used for the QLever tests last year. | ||
+ | == Download munge files == | ||
+ | <source lang='bash'> | ||
+ | cat data/split/getall | ||
+ | #!/bin/bash | ||
+ | # WF 2023-05-02 | ||
+ | base=http://wikidata.dbis.rwth-aachen.de/downloads/split/ | ||
+ | for i in {0001..1058} | ||
+ | do | ||
+ | file=wikidump-00000$i.ttl.gz | ||
+ | url=$base/wikidump-00000$i.ttl.gz | ||
+ | if [ ! -f $file ] | ||
+ | then | ||
+ | wget $url | ||
+ | else | ||
+ | echo "$file ✅" | ||
+ | fi | ||
+ | done | ||
+ | </source> | ||
+ | == Run load for a single file == | ||
+ | <source lang='bash'> | ||
+ | service/loadall.sh -s 1 -e 1 | ||
+ | Processing wikidump-000000001.ttl.gz at 2023-05-02T18:31:50+02:00 | ||
+ | Loading with properties... | ||
+ | quiet=false | ||
+ | verbose=0 | ||
+ | closure=false | ||
+ | durableQueues=true | ||
+ | #Needed for quads | ||
+ | #defaultGraph= | ||
+ | com.bigdata.rdf.store.DataLoader.flush=false | ||
+ | com.bigdata.rdf.store.DataLoader.bufferCapacity=100000 | ||
+ | com.bigdata.rdf.store.DataLoader.queueCapacity=10 | ||
+ | #Namespace to load | ||
+ | namespace=wdq | ||
+ | #Files to load | ||
+ | fileOrDirs=/hd/seel/wikidata/data/split/wikidump-000000001.ttl.gz | ||
+ | #Property file (if creating a new namespace) | ||
+ | propertyFile=/hd/seel/wikidata/service/RWStore.properties | ||
+ | </source> | ||
+ | == Check size of log file == | ||
+ | The log file grows at about 26 MByte/s - that is far too much. E.g. this smaller 128 GB test machine only has a 2 TB SSD in the first place. | ||
+ | <source lang='bash' highlight='1,4'> | ||
+ | date;du -sm blazegraph.log; | ||
+ | Di 2. Mai 18:37:00 CEST 2023 | ||
+ | 9603 blazegraph.log | ||
+ | date;du -sm blazegraph.log; | ||
+ | Di 2. Mai 18:37:02 CEST 2023 | ||
+ | 9637 blazegraph.log | ||
+ | date;du -sm blazegraph.log; | ||
+ | Di 2. Mai 18:37:51 CEST 2023 | ||
+ | 10951 blazegraph.log | ||
+ | </source> | ||
+ | == Logback.xml candidates == | ||
+ | === Generated from ERB template === | ||
+ | generated from https://github.com/wikimedia/operations-puppet/blob/production/modules/query_service/templates/logback.xml.erb: | ||
+ | * https://phabricator.wikimedia.org/P47293 | ||
+ | ==== logback.xml ==== | ||
+ | <source lang='xml'> | ||
+ | <?xml version="1.0" encoding="UTF-8"?> | ||
+ | <configuration scan="true" scanPeriod="5 minutes" packagingData="false"> | ||
+ | |||
+ | <!-- ugly trick to ensure ${HOSTNAME} is evaluated --> | ||
+ | <property scope="context" name="hostname" value="${HOSTNAME}" /> | ||
+ | |||
+ | <!-- | ||
+ | File based logs: | ||
+ | * rolling every day or when size > 100MB | ||
+ | --> | ||
+ | <appender name="file" class="ch.qos.logback.core.rolling.RollingFileAppender"> | ||
+ | <file>PATH/TO/LOGS/rdf-query-service.log</file> | ||
+ | <rollingPolicy class="ch.qos.logback.core.rolling.SizeAndTimeBasedRollingPolicy"> | ||
+ | <!-- daily rollover --> | ||
+ | <fileNamePattern>PATH/TO/LOGS/rdf-query-service.%d{yyyy-MM-dd}.%i.log.gz</fileNamePattern> | ||
+ | <maxFileSize>100MB</maxFileSize> | ||
+ | <maxHistory>30</maxHistory> | ||
+ | </rollingPolicy> | ||
+ | <filter class="org.wikidata.query.rdf.common.log.PerLoggerThrottler" /> | ||
+ | <encoder class="ch.qos.logback.classic.encoder.PatternLayoutEncoder"> | ||
+ | <pattern>%d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg %mdc%n%rEx{1,QUERY_TIMEOUT,SYNTAX_ERROR}</pattern> | ||
+ | <outputPatternAsHeader>true</outputPatternAsHeader> | ||
+ | </encoder> | ||
+ | </appender> | ||
+ | <appender name="async-file" class="ch.qos.logback.classic.AsyncAppender"> | ||
+ | <neverBlock>true</neverBlock> | ||
+ | <appender-ref ref="file" /> | ||
+ | </appender> | ||
+ | |||
+ | <!-- | ||
+ | Console based logs: | ||
+ | * per logger / message throttling is enabled | ||
+ | * limited to 10 messages per second | ||
+ | * level => ERROR | ||
+ | --> | ||
+ | <appender name="stdout" class="ch.qos.logback.core.ConsoleAppender"> | ||
+ | <filter class="org.wikidata.query.rdf.common.log.PerLoggerThrottler" /> | ||
+ | <filter class="org.wikidata.query.rdf.common.log.RateLimitFilter"> | ||
+ | <bucketCapacity>10</bucketCapacity> | ||
+ | <refillIntervalInMillis>1000</refillIntervalInMillis> | ||
+ | </filter> | ||
+ | <filter class="ch.qos.logback.classic.filter.ThresholdFilter"> | ||
+ | <level>error</level> | ||
+ | </filter> | ||
+ | <encoder class="ch.qos.logback.classic.encoder.PatternLayoutEncoder"> | ||
+ | <pattern>%d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg %mdc%n%rEx{1,QUERY_TIMEOUT,SYNTAX_ERROR}</pattern> | ||
+ | <outputPatternAsHeader>true</outputPatternAsHeader> | ||
+ | </encoder> | ||
+ | </appender> | ||
+ | <appender name="async-stdout" class="ch.qos.logback.classic.AsyncAppender"> | ||
+ | <neverBlock>true</neverBlock> | ||
+ | <appender-ref ref="stdout" /> | ||
+ | </appender> | ||
+ | |||
+ | <root level="info"> | ||
+ | <appender-ref ref="async-file"/> | ||
+ | <appender-ref ref="async-stdout"/> | ||
+ | </root> | ||
+ | |||
+ | <logger name="org.wikidata.query.rdf" level="info"/> | ||
+ | <logger name="org.wikidata.query.rdf.blazegraph.inline.literal.AbstractMultiTypeExtension" level="error"/> | ||
+ | <logger name="com.bigdata" level="warn"/> | ||
+ | <logger name="com.bigdata.util.concurrent.Haltable" level="off"/> | ||
+ | <logger name="com.bigdata.rdf.internal.LexiconConfiguration" level="off"/> <!-- disabled temp. ref: T207643 --> | ||
+ | |||
+ | </configuration> | ||
+ | </source> | ||
+ | |||
+ | ==== Sample configuration ==== | ||
+ | sample from https://logback.qos.ch/manual/configuration.html: | ||
+ | <source lang='xml'> | ||
+ | <configuration> | ||
+ | |||
+ | <appender name="STDOUT" class="ch.qos.logback.core.ConsoleAppender"> | ||
+ | <!-- encoders are assigned the type | ||
+ | ch.qos.logback.classic.encoder.PatternLayoutEncoder by default --> | ||
+ | <encoder> | ||
+ | <pattern>%d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} -%kvp- %msg%n</pattern> | ||
+ | </encoder> | ||
+ | </appender> | ||
+ | |||
+ | <root level="debug"> | ||
+ | <appender-ref ref="STDOUT" /> | ||
+ | </root> | ||
+ | </configuration> | ||
+ | </source> | ||
+ | |||
+ | == retries == | ||
+ | === with simple logback.xml according to the sample configuration === | ||
+ | <source lang='bash'> | ||
+ | export LOG_CONFIG=/hd/seel/wikidata/logback.xml | ||
+ | nohup service/runBlazegraph.sh 2>&1 > blazegraph.log& | ||
+ | service/loadall.sh -s 1 -e 1 | ||
+ | date;du -sm blazegraph.log | ||
+ | Di 2. Mai 19:15:40 CEST 2023 | ||
+ | 1630 blazegraph.log | ||
+ | du -sm blazegraph.log | ||
+ | Di 2. Mai 19:15:42 CEST 2023 | ||
+ | 1654 blazegraph.log | ||
+ | </source> | ||
+ | |||
+ | === with pasted logback.xml === | ||
+ | The directory below /hd is named seel on the host sun and eneco on the dbis wikidata host | ||
+ | <source lang='bash'> | ||
+ | pgrep -fl java | ||
+ | 66671 java | ||
+ | kill 66671 | ||
+ | rm service/wikidata.jnl | ||
+ | wget https://phab.wmfusercontent.org/file/data/cz4q6ew7kksba6k5ocyx/PHID-FILE-eevhah5inj6jl2sy3nas/basic_rdf_query_service_logback.xml | ||
+ | export LOG_CONFIG=/hd/eneco/wikidata/basic_rdf_query_service_logback.xml | ||
+ | ls -l $(echo $LOG_CONFIG) | ||
+ | -rw-rw-r-- 1 wf wf 3030 May 3 09:23 /hd/eneco/wikidata/basic_rdf_query_service_logback.xml | ||
+ | nohup service/runBlazegraph.sh 2>&1 > blazegraph.log& | ||
+ | wf@sun:/hd/seel/wikidata/data/split$ mv wikidump-000000001.ttl.gz.fail wikidump-000000001.ttl.gz | ||
+ | service/loadall.sh -s 1 -e 1 | ||
+ | # in another terminal: | ||
+ | ... | ||
+ | date;du -sm service/wikidata.jnl | ||
+ | Di 2. Mai 19:28:12 CEST 2023 | ||
+ | 1530 service/wikidata.jnl | ||
+ | date;du -sm service/wikidata.jnl | ||
+ | Di 2. Mai 19:28:15 CEST 2023 | ||
+ | 1557 service/wikidata.jnl | ||
+ | <?xml version="1.0"?><data modified="0" milliseconds="483615"/> | ||
+ | wikidump-000000001.ttl.gz.good | ||
+ | </source> | ||
+ | |||
+ | === with fixed logback.xml === | ||
+ | ==== check fix ==== | ||
+ | <source lang='bash'> | ||
+ | diff logback.xml basic_rdf_query_service_logback.xml | ||
+ | 12c12 | ||
+ | < <file>/var/log/wdqs/rdf-query-service.log</file> | ||
+ | --- | ||
+ | > <file>PATH/TO/LOGS/rdf-query-service.log</file> | ||
+ | 15c15 | ||
+ | < <fileNamePattern>/var/log/wdqs/rdf-query-service.%d{yyyy-MM-dd}.%i.log.gz</fileNamePattern> | ||
+ | --- | ||
+ | > <fileNamePattern>PATH/TO/LOGS/rdf-query-service.%d{yyyy-MM-dd}.%i.log.gz</fileNamePattern> | ||
+ | </source> | ||
+ | ==== restart blazegraph ==== | ||
+ | <source lang='bash'> | ||
+ | pgrep -fla blazegraph-service | ||
+ | pkill -f blazegraph-service | ||
+ | rm blazegraph.log | ||
+ | export LOG_CONFIG=/hd/eneco/wikidata/logback.xml | ||
+ | nohup service/runBlazegraph.sh 2>&1 > blazegraph.log& | ||
+ | ls -l service/wikidata.jnl | ||
+ | -rw-rw-r-- 1 wf wf 209715200 May 3 09:41 service/wikidata.jnl | ||
+ | </source> | ||
+ | |||
+ | |||
+ | = Progress = | ||
+ | == Count triples in blazegraph == | ||
+ | |||
+ | === sparql query === | ||
+ | <source lang='sparql'> | ||
+ | SELECT ( COUNT( * ) AS ?count ) { ?s ?p ?o } | ||
+ | </source> | ||
+ | |||
+ | === queries.yaml === | ||
+ | in $HOME/.pylodstorage | ||
+ | |||
+ | <source lang='yaml'> | ||
+ | 'triplecount': | ||
+ | sparql: | | ||
+ | SELECT ( COUNT( * ) AS ?count ) { ?s ?p ?o } | ||
+ | </source> | ||
+ | === endpoints.yaml === | ||
+ | <source lang='yaml'> | ||
+ | # SPARQL endpoints for sparqlquery tool | ||
+ | # 2023-05-03 WF | ||
+ | 'wdimport': | ||
+ | endpoint: http://localhost:9999/bigdata/namespace/wdq/sparql | ||
+ | website: http://blazegraph.wikidata.dbis.rwth-aachen.de | ||
+ | database: blazegraph | ||
+ | lang: sparql | ||
+ | prefixes: | | ||
+ | PREFIX bd: <http://www.bigdata.com/rdf#> | ||
+ | PREFIX cc: <http://creativecommons.org/ns#> | ||
+ | PREFIX dct: <http://purl.org/dc/terms/> | ||
+ | PREFIX geo: <http://www.opengis.net/ont/geosparql#> | ||
+ | PREFIX ontolex: <http://www.w3.org/ns/lemon/ontolex#> | ||
+ | PREFIX owl: <http://www.w3.org/2002/07/owl#> | ||
+ | PREFIX p: <http://www.wikidata.org/prop/> | ||
+ | PREFIX pq: <http://www.wikidata.org/prop/qualifier/> | ||
+ | PREFIX pqn: <http://www.wikidata.org/prop/qualifier/value-normalized/> | ||
+ | PREFIX pqv: <http://www.wikidata.org/prop/qualifier/value/> | ||
+ | PREFIX pr: <http://www.wikidata.org/prop/reference/> | ||
+ | PREFIX prn: <http://www.wikidata.org/prop/reference/value-normalized/> | ||
+ | PREFIX prov: <http://www.w3.org/ns/prov#> | ||
+ | PREFIX prv: <http://www.wikidata.org/prop/reference/value/> | ||
+ | PREFIX ps: <http://www.wikidata.org/prop/statement/> | ||
+ | PREFIX psn: <http://www.wikidata.org/prop/statement/value-normalized/> | ||
+ | PREFIX psv: <http://www.wikidata.org/prop/statement/value/> | ||
+ | PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> | ||
+ | PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#> | ||
+ | PREFIX schema: <http://schema.org/> | ||
+ | PREFIX skos: <http://www.w3.org/2004/02/skos/core#> | ||
+ | PREFIX wd: <http://www.wikidata.org/entity/> | ||
+ | PREFIX wdata: <http://www.wikidata.org/wiki/Special:EntityData/> | ||
+ | PREFIX wdno: <http://www.wikidata.org/prop/novalue/> | ||
+ | PREFIX wdref: <http://www.wikidata.org/reference/> | ||
+ | PREFIX wds: <http://www.wikidata.org/entity/statement/> | ||
+ | PREFIX wdt: <http://www.wikidata.org/prop/direct/> | ||
+ | PREFIX wdtn: <http://www.wikidata.org/prop/direct-normalized/> | ||
+ | PREFIX wdv: <http://www.wikidata.org/value/> | ||
+ | PREFIX wikibase: <http://wikiba.se/ontology#> | ||
+ | PREFIX xsd: <http://www.w3.org/2001/XMLSchema#> | ||
+ | </source> | ||
+ | |||
+ | === test sparqlquery command line tool === | ||
+ | <source lang='bash'> | ||
+ | sparqlquery -qn triplecount -en wdimport -f json | ||
+ | [ | ||
+ | { | ||
+ | "count": 457000496 | ||
+ | } | ||
+ | ] | ||
+ | </source> | ||
+ | |||
+ | == stats / ETA script == | ||
+ | using the sparqlquery command line tool from https://pypi.org/project/pylodstorage/ | ||
+ | <source lang='bash'> | ||
+ | #!/bin/bash | ||
+ | # WF 2023-05-03 | ||
+ | # statistics for wikidata loading | ||
+ | log=/tmp/log$$ | ||
+ | cat loadall.log > $log | ||
+ | cat nohup.out >>$log | ||
+ | isodate=$(date -u +"%Y-%m-%dT%H:%M:%SZ") | ||
+ | triples=$(sparqlquery -en wdimport -qn triplecount -f json | jq ".[].count") | ||
+ | cat $log | awk -v isodate=$isodate -v total_files=1058 -v triples=$triples ' | ||
+ | BEGIN { | ||
+ | FS="=" | ||
+ | printf(" #: load s total s avg s ETA h\n"); | ||
+ | } | ||
+ | /fileOrDirs=/ { | ||
+ | filenum=get_number($2) | ||
+ | next | ||
+ | } | ||
+ | /<data modified/ { | ||
+ | msecs=get_number($4) | ||
+ | sec=msecs/1000 | ||
+ | totals+=sec | ||
+ | avg=totals/filenum | ||
+ | eta=(total_files-filenum)*avg/3600 | ||
+ | printf("%4d: %6d %8d %6.0f %8.1f\n",filenum,sec,totals,avg,eta); | ||
+ | } | ||
+ | function get_number(num_str) { | ||
+ | if (match(num_str, /[0-9]+/)) { | ||
+ | num=substr(num_str, RSTART, RLENGTH) | ||
+ | return num | ||
+ | } | ||
+ | } | ||
+ | END { | ||
+ | printf("%s:%d\n",isodate,triples) | ||
+ | printf("%6.3f bill triples %6.0f triples/s\n",triples/1000000000,triples/totals) | ||
+ | } | ||
+ | ' | ||
+ | rm $log | ||
+ | </source> | ||
+ | |||
+ | === progress on wikidata dbis === | ||
+ | Triples: | ||
+ | <pre> | ||
+ | 2023-05-03T09:38:00Z: 288488184 | ||
+ | 2023-05-03T11:22:52Z: 476800190 | ||
+ | 2023-05-03T16:17:41Z: 874762589 | ||
+ | 2023-05-04T05:51:17Z:1566635203 | ||
+ | 2023-05-05T04:48:47Z:2320966288 | ||
+ | 2023-05-06T05:20:33Z:2921548912 | ||
+ | 2023-05-07T06:59:52Z:3406082541 | ||
+ | 2023-05-08T03:32:44Z:3750926849 | ||
+ | 2023-05-09T04:44:11Z:4004639109 | ||
+ | 2023-05-11T04:01:02Z:439241844 | ||
+ | recent: 2278 triples/s ETA: 52.5 days | ||
+ | </pre> | ||
+ | <source lang='bash'> | ||
+ | ./stats | ||
+ | #: load s total s avg s ETA h | ||
+ | 1: 331 331 331 97.2 | ||
+ | 2: 355 686 343 100.7 | ||
+ | 3: 394 1081 360 105.6 | ||
+ | 4: 445 1526 382 111.8 | ||
+ | 5: 404 1931 386 113.0 | ||
+ | 6: 387 2319 387 113.0 | ||
+ | 7: 401 2720 389 113.5 | ||
+ | 8: 426 3147 393 114.7 | ||
+ | 9: 435 3582 398 116.0 | ||
+ | 10: 200 3782 378 110.1 | ||
+ | 11: 201 3984 362 105.4 | ||
+ | 12: 213 4198 350 101.7 | ||
+ | 13: 318 4517 347 100.9 | ||
+ | 14: 278 4795 343 99.3 | ||
+ | 15: 232 5028 335 97.1 | ||
+ | 16: 160 5188 324 93.9 | ||
+ | 17: 372 5560 327 94.6 | ||
+ | 18: 425 5986 333 96.1 | ||
+ | 19: 421 6407 337 97.3 | ||
+ | 20: 359 6767 338 97.6 | ||
+ | 21: 260 7027 335 96.4 | ||
+ | 22: 311 7339 334 96.0 | ||
+ | 23: 308 7647 333 95.6 | ||
+ | 24: 290 7938 331 95.0 | ||
+ | 25: 375 8314 333 95.4 | ||
+ | 26: 274 8589 330 94.7 | ||
+ | 27: 266 8855 328 93.9 | ||
+ | 28: 432 9287 332 94.9 | ||
+ | 29: 275 9563 330 94.3 | ||
+ | 30: 210 9773 326 93.0 | ||
+ | 31: 549 10323 333 95.0 | ||
+ | 32: 629 10952 342 97.5 | ||
+ | 33: 575 11527 349 99.5 | ||
+ | 34: 429 11957 352 100.0 | ||
+ | 35: 627 12584 360 102.2 | ||
+ | 36: 317 12902 358 101.7 | ||
+ | 37: 180 13082 354 100.3 | ||
+ | ... | ||
+ | 54: 1060 30615 567 158.1 | ||
+ | ... | ||
+ | 106: 2458 77958 735 194.5 | ||
+ | ... | ||
+ | 170: 929 160254 943 232.5 | ||
+ | ... | ||
+ | 200: 1756 250171 1251 298.1 | ||
+ | ... | ||
+ | 239: 3384 339178 1419 322.9 | ||
+ | ... | ||
+ | 291: 3704 503140 1729 368.4 | ||
+ | ... | ||
+ | 311: 8835 673480 2166 449.3 | ||
</source> | </source> |
Latest revision as of 07:52, 15 May 2023
Import
Import | |
---|---|
edit | |
state | ❌ |
url | https://wiki.bitplan.com/index.php/Wikidata_Import_2023-04-26 |
target | blazegraph |
start | 2023-04-26 |
end | |
days | |
os | Ubuntu 22.04.2 LTS |
cpu | Intel(R) Xeon(R) Gold 6326 CPU @ 2.90GHz |
ram | 256 |
triples | 14.7 |
comment | target disk is a rotating disk which is an order of magnitude slower see https://github.com/blazegraph/database/wiki/IOOptimization |
Download ~9 hours
Download Options
https://dumps.wikimedia.org/wikidatawiki/entities
latest-all.json.bz2 19-Apr-2023 19:01 81437052900 latest-all.json.gz 26-Apr-2023 06:43 123717867013 latest-all.nt.bz2 20-Apr-2023 09:17 158037435620 latest-all.nt.gz 19-Apr-2023 15:33 204694424758 latest-all.ttl.bz2 19-Apr-2023 20:37 101383518288 latest-all.ttl.gz 26-Apr-2023 08:18 123942927864 latest-lexemes.json.bz2 26-Apr-2023 03:51 297892886 latest-lexemes.json.gz 26-Apr-2023 03:49 407135019 latest-lexemes.nt.bz2 21-Apr-2023 23:33 768095633 latest-lexemes.nt.gz 21-Apr-2023 23:28 1008192049 latest-lexemes.ttl.bz2 21-Apr-2023 23:29 433401231 latest-lexemes.ttl.gz 21-Apr-2023 23:25 540610049 latest-truthy.nt.bz2 21-Apr-2023 17:41 35992719959 latest-truthy.nt.gz 21-Apr-2023 14:24 59704444949
download result
ls -l latest*.gz
-rw-rw-r-- 1 wf wf 123942927864 Apr 26 10:18 latest-all.ttl.gz
-rw-rw-r-- 1 wf wf 540610049 Apr 22 01:25 latest-lexemes.ttl.gz
download script
cat download.sh
#!/bin/bash
# WF 2023-04-26
# download wikidata dumps
#
# Starts one background wget per dump file (latest-all and latest-lexemes,
# each as ttl.gz and ttl.bz2) and appends the wget output to <file>-<ext>.log
# in the current directory.
baseurl=https://dumps.wikimedia.org/wikidatawiki/entities/
for file in latest-all latest-lexemes
do
  for ext in ttl.gz ttl.bz2
  do
    # baseurl already ends with "/", so the URL contains "entities//";
    # kept unchanged so the requested URLs stay the same as before
    url="$baseurl/$file.$ext"
    log="$file-$ext.log"
    # wget reports progress on stderr: redirect it explicitly so the log is
    # also filled when the script is not started from a terminal
    nohup wget "$url" >> "$log" 2>&1 &
  done
done
download logs
latest-all.ttl.gz 123942927864 8h52m ✓
--2023-04-26 15:38:37-- https://dumps.wikimedia.org/wikidatawiki/entities//latest-all.ttl.gz
Resolving dumps.wikimedia.org (dumps.wikimedia.org)... 208.80.154.142, 2620:0:861:2:208:80:154:142
Connecting to dumps.wikimedia.org (dumps.wikimedia.org)|208.80.154.142|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 123942927864 (115G) [application/octet-stream]
Saving to: ‘latest-all.ttl.gz’
0K .......... .......... .......... .......... .......... 0% 335K 4d4h
50K .......... .......... .......... .......... .......... 0% 220K 5d6h
100K .......... .......... .......... .......... .......... 0% 438K 4d14h
...
121037950K .......... .......... .......... .......... .......... 99% 3.91M 0s
121038000K .......... ..... 100% 181M=8h52m
2023-04-27 00:31:28 (3.70 MB/s) - ‘latest-all.ttl.gz’ saved [123942927864/123942927864]
latest-all.ttl.bz2 101383518288 7h27m ✓
--2023-04-26 15:38:37-- https://dumps.wikimedia.org/wikidatawiki/entities//latest-all.ttl.bz2
Resolving dumps.wikimedia.org (dumps.wikimedia.org)... 208.80.154.142, 2620:0:861:2:208:80:154:142
Connecting to dumps.wikimedia.org (dumps.wikimedia.org)|208.80.154.142|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 101383518288 (94G) [application/octet-stream]
Saving to: ‘latest-all.ttl.bz2’
0K .......... .......... .......... .......... .......... 0% 219K 5d5h
50K .......... .......... .......... .......... .......... 0% 219K 5d5h
100K .......... .......... .......... .......... .......... 0% 437K 4d8h
99007250K .......... .......... .......... .......... .......... 99% 2.17M 0s
99007300K .......... .......... .......... .......... .. 100% 2.45M=7h27m
2023-04-26 23:06:17 (3.60 MB/s) - ‘latest-all.ttl.bz2’ saved [101383518288/101383518288]
latest-lexemes.ttl.gz 540610049 2m1s ✓
--2023-04-26 15:38:37-- https://dumps.wikimedia.org/wikidatawiki/entities//latest-lexemes.ttl.gz
Resolving dumps.wikimedia.org (dumps.wikimedia.org)... 208.80.154.142, 2620:0:861:2:208:80:154:142
Connecting to dumps.wikimedia.org (dumps.wikimedia.org)|208.80.154.142|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 540610049 (516M) [application/octet-stream]
Saving to: ‘latest-lexemes.ttl.gz’
0K .......... .......... .......... .......... .......... 0% 355K 24m45s
50K .......... .......... .......... .......... .......... 0% 209K 33m23s
100K .......... .......... .......... .......... .......... 0% 416K 29m18s
527850K .......... .......... .......... .......... .......... 99% 62.1M 0s
527900K .......... .......... .......... ......... 100% 23.7M=2m1s
2023-04-26 15:40:39 (4.27 MB/s) - ‘latest-lexemes.ttl.gz’ saved [540610049/540610049]
latest-lexemes.ttl.bz2 43340123 1m45s ✓
attempt by script ❌
--2023-04-26 15:38:37-- https://dumps.wikimedia.org/wikidatawiki/entities//latest-lexemes.ttl.bz2
Resolving dumps.wikimedia.org (dumps.wikimedia.org)... 208.80.154.142, 2620:0:861:2:208:80:154:142
Connecting to dumps.wikimedia.org (dumps.wikimedia.org)|208.80.154.142|:443... connected.
HTTP request sent, awaiting response... 503 Service Temporarily Unavailable
2023-04-26 15:38:38 ERROR 503: Service Temporarily Unavailable.
manual retry latest-lexemes.ttl.bz2
wget https://dumps.wikimedia.org/wikidatawiki/entities/latest-lexemes.ttl.bz2
--2023-04-28 13:44:11-- https://dumps.wikimedia.org/wikidatawiki/entities/latest-lexemes.ttl.bz2
Resolving dumps.wikimedia.org (dumps.wikimedia.org)... 208.80.154.142, 2620:0:861:2:208:80:154:142
Connecting to dumps.wikimedia.org (dumps.wikimedia.org)|208.80.154.142|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 433401231 (413M) [application/octet-stream]
Saving to: ‘latest-lexemes.ttl.bz2’
latest-lexemes.ttl. 9%[> ] 38.66M 4.19MB/s eta 1m 53
latest-lexemes.ttl. 100%[===================>] 413.32M 4.25MB/s in 1m 45s
2023-04-28 13:45:57 (3.93 MB/s) - ‘latest-lexemes.ttl.bz2’ saved [433401231/433401231]
Munging ~29 h
- https://github.com/wikimedia/wikidata-query-rdf/blob/master/docs/getting-started.md
- https://github.com/wikimedia/wikidata-query-deploy/blob/master/munge.sh
Preparation ~20-30 min
Needed installs and settings
sudo apt install openjdk-11-jdk-headless
sudo apt install maven
export JAVA_HOME=$(update-alternatives --query javadoc | grep Value: | head -n1 | sed 's/Value: //' | sed 's@bin/javadoc$@@')
echo $JAVA_HOME
/usr/lib/jvm/java-11-openjdk-amd64/
clone and package
Had to start mvn package twice: on the first try javadoc was not available because JAVA_HOME was not set.
git clone https://gerrit.wikimedia.org/r/wikidata/query/rdf wikidata-query-rdf
Cloning into 'wikidata-query-rdf'...
remote: Counting objects: 111, done
remote: Total 26684 (delta 0), reused 26684 (delta 0)
Receiving objects: 100% (26684/26684), 4.84 MiB | 3.23 MiB/s, done.
Resolving deltas: 100% (13928/13928), done.
cd wikidata-query-rdf/
mvn package
[INFO] Building jar: /home/wf/wikidata-query-rdf/common/target/wikidata-query-common-0.3.124-SNAPSHOT.jar
[INFO]
[INFO] --- maven-javadoc-plugin:3.2.0:jar (attach-javadocs) @ common ---
[INFO] ------------------------------------------------------------------------
[INFO] Reactor Summary for Wikidata Query Service 0.3.124-SNAPSHOT:
[INFO]
[INFO] Wikidata Query Service ............................. SUCCESS [ 19.204 s]
...
[INFO] --- maven-assembly-plugin:3.3.0:single (default) @ service ---
[INFO] Reading assembly descriptor: src/assembly/dist.xml
[INFO] Building tar: /home/wf/wikidata-query-rdf/dist/target/service-0.3.124-SNAPSHOT-dist.tar.gz
[INFO] ------------------------------------------------------------------------
[INFO] Reactor Summary for Wikidata Query Service 0.3.124-SNAPSHOT:
[INFO]
[INFO] Wikidata Query Service ............................. SUCCESS [ 1.904 s]
[INFO] Shared code ........................................ SUCCESS [ 4.798 s]
[INFO] Wikidata Query RDF Testing Tools ................... SUCCESS [ 15.411 s]
[INFO] Jetty logging dependencies ......................... SUCCESS [ 37.577 s]
[INFO] Blazegraph extension to improve performance for Wikibase SUCCESS [03:14 min]
[INFO] Blazegraph Service Package ......................... SUCCESS [ 33.788 s]
[INFO] Wikidata Query RDF Tools ........................... SUCCESS [01:13 min]
[INFO] Wikidata Query Service Streaming Updater - Common .. SUCCESS [ 4.758 s]
[INFO] Wikidata Query Service Streaming Updater - Producer SUCCESS [04:34 min]
[INFO] Wikidata Query Service Streaming Updater - Consumer SUCCESS [ 7.982 s]
[INFO] MediaWiki OAuth 1.0a Proxy Service ................. SUCCESS [ 37.591 s]
[INFO] rdf-spark-tools .................................... SUCCESS [09:44 min]
[INFO] Wikibase RDF Query Service ......................... SUCCESS [ 10.378 s]
[INFO] ------------------------------------------------------------------------
[INFO] BUILD SUCCESS
[INFO] ------------------------------------------------------------------------
[INFO] Total time: 21:21 min
[INFO] Finished at: 2023-04-29T14:05:59+02:00
[INFO] ------------------------------------------------------------------------
check dist
cd dist/target
~/wikidata-query-rdf/dist/target$ tar tvfz service-0.3.124-SNAPSHOT-dist.tar.gz
drwxrwxr-x wf/wf 0 2023-04-29 13:32 service-0.3.124-SNAPSHOT/
-rw-rw-r-- wf/wf 1170 2023-04-29 13:32 service-0.3.124-SNAPSHOT/prefixes-sdc.conf
-rwxrwxr-x wf/wf 1277 2023-04-29 13:32 service-0.3.124-SNAPSHOT/wcqs-data-reload.sh
-rwxrwxr-x wf/wf 2656 2023-04-29 13:32 service-0.3.124-SNAPSHOT/runUpdate.sh
-rwxrwxr-x wf/wf 599 2023-04-29 13:32 service-0.3.124-SNAPSHOT/createNamespace.sh
-rw-rw-r-- wf/wf 1483 2023-04-29 13:32 service-0.3.124-SNAPSHOT/default.properties
-rwxrwxr-x wf/wf 6470 2023-04-29 13:32 service-0.3.124-SNAPSHOT/runBlazegraph.sh
-rwxrwxr-x wf/wf 1345 2023-04-29 13:32 service-0.3.124-SNAPSHOT/loadRestAPI.sh
-rwxrwxr-x wf/wf 490 2023-04-29 13:32 service-0.3.124-SNAPSHOT/forAllCategoryWikis.sh
-rwxrwxr-x wf/wf 857 2023-04-29 13:32 service-0.3.124-SNAPSHOT/munge.sh
-rwxrwxr-x wf/wf 1133 2023-04-29 13:32 service-0.3.124-SNAPSHOT/loadCategoryDaily.sh
-rwxrwxr-x wf/wf 882 2023-04-29 13:32 service-0.3.124-SNAPSHOT/loadData.sh
-rw-rw-r-- wf/wf 3412 2023-04-29 13:32 service-0.3.124-SNAPSHOT/RWStore.properties
-rw-rw-r-- wf/wf 315 2023-04-29 13:32 service-0.3.124-SNAPSHOT/prefixes.conf
-rw-rw-r-- wf/wf 2202 2023-04-29 13:32 service-0.3.124-SNAPSHOT/ldf-config.json
-rwxrwxr-x wf/wf 949 2023-04-29 13:32 service-0.3.124-SNAPSHOT/loadCategoryDump.sh
-rwxrwxr-x wf/wf 181 2023-04-29 13:32 service-0.3.124-SNAPSHOT/summarizeEvents.sh
-rw-rw-r-- wf/wf 2307 2023-04-29 13:32 service-0.3.124-SNAPSHOT/mwservices.json
-rwxrwxr-x wf/wf 2167 2023-04-29 13:32 service-0.3.124-SNAPSHOT/runStreamingUpdater.sh
-rw-rw-r-- wf/wf 20669767 2023-04-29 13:50 service-0.3.124-SNAPSHOT/lib/wikidata-query-tools-0.3.124-SNAPSHOT-jar-with-dependencies.jar
-rw-rw-r-- wf/wf 20713599 2023-04-29 13:55 service-0.3.124-SNAPSHOT/lib/streaming-updater-consumer-0.3.124-SNAPSHOT-jar-with-dependencies.jar
-rw-rw-r-- wf/wf 34014443 2023-04-29 13:55 service-0.3.124-SNAPSHOT/lib/streaming-updater-producer-0.3.124-SNAPSHOT-jar-with-dependencies.jar
-rw-rw-r-- wf/wf 6143989 2023-04-29 13:45 service-0.3.124-SNAPSHOT/lib/logging/jetty-logging-0.3.124-SNAPSHOT-jar-with-dependencies.jar
drwxrwxr-x wf/wf 0 2023-04-29 13:32 service-0.3.124-SNAPSHOT/docs/
drwxrwxr-x wf/wf 0 2023-04-29 13:32 service-0.3.124-SNAPSHOT/docs/diagrams/
-rw-rw-r-- wf/wf 11435 2023-04-29 13:32 service-0.3.124-SNAPSHOT/docs/sparql-query-examples.md
-rw-rw-r-- wf/wf 9803 2023-04-29 13:32 service-0.3.124-SNAPSHOT/docs/exploring-linked-data.md
-rw-rw-r-- wf/wf 11358 2023-04-29 13:32 service-0.3.124-SNAPSHOT/docs/LICENSE.Apache
-rw-rw-r-- wf/wf 17986 2023-04-29 13:32 service-0.3.124-SNAPSHOT/docs/LICENSE.GPL
-rw-rw-r-- wf/wf 877 2023-04-29 13:32 service-0.3.124-SNAPSHOT/docs/diagrams/streaming-updater-components.puml
-rw-rw-r-- wf/wf 740 2023-04-29 13:32 service-0.3.124-SNAPSHOT/docs/diagrams/streaming-updater-sequence.puml
-rw-rw-r-- wf/wf 1086 2023-04-29 13:32 service-0.3.124-SNAPSHOT/docs/diagrams/wdqs-high-level.puml
-rw-rw-r-- wf/wf 1556 2023-04-29 13:32 service-0.3.124-SNAPSHOT/docs/diagrams/streaming-updater-deployment.puml
-rw-rw-r-- wf/wf 3014 2023-04-29 13:32 service-0.3.124-SNAPSHOT/docs/getting-started.md
-rw-rw-r-- wf/wf 341 2023-04-29 13:32 service-0.3.124-SNAPSHOT/docs/TODO.md
-rw-rw-r-- wf/wf 1546 2023-04-29 13:32 service-0.3.124-SNAPSHOT/docs/configs.md
-rw-rw-r-- wf/wf 476 2023-04-29 13:32 service-0.3.124-SNAPSHOT/docs/Categories.md
-rw-rw-r-- wf/wf 79416954 2023-04-29 13:49 service-0.3.124-SNAPSHOT/blazegraph-service-0.3.124-SNAPSHOT.war
-rw-rw-r-- wf/wf 9809699 2023-04-29 13:56 service-0.3.124-SNAPSHOT/mw-oauth-proxy-0.3.124-SNAPSHOT.war
-rw-rw-r-- wf/wf 7074499 2023-04-29 13:40 service-0.3.124-SNAPSHOT/jetty-runner-9.4.12.v20180830.jar
Unpack and make available via symlink
tar xvfz service-0.3.124-SNAPSHOT-dist.tar.gz
# in target directory
ln -s /home/wf/wikidata-query-rdf/dist/target/service-0.3.124-SNAPSHOT service
calling munge.sh
domunge.sh
#!/bin/bash
# WF 2023-04-29
# decompress the complete turtle dump and stream it through munge.sh,
# which writes the munged files to the data directory
# (meant to be launched via nohup ... & so that it runs in the background)
bzcat latest-all.ttl.bz2 \
  | service/munge.sh -f - -d data -- --skolemize
start domunge.sh and show nohup.out log
nohup ./domunge.sh &
tail -f nohup.out
#logback.classic pattern: %d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n
14:23:31.529 [org.wikidata.query.rdf.tool.rdf.AsyncRDFHandler$RDFActionsReplayer] INFO org.wikidata.query.rdf.tool.Munge - Switching to data/wikidump-000000001.ttl.gz
14:24:17.795 [org.wikidata.query.rdf.tool.rdf.AsyncRDFHandler$RDFActionsReplayer] INFO o.w.q.r.t.r.EntityMungingRdfHandler - Processed 10000 entities at (154, 93, 79)
...
17:10:05.507 [org.wikidata.query.rdf.tool.rdf.AsyncRDFHandler$RDFActionsReplayer] INFO o.w.q.r.t.r.EntityMungingRdfHandler - Processed 10000000 entities at (936, 919, 1011)
...
19:56:14.353 [org.wikidata.query.rdf.tool.rdf.AsyncRDFHandler$RDFActionsReplayer] INFO o.w.q.r.t.r.EntityMungingRdfHandler - Processed 20000000 entities at (1191, 1097, 979)
...
22:21:33.096 [org.wikidata.query.rdf.tool.rdf.AsyncRDFHandler$RDFActionsReplayer] INFO o.w.q.r.t.r.EntityMungingRdfHandler - Processed 30000000 entities at (1228, 1136, 1221)
...
01:10:14.808 [org.wikidata.query.rdf.tool.rdf.AsyncRDFHandler$RDFActionsReplayer] INFO o.w.q.r.t.r.EntityMungingRdfHandler - Processed 40000000 entities at (766, 764, 988)
...
03:55:33.809 [org.wikidata.query.rdf.tool.rdf.AsyncRDFHandler$RDFActionsReplayer] INFO o.w.q.r.t.r.EntityMungingRdfHandler - Processed 50000000 entities at (786, 779, 893)
...
06:38:44.146 [org.wikidata.query.rdf.tool.rdf.AsyncRDFHandler$RDFActionsReplayer] INFO o.w.q.r.t.r.EntityMungingRdfHandler - Processed 60000000 entities at (1343, 1272, 1081)
...
09:07:55.295 [org.wikidata.query.rdf.tool.rdf.AsyncRDFHandler$RDFActionsReplayer] INFO o.w.q.r.t.r.EntityMungingRdfHandler - Processed 70000000 entities at (527, 804, 1053)
...
11:56:15.113 [org.wikidata.query.rdf.tool.rdf.AsyncRDFHandler$RDFActionsReplayer] INFO o.w.q.r.t.r.EntityMungingRdfHandler - Processed 80000000 entities at (1103, 944, 993)
...
14:40:23.875 [org.wikidata.query.rdf.tool.rdf.AsyncRDFHandler$RDFActionsReplayer] INFO o.w.q.r.t.r.EntityMungingRdfHandler - Processed 90000000 entities at (903, 850, 889)
...
17:19:18.387 [org.wikidata.query.rdf.tool.rdf.AsyncRDFHandler$RDFActionsReplayer] INFO o.w.q.r.t.r.EntityMungingRdfHandler - Processed 100000000 entities at (1416, 1392, 1182)
...
18:42:24.907 [org.wikidata.query.rdf.tool.rdf.AsyncRDFHandler$RDFActionsReplayer] INFO o.w.q.r.t.r.EntityMungingRdfHandler - Processed 105730000 entities at (2206, 1688, 1409)
check created munge files
grep gz nohup.out | cut -f9 -d" " | tail -5
data/wikidump-000001054.ttl.gz
data/wikidump-000001055.ttl.gz
data/wikidump-000001056.ttl.gz
data/wikidump-000001057.ttl.gz
data/wikidump-000001058.ttl.gz
du -sm data/split/
109978 data/split/
inspect a sample file
zcat data/wikidump-000000702.ttl.gz | head -40
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .
@prefix ontolex: <http://www.w3.org/ns/lemon/ontolex#> .
@prefix dct: <http://purl.org/dc/terms/> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix owl: <http://www.w3.org/2002/07/owl#> .
@prefix wikibase: <http://wikiba.se/ontology#> .
@prefix skos: <http://www.w3.org/2004/02/skos/core#> .
@prefix schema: <http://schema.org/> .
@prefix cc: <http://creativecommons.org/ns#> .
@prefix geo: <http://www.opengis.net/ont/geosparql#> .
@prefix prov: <http://www.w3.org/ns/prov#> .
@prefix wd: <http://www.wikidata.org/entity/> .
@prefix data: <https://www.wikidata.org/wiki/Special:EntityData/> .
@prefix s: <http://www.wikidata.org/entity/statement/> .
@prefix ref: <http://www.wikidata.org/reference/> .
@prefix v: <http://www.wikidata.org/value/> .
@prefix wdt: <http://www.wikidata.org/prop/direct/> .
@prefix wdtn: <http://www.wikidata.org/prop/direct-normalized/> .
@prefix p: <http://www.wikidata.org/prop/> .
@prefix ps: <http://www.wikidata.org/prop/statement/> .
@prefix psv: <http://www.wikidata.org/prop/statement/value/> .
@prefix psn: <http://www.wikidata.org/prop/statement/value-normalized/> .
@prefix pq: <http://www.wikidata.org/prop/qualifier/> .
@prefix pqv: <http://www.wikidata.org/prop/qualifier/value/> .
@prefix pqn: <http://www.wikidata.org/prop/qualifier/value-normalized/> .
@prefix pr: <http://www.wikidata.org/prop/reference/> .
@prefix prv: <http://www.wikidata.org/prop/reference/value/> .
@prefix prn: <http://www.wikidata.org/prop/reference/value-normalized/> .
@prefix wdno: <http://www.wikidata.org/prop/novalue/> .
<https://ceb.wikipedia.org/wiki/R%C3%ADo_Pitara> a schema:Article ;
schema:about wd:Q35416151 ;
schema:inLanguage "ceb" ;
schema:isPartOf <https://ceb.wikipedia.org/> ;
schema:name "Río Pitara"@ceb .
wd:Q35416151 wdt:P625 "Point(-67.2225 9.3544444444444)"^^geo:wktLiteral ;
wdt:P31 wd:Q47521 ;
wdt:P1566 "3630072" ;
see https://www.wikidata.org/wiki/Q35416151
Loading
move files to split directory
We didn't quite follow the getting-started guide, so the location of the munged files has to be fixed:
mkdir split
wikidata/data$ mv wiki* split
prepare log directory
sudo mkdir -p /var/log/wdqs/
sudo chown $(id -un) /var/log/wdqs/
mv service to TB harddisk
mv service s
mkdir service
mv s/* service
rm s
start blazegraph
nohup service/runBlazegraph.sh 2>&1 > blazegraph.log&
loadall.sh
This is the script I used in 2018:
#!/usr/bin/env bash
# load all data
#
# Loads the munged split files wikidump-<9 digit number>.ttl.gz one by one
# into the wdq namespace by calling loadRestAPI.sh from this script's directory.
#
# Options:
#   -s <start>      number of the first file to load (default: 1)
#   -e <end>        number of the last file to load (default: 100000)
#   -d <directory>  directory holding the split files
#                   (default: data/split below the directory the script is
#                   called from; a relative -d value is resolved after the
#                   cd into the script directory)
#   -h              show usage
START=1
END=100000
FORMAT=wikidump-%09d.ttl.gz
LOCATION=$(pwd)/data/split
BASE=$(dirname "$0")
# loadRestAPI.sh is called relative to the script directory below
cd "$BASE" || { echo "cannot change to directory $BASE" >&2; exit 1; }

usage() {
  echo "Usage: $0 [-s <start>] [-e <end>] [-d <directory>] [-h]"
}

while getopts s:e:d:h option
do
  case "${option}" in
    s) START=${OPTARG};;
    e) END=${OPTARG};;
    d) LOCATION=${OPTARG};;
    h)
      usage
      exit 1
      ;;
    *)
      # unknown option or missing argument: getopts has already complained
      usage >&2
      exit 1
      ;;
  esac
done

i=$START
while [ "$i" -le "$END" ]; do
  printf -v f "$FORMAT" "$i"
  # a <file>.good marker means the file was loaded before
  # (presumably created by loadRestAPI.sh after a successful load - TODO confirm)
  if [ -f "$LOCATION/$f.good" ]
  then
    echo "File $LOCATION/$f already imported"
  else
    if [ ! -f "$LOCATION/$f" ]
    then
      # with the default END this is the regular way the loop ends,
      # therefore the exit code is 0
      echo "File $LOCATION/$f not found, terminating"
      exit 0
    else
      ts=$(date -Iseconds)
      echo "Processing $f at $ts"
      ./loadRestAPI.sh -n wdq -d "$LOCATION/$f"
    fi
  fi
  i=$((i + 1))
done
load
nohup service/loadRestAPI.sh -n wdq -d `pwd`/data/split&
Logfile issue
The logfile size grows far too quickly with the default log settings.
For testing I reactivated the 128 GB RAM machine I used for the QLever tests last year.
Download munge files
cat data/split/getall
#!/bin/bash
# WF 2023-05-02
# download the munged split files wikidump-000000001.ttl.gz ..
# wikidump-000001058.ttl.gz into the current directory,
# skipping files that are already there
base=http://wikidata.dbis.rwth-aachen.de/downloads/split/
for i in {0001..1058}
do
  file=wikidump-00000$i.ttl.gz
  # strip the trailing slash of base to avoid a double slash in the URL
  url=${base%/}/$file
  if [ ! -f "$file" ]
  then
    # download under a temporary name: an interrupted transfer must not
    # leave a truncated file that the next run would report as complete
    if wget -O "$file.part" "$url"
    then
      mv -- "$file.part" "$file"
    else
      echo "$file ❌ download failed" >&2
      rm -f -- "$file.part"
    fi
  else
    echo "$file ✅"
  fi
done
Run load for a single file
service/loadall.sh -s 1 -e 1
Processing wikidump-000000001.ttl.gz at 2023-05-02T18:31:50+02:00
Loading with properties...
quiet=false
verbose=0
closure=false
durableQueues=true
#Needed for quads
#defaultGraph=
com.bigdata.rdf.store.DataLoader.flush=false
com.bigdata.rdf.store.DataLoader.bufferCapacity=100000
com.bigdata.rdf.store.DataLoader.queueCapacity=10
#Namespace to load
namespace=wdq
#Files to load
fileOrDirs=/hd/seel/wikidata/data/split/wikidump-000000001.ttl.gz
#Property file (if creating a new namespace)
propertyFile=/hd/seel/wikidata/service/RWStore.properties
Check size of log file
The log file grows at about 26 MByte/s - that is far too much, especially since this smaller 128 GB test machine only has a 2 TB SSD in the first place.
date;du -sm blazegraph.log;
Di 2. Mai 18:37:00 CEST 2023
9603 blazegraph.log
date;du -sm blazegraph.log;
Di 2. Mai 18:37:02 CEST 2023
9637 blazegraph.log
date;du -sm blazegraph.log;
Di 2. Mai 18:37:51 CEST 2023
10951 blazegraph.log
Logback.xml candidates
Generated from ERB template
generated from https://github.com/wikimedia/operations-puppet/blob/production/modules/query_service/templates/logback.xml.erb:
logback.xml
<?xml version="1.0" encoding="UTF-8"?>
<configuration scan="true" scanPeriod="5 minutes" packagingData="false">
<!-- ugly trick to ensure ${HOSTNAME} is evaluated -->
<property scope="context" name="hostname" value="${HOSTNAME}" />
<!--
File based logs:
* rolling every day or when size > 100MB
-->
<appender name="file" class="ch.qos.logback.core.rolling.RollingFileAppender">
<file>PATH/TO/LOGS/rdf-query-service.log</file>
<rollingPolicy class="ch.qos.logback.core.rolling.SizeAndTimeBasedRollingPolicy">
<!-- daily rollover -->
<fileNamePattern>PATH/TO/LOGS/rdf-query-service.%d{yyyy-MM-dd}.%i.log.gz</fileNamePattern>
<maxFileSize>100MB</maxFileSize>
<maxHistory>30</maxHistory>
</rollingPolicy>
<filter class="org.wikidata.query.rdf.common.log.PerLoggerThrottler" />
<encoder class="ch.qos.logback.classic.encoder.PatternLayoutEncoder">
<pattern>%d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg %mdc%n%rEx{1,QUERY_TIMEOUT,SYNTAX_ERROR}</pattern>
<outputPatternAsHeader>true</outputPatternAsHeader>
</encoder>
</appender>
<appender name="async-file" class="ch.qos.logback.classic.AsyncAppender">
<neverBlock>true</neverBlock>
<appender-ref ref="file" />
</appender>
<!--
Console based logs:
* per logger / message throttling is enabled
* limited to 10 messages per second
* level => ERROR
-->
<appender name="stdout" class="ch.qos.logback.core.ConsoleAppender">
<filter class="org.wikidata.query.rdf.common.log.PerLoggerThrottler" />
<filter class="org.wikidata.query.rdf.common.log.RateLimitFilter">
<bucketCapacity>10</bucketCapacity>
<refillIntervalInMillis>1000</refillIntervalInMillis>
</filter>
<filter class="ch.qos.logback.classic.filter.ThresholdFilter">
<level>error</level>
</filter>
<encoder class="ch.qos.logback.classic.encoder.PatternLayoutEncoder">
<pattern>%d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg %mdc%n%rEx{1,QUERY_TIMEOUT,SYNTAX_ERROR}</pattern>
<outputPatternAsHeader>true</outputPatternAsHeader>
</encoder>
</appender>
<appender name="async-stdout" class="ch.qos.logback.classic.AsyncAppender">
<neverBlock>true</neverBlock>
<appender-ref ref="stdout" />
</appender>
<root level="info">
<appender-ref ref="async-file"/>
<appender-ref ref="async-stdout"/>
</root>
<logger name="org.wikidata.query.rdf" level="info"/>
<logger name="org.wikidata.query.rdf.blazegraph.inline.literal.AbstractMultiTypeExtension" level="error"/>
<logger name="com.bigdata" level="warn"/>
<logger name="com.bigdata.util.concurrent.Haltable" level="off"/>
<logger name="com.bigdata.rdf.internal.LexiconConfiguration" level="off"/> <!-- disabled temp. ref: T207643 -->
</configuration>
Sample configuration
sample from https://logback.qos.ch/manual/configuration.html:
<configuration>
<appender name="STDOUT" class="ch.qos.logback.core.ConsoleAppender">
<!-- encoders are assigned the type
ch.qos.logback.classic.encoder.PatternLayoutEncoder by default -->
<encoder>
<pattern>%d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} -%kvp- %msg%n</pattern>
</encoder>
</appender>
<root level="debug">
<appender-ref ref="STDOUT" />
</root>
</configuration>
retries
with simple logback.xml according to the sample configuration above
export LOG_CONFIG=/hd/seel/wikidata/logback.xml
nohup service/runBlazegraph.sh 2>&1 > blazegraph.log&
service/loadall.sh -s 1 -e 1
date;du -sm blazegraph.log
Di 2. Mai 19:15:40 CEST 2023
1630 blazegraph.log
du -sm blazegraph.log
Di 2. Mai 19:15:42 CEST 2023
1654 blazegraph.log
with pasted logback.xml
The disk below /hd is /hd/seel on host sun and /hd/eneco on the dbis wikidata host.
pgrep -fl java
66671 java
kill 66671
rm service/wikidata.jnl
wget https://phab.wmfusercontent.org/file/data/cz4q6ew7kksba6k5ocyx/PHID-FILE-eevhah5inj6jl2sy3nas/basic_rdf_query_service_logback.xml
export LOG_CONFIG=/hd/eneco/wikidata/basic_rdf_query_service_logback.xml
ls -l $(echo $LOG_CONFIG)
-rw-rw-r-- 1 wf wf 3030 May 3 09:23 /hd/eneco/wikidata/basic_rdf_query_service_logback.xml
nohup service/runBlazegraph.sh 2>&1 > blazegraph.log&
wf@sun:/hd/seel/wikidata/data/split$ mv wikidump-000000001.ttl.gz.fail wikidump-000000001.ttl.gz
service/loadall.sh -s 1 -e 1
# in another terminal:
...
date;du -sm service/wikidata.jnl
Di 2. Mai 19:28:12 CEST 2023
1530 service/wikidata.jnl
date;du -sm service/wikidata.jnl
Di 2. Mai 19:28:15 CEST 2023
1557 service/wikidata.jnl
<?xml version="1.0"?><data modified="0" milliseconds="483615"/>
wikidump-000000001.ttl.gz.good
with fixed logback.xml
check fix
diff logback.xml basic_rdf_query_service_logback.xml
12c12
< <file>/var/log/wdqs/rdf-query-service.log</file>
---
> <file>PATH/TO/LOGS/rdf-query-service.log</file>
15c15
< <fileNamePattern>/var/log/wdqs/rdf-query-service.%d{yyyy-MM-dd}.%i.log.gz</fileNamePattern>
---
> <fileNamePattern>PATH/TO/LOGS/rdf-query-service.%d{yyyy-MM-dd}.%i.log.gz</fileNamePattern>
restart blazegraph
pgrep -fla blazegraph-service
pkill -f blazegraph-service
rm blazegraph.log
export LOG_CONFIG=/hd/eneco/wikidata/logback.xml
nohup service/runBlazegraph.sh 2>&1 > blazegraph.log&
ls -l service/wikidata.jnl
-rw-rw-r-- 1 wf wf 209715200 May 3 09:41 service/wikidata.jnl
Progress
Count triples in blazegraph
sparql query
SELECT ( COUNT( * ) AS ?count ) { ?s ?p ?o }
queries.yaml
in $HOME/.pylodstorage
'triplecount':
sparql: |
SELECT ( COUNT( * ) AS ?count ) { ?s ?p ?o }
endpoints.yaml
# SPARQL endpoints for sparqlquery tool
# 2023-05-03 WF
'wdimport':
endpoint: http://localhost:9999/bigdata/namespace/wdq/sparql
website: http://blazegraph.wikidata.dbis.rwth-aachen.de
database: blazegraph
lang: sparql
prefixes: |
PREFIX bd: <http://www.bigdata.com/rdf#>
PREFIX cc: <http://creativecommons.org/ns#>
PREFIX dct: <http://purl.org/dc/terms/>
PREFIX geo: <http://www.opengis.net/ont/geosparql#>
PREFIX ontolex: <http://www.w3.org/ns/lemon/ontolex#>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX p: <http://www.wikidata.org/prop/>
PREFIX pq: <http://www.wikidata.org/prop/qualifier/>
PREFIX pqn: <http://www.wikidata.org/prop/qualifier/value-normalized/>
PREFIX pqv: <http://www.wikidata.org/prop/qualifier/value/>
PREFIX pr: <http://www.wikidata.org/prop/reference/>
PREFIX prn: <http://www.wikidata.org/prop/reference/value-normalized/>
PREFIX prov: <http://www.w3.org/ns/prov#>
PREFIX prv: <http://www.wikidata.org/prop/reference/value/>
PREFIX ps: <http://www.wikidata.org/prop/statement/>
PREFIX psn: <http://www.wikidata.org/prop/statement/value-normalized/>
PREFIX psv: <http://www.wikidata.org/prop/statement/value/>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX schema: <http://schema.org/>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
PREFIX wd: <http://www.wikidata.org/entity/>
PREFIX wdata: <http://www.wikidata.org/wiki/Special:EntityData/>
PREFIX wdno: <http://www.wikidata.org/prop/novalue/>
PREFIX wdref: <http://www.wikidata.org/reference/>
PREFIX wds: <http://www.wikidata.org/entity/statement/>
PREFIX wdt: <http://www.wikidata.org/prop/direct/>
PREFIX wdtn: <http://www.wikidata.org/prop/direct-normalized/>
PREFIX wdv: <http://www.wikidata.org/value/>
PREFIX wikibase: <http://wikiba.se/ontology#>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
test sparqlquery command line tool
sparqlquery -qn triplecount -en wdimport -f json
[
{
"count": 457000496
}
]
stats / ETA script
using https://pypi.org/project/pylodstorage/ sparqlquery command line
#!/bin/bash
# WF 2023-05-03
# statistics for wikidata loading
#
# Usage: ./stats [total_files]
#   total_files  number of split files to be loaded (default: 1058)
#
# Reads loadall.log and nohup.out from the current directory and asks the
# wdimport endpoint for the current triple count via sparqlquery and jq.

#
# Print one statistics row per loaded file followed by a summary.
# Arguments: $1 - ISO timestamp shown in the summary
#            $2 - total number of files to load
#            $3 - current triple count
# Input:     loader log on stdin
# Outputs:   table and summary on stdout
#
load_stats() {
  awk -v isodate="$1" -v total_files="$2" -v triples="$3" '
BEGIN {
  FS="="
  printf(" #: load s total s avg s ETA h\n");
}
# property line naming the file that is about to be loaded
/fileOrDirs=/ {
  name=$2
  # drop the directory part so that digits in the path
  # are not mistaken for the file number
  sub(/.*\//, "", name)
  filenum=get_number(name)+0
  next
}
# result line of a load - with FS="=" the fourth field holds the milliseconds
/<data modified/ {
  msecs=get_number($4)
  sec=msecs/1000
  totals+=sec
  # without a known file number there is no average to compute
  # (avoids a division by zero)
  if (filenum > 0) {
    avg=totals/filenum
    eta=(total_files-filenum)*avg/3600
    printf("%4d: %6d %8d %6.0f %8.1f\n",filenum,sec,totals,avg,eta);
  }
}
# return the first run of digits in num_str or 0 if there is none
function get_number(num_str) {
  if (match(num_str, /[0-9]+/)) {
    return substr(num_str, RSTART, RLENGTH)
  }
  return 0
}
END {
  printf("%s:%d\n",isodate,triples)
  # no load time seen yet: report a rate of 0 instead of dividing by zero
  rate=(totals > 0) ? triples/totals : 0
  printf("%6.3f bill triples %6.0f triples/s\n",triples/1000000000,rate)
}
'
}

#
# Collect timestamp and triple count and print the statistics
# for the loader logs.
# Arguments: $1 - optional total number of files (default 1058)
#
main() {
  local total_files=${1:-1058}
  local isodate triples
  isodate=$(date -u +"%Y-%m-%dT%H:%M:%SZ")
  triples=$(sparqlquery -en wdimport -qn triplecount -f json | jq ".[].count")
  # both logs are streamed directly - no temporary file is needed
  cat loadall.log nohup.out | load_stats "$isodate" "$total_files" "$triples"
}

main "$@"
progress on wikidata dbis
Triples:
2023-05-03T09:38:00Z: 288488184 2023-05-03T11:22:52Z: 476800190 2023-05-03T16:17:41Z: 874762589 2023-05-04T05:51:17Z:1566635203 2023-05-05T04:48:47Z:2320966288 2023-05-06T05:20:33Z:2921548912 2023-05-07T06:59:52Z:3406082541 2023-05-08T03:32:44Z:3750926849 2023-05-09T04:44:11Z:4004639109 2023-05-11T04:01:02Z:439241844 recent: 2278 triples/s ETA: 52.5 days
./stats
#: load s total s avg s ETA h
1: 331 331 331 97.2
2: 355 686 343 100.7
3: 394 1081 360 105.6
4: 445 1526 382 111.8
5: 404 1931 386 113.0
6: 387 2319 387 113.0
7: 401 2720 389 113.5
8: 426 3147 393 114.7
9: 435 3582 398 116.0
10: 200 3782 378 110.1
11: 201 3984 362 105.4
12: 213 4198 350 101.7
13: 318 4517 347 100.9
14: 278 4795 343 99.3
15: 232 5028 335 97.1
16: 160 5188 324 93.9
17: 372 5560 327 94.6
18: 425 5986 333 96.1
19: 421 6407 337 97.3
20: 359 6767 338 97.6
21: 260 7027 335 96.4
22: 311 7339 334 96.0
23: 308 7647 333 95.6
24: 290 7938 331 95.0
25: 375 8314 333 95.4
26: 274 8589 330 94.7
27: 266 8855 328 93.9
28: 432 9287 332 94.9
29: 275 9563 330 94.3
30: 210 9773 326 93.0
31: 549 10323 333 95.0
32: 629 10952 342 97.5
33: 575 11527 349 99.5
34: 429 11957 352 100.0
35: 627 12584 360 102.2
36: 317 12902 358 101.7
37: 180 13082 354 100.3
...
54: 1060 30615 567 158.1
...
106: 2458 77958 735 194.5
...
170: 929 160254 943 232.5
...
200: 1756 250171 1251 298.1
...
239: 3384 339178 1419 322.9
...
291: 3704 503140 1729 368.4
...
311: 8835 673480 2166 449.3