Wikidata Import 2023-05-10

Import

state  ❌
url  https://wiki.bitplan.com/index.php/Wikidata_Import_2023-05-10
target  blazegraph
storemode  property
start  2023-05-10
end  
days  
os  Ubuntu 22.04.2 LTS
cpu  Intel(R) Xeon(R) CPU X5690@3.47GHz
ram  64 GB
triples  14.7 billion
comment  aborted: SSD access under VMware Fusion 11.1.1 was too slow


Download

Download Options

https://dumps.wikimedia.org/wikidatawiki/entities

dcatap.rdf                                         06-May-2023 02:08               84753
latest-all.json.bz2                                03-May-2023 21:06         81640390615
latest-all.json.gz                                 10-May-2023 13:49        124070020402
latest-all.nt.bz2                                  04-May-2023 16:07        158382342866
latest-all.nt.gz                                   03-May-2023 22:23        205171447838
latest-all.ttl.bz2                                 04-May-2023 03:24        101606862077
latest-all.ttl.gz                                  03-May-2023 17:08        124093922794
latest-lexemes.json.bz2                            10-May-2023 03:57           306901617
latest-lexemes.json.gz                             10-May-2023 03:55           418171562
latest-lexemes.nt.bz2                              05-May-2023 23:36           793805750
latest-lexemes.nt.gz                               05-May-2023 23:30          1035632811
latest-lexemes.ttl.bz2                             05-May-2023 23:31           450346788
latest-lexemes.ttl.gz                              05-May-2023 23:27           559471601
latest-truthy.nt.bz2                               06-May-2023 01:38         36065028020
latest-truthy.nt.gz                                05-May-2023 22:20         59829390689

download script

cat download.sh 
#!/bin/bash
# WF 2023-04-26
# download wikidata dumps
baseurl=https://dumps.wikimedia.org/wikidatawiki/entities/
for file in latest-all latest-lexemes
do
  for ext in ttl.bz2
  do
    url=$baseurl/$file.$ext
    log=$file-$ext.log
    nohup wget $url >> $log&
  done
done
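The script can simply be made executable and started; since every wget runs under nohup, the shell can be closed afterwards. A minimal way to kick it off and follow the per-file logs (the second log name is derived from the $file-$ext.log pattern in the script and is an assumption):

chmod +x download.sh
./download.sh
# follow the logs written by the script; tail prints a "==> file <==" header per log
tail -f latest-all-ttl.bz2.log latest-lexemes-ttl.bz2.log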

Download logs

==> latest-all-ttl.bz2.log <==
--2023-05-10 11:26:56--  https://dumps.wikimedia.org/wikidatawiki/entities//latest-all.ttl.bz2
Resolving dumps.wikimedia.org (dumps.wikimedia.org)... 2620:0:861:2:208:80:154:142, 208.80.154.142
Connecting to dumps.wikimedia.org (dumps.wikimedia.org)|2620:0:861:2:208:80:154:142|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 101606862077 (95G) [application/octet-stream]
Saving to: ‘latest-all.ttl.bz2’

     0K .......... .......... .......... .......... ..........  0%  456K 2d12h
    50K .......... .......... .......... .......... ..........  0%  473K 2d11h
   100K .......... .......... .......... .......... ..........  0% 6,85M 40h50m

99225150K .......... .......... .......... .......... .......... 99% 3,32M 0s
99225200K .......... .......... .......... .......... .......... 99% 3,94M 0s
99225250K .......... .......... .......... .......... .......... 99% 7,52M 0s
99225300K .......... .......... .......... .......... .......... 99% 3,75M 0s
99225350K .......... .......... .......... .......... .......... 99% 3,64M 0s
99225400K .......... .......... .......... .......... .......... 99% 4,09M 0s
99225450K .                                                     100% 2,32T=6h14m

2023-05-10 17:41:25 (4,31 MB/s) - ‘latest-all.ttl.bz2’ saved [101606862077/101606862077]
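Before munging it is worth confirming the dump is complete. A simple sketch is to compare the local file size with the size from the dumps.wikimedia.org listing above (wget already reported the matching 101606862077 bytes in its final log line):

# expected size taken from the directory listing above
expected=101606862077
actual=$(stat -c %s latest-all.ttl.bz2)
if [ "$actual" -eq "$expected" ]; then
  echo "latest-all.ttl.bz2 is complete ($actual bytes)"
else
  echo "size mismatch: $actual vs $expected bytes"
fi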

Munging ~29 h

Preparation

see Wikidata_Import_2023-04-26#Preparation_.7E20-30_min

tar xvfz wikidata-query-rdf/dist/target/service-0.3.124-SNAPSHOT-dist.tar.gz 
ln -s service-0.3.124-SNAPSHOT service
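A quick sanity check, not part of the original procedure, is to confirm that the symlinked service directory contains the scripts used in the following steps:

# verify the unpacked distribution is reachable via the symlink
ls -l service/munge.sh service/runBlazegraph.sh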

calling munge.sh

domunge.sh

#!/bin/bash
# WF 2023-04-29
# start munge in background
bzcat latest-all.ttl.bz2 | service/munge.sh -f - -d data -- --skolemize

start domunge.sh and show nohup.out log

nohup ./domunge.sh &
tail -f  nohup.out
#logback.classic pattern: %d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n
19:43:53.292 [org.wikidata.query.rdf.tool.rdf.AsyncRDFHandler$RDFActionsReplayer] INFO  org.wikidata.query.rdf.tool.Munge - Switching to data/wikidump-000000001.ttl.gz
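Munge progress can be followed by counting the chunk files written to the data directory (the -d data option of munge.sh above); a rough sketch:

# count finished munge chunks and show the most recent one
ls data/wikidump-*.ttl.gz | wc -l
ls -t data/wikidump-*.ttl.gz | head -1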

Loading

prepare log directory

sudo mkdir -p /var/log/wdqs/
sudo chown $(id -un) /var/log/wdqs/

logback.xml

<?xml version="1.0" encoding="UTF-8"?>
<configuration scan="true"  scanPeriod="5 minutes" packagingData="false">

    <!-- ugly trick to ensure ${HOSTNAME} is evaluated -->
    <property scope="context" name="hostname" value="${HOSTNAME}" />

    <!--
        File based logs:
        * rolling every day or when size > 100MB
    -->
    <appender name="file" class="ch.qos.logback.core.rolling.RollingFileAppender">
        <file>/var/log/wdqs//rdf-query-service.log</file>
        <rollingPolicy class="ch.qos.logback.core.rolling.SizeAndTimeBasedRollingPolicy">
            <!-- daily rollover -->
            <fileNamePattern>/var/log/wdqs//rdf-query-service.%d{yyyy-MM-dd}.%i.log.gz</fileNamePattern>
            <maxFileSize>100MB</maxFileSize>
            <maxHistory>30</maxHistory>
        </rollingPolicy>
        <filter class="org.wikidata.query.rdf.common.log.PerLoggerThrottler" />
        <encoder class="ch.qos.logback.classic.encoder.PatternLayoutEncoder">
            <pattern>%d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg %mdc%n%rEx{1,QUERY_TIMEOUT,SYNTAX_ERROR}</pattern>
            <outputPatternAsHeader>true</outputPatternAsHeader>
        </encoder>
    </appender>
    <appender name="async-file" class="ch.qos.logback.classic.AsyncAppender">
        <neverBlock>true</neverBlock>
        <appender-ref ref="file" />
    </appender>

    <!--
        Console based logs:
        * per logger / message throttling is enabled
        * limited to 10 messages per second
        * level => ERROR
    -->
    <appender name="stdout" class="ch.qos.logback.core.ConsoleAppender">
        <filter class="org.wikidata.query.rdf.common.log.PerLoggerThrottler" />
        <filter class="org.wikidata.query.rdf.common.log.RateLimitFilter">
            <bucketCapacity>10</bucketCapacity>
            <refillIntervalInMillis>1000</refillIntervalInMillis>
        </filter>
        <filter class="ch.qos.logback.classic.filter.ThresholdFilter">
            <level>error</level>
        </filter>
        <encoder class="ch.qos.logback.classic.encoder.PatternLayoutEncoder">
            <pattern>%d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg %mdc%n%rEx{1,QUERY_TIMEOUT,SYNTAX_ERROR}</pattern>
            <outputPatternAsHeader>true</outputPatternAsHeader>
        </encoder>
    </appender>
    <appender name="async-stdout" class="ch.qos.logback.classic.AsyncAppender">
        <neverBlock>true</neverBlock>
        <appender-ref ref="stdout" />
    </appender>

    <root level="info">
        <appender-ref ref="async-file"/>
        <appender-ref ref="async-stdout"/>
    </root>

    <logger name="org.wikidata.query.rdf" level="info"/>
    <logger name="org.wikidata.query.rdf.blazegraph.inline.literal.AbstractMultiTypeExtension" level="error"/>
    <logger name="com.bigdata" level="warn"/>
    <logger name="com.bigdata.util.concurrent.Haltable" level="off"/>
    <logger name="com.bigdata.rdf.internal.LexiconConfiguration" level="off"/> <!-- disabled temp. ref: T207643 -->

</configuration>
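Before pointing LOG_CONFIG at this file it can be worth checking that the XML is well-formed; a sketch assuming xmllint (libxml2-utils) is installed and the path used further below:

# well-formedness check of the logback configuration
xmllint --noout /hd/tepig/wikidata/logback.xml && echo "logback.xml is well-formed"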

split file move

To allow loading to start while munging is still running, move a few hundred of the chunk files that munge has already finished into the split directory (a repeatable variant is sketched after the commands below):

mv wikidump-0000000* split/
mv wikidump-0000001* split/
mv wikidump-0000002* split/
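The same move can be repeated whenever munging has produced new chunks. A small sketch of a loop that holds back the newest file because it may still be being written; it assumes it is run from inside the munge output directory, like the commands above:

# hold back the newest (possibly still growing) chunk, move the rest
newest=$(ls -t wikidump-*.ttl.gz | head -1)
for f in wikidump-*.ttl.gz; do
  [ "$f" = "$newest" ] || mv "$f" split/
done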

start blazegraph

export LOG_CONFIG=/hd/tepig/wikidata/logback.xml
nohup service/runBlazegraph.sh  2>&1 > blazegraph.log&
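Once Blazegraph is up it can be probed with a simple SPARQL count against the local endpoint; port 9999 and the wdq namespace below are the usual defaults of the WDQS service scripts and are an assumption here:

# quick health check: count the triples loaded so far
curl -s http://localhost:9999/bigdata/namespace/wdq/sparql \
  --data-urlencode 'query=SELECT (COUNT(*) AS ?triples) WHERE { ?s ?p ?o }'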

start loading

Copy loadall.sh from the host sun, then start it in the background (a sketch of what such a script might contain follows below):

nohup service/loadall.sh 2>&1 > load.log&
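loadall.sh itself is not shown on this page (it was copied from sun). Purely as an illustration, its core might be a single call to the stock loadData.sh that ships with the dist, which walks the numbered wikidump-*.ttl.gz chunks in a directory in order; the wdq namespace and the split path are assumptions:

# hypothetical core of loadall.sh: let the stock loadData.sh walk the chunks in split/
service/loadData.sh -n wdq -d /hd/tepig/wikidata/split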

statistics

2023-05-11T11:40:49Z: 397107129 triples loaded

 #:  load s  total s  avg s  ETA h
  1:     659      659    659  193.5
 31:    1218    22206    716  204.4

0.397 billion triples, 17882 triples/s
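For reference, the throughput figure can be reproduced from the timestamped count above: 397,107,129 triples in 22,206 s of load time is about 17,882 triples/s, or roughly 2.7 % of the 14.7 billion triples of the full dump. A small sketch of the arithmetic:

# reproduce the throughput figure from the count and total load time above
triples=397107129
seconds=22206
echo "$((triples / seconds)) triples/s"               # prints 17882
echo "scale=1; 100 * $triples / 14700000000" | bc      # ~2.7 % of the 14.7 billion target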