Difference between revisions of "Truly Tabular RDF/GND"
Jump to navigation
Jump to search
(2 intermediate revisions by the same user not shown) | |||
Line 1: | Line 1: | ||
+ | {{Link|target=Truly Tabular RDF}} | ||
{{GlossaryEntry | {{GlossaryEntry | ||
|responsible=https://www.dnb.de/DE/Professionell/Standardisierung/GND/gnd.html | |responsible=https://www.dnb.de/DE/Professionell/Standardisierung/GND/gnd.html | ||
Line 20: | Line 21: | ||
* https://blog.lobid.org/2018/08/27/openrefine.html | * https://blog.lobid.org/2018/08/27/openrefine.html | ||
== GND property multiplicity == | == GND property multiplicity == | ||
− | + | Based on analysis of https://data.dnb.de/opendata/authorities-kongress_lds.ttl.gz as of 2021-12 | |
{| class="wikitable" | {| class="wikitable" | ||
|+ Property multiplicity | |+ Property multiplicity | ||
Line 122: | Line 123: | ||
| style=" "| 0.0163 | | style=" "| 0.0163 | ||
|} | |} | ||
+ | === performance optimized query of GND event details === | ||
+ | <source lang='sparql'> | ||
+ | # performance optimized query of GND event details | ||
+ | # with aggregated properties as single, count and | separated list column | ||
+ | # WF 2021-12-05 | ||
+ | PREFIX gndi: <https://d-nb.info/gnd> | ||
+ | PREFIX gnd: <https://d-nb.info/standards/elementset/gnd#> | ||
+ | PREFIX gndo: <https://d-nb.info/standards/vocab/gnd/> | ||
+ | PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> | ||
+ | PREFIX owl: <http://www.w3.org/2002/07/owl#> | ||
+ | PREFIX dc: <http://purl.org/dc/terms/> | ||
+ | PREFIX wdrs: <http://www.w3.org/2007/05/powder-s#> | ||
+ | |||
+ | SELECT | ||
+ | ?event | ||
+ | ?eventId | ||
+ | (MIN(?eventTitle) as ?title) | ||
+ | |||
+ | (COUNT (DISTINCT ?eventDate) as ?dateCount) | ||
+ | (MIN(?eventDate) as ?date) | ||
+ | |||
+ | (MIN(?eventAcronym) as ?acronym) | ||
+ | (COUNT (DISTINCT ?eventAcronym) as ?acronymCount) | ||
+ | (GROUP_CONCAT(DISTINCT ?eventAcronym; SEPARATOR="| ") AS ?acronyms) | ||
+ | |||
+ | (MIN(?eventVariant) as ?variant) | ||
+ | (COUNT (DISTINCT ?eventVariant) as ?variantCount) | ||
+ | (GROUP_CONCAT(DISTINCT ?eventVariant; SEPARATOR="| ") AS ?variants) | ||
+ | |||
+ | (MIN(?eventPlace) as ?place) | ||
+ | (COUNT (DISTINCT ?eventPlace) as ?placeCount) | ||
+ | (GROUP_CONCAT(DISTINCT ?eventPlace; SEPARATOR="| ") AS ?places) | ||
+ | |||
+ | (MIN(?eventHomepage) as ?homepage) | ||
+ | WHERE { | ||
+ | ?event a gnd:ConferenceOrEvent. | ||
+ | ?event gnd:gndIdentifier ?eventId. | ||
+ | ?event gnd:preferredNameForTheConferenceOrEvent ?eventTitle. | ||
+ | OPTIONAL { ?event gnd:abbreviatedNameForTheConferenceOrEvent ?eventAcronym. } | ||
+ | OPTIONAL { ?event gnd:homepage ?eventHomepage. } | ||
+ | OPTIONAL { ?event gnd:variantNameForTheConferenceOrEvent ?eventVariant. } | ||
+ | OPTIONAL { ?event gnd:dateOfConferenceOrEvent ?eventDate. } | ||
+ | OPTIONAL { ?event gnd:placeOfConferenceOrEvent ?eventPlace } | ||
+ | # only available 3520 times 2021-12 | ||
+ | # ?event gnd:topic ?topic. | ||
+ | # only available 12106 times 2021-12 | ||
+ | # ?event gnd:precedingConferenceOrEvent ?prec | ||
+ | # only available 11929 times 2021-12 | ||
+ | #?event gnd:succeedingConferenceOrEvent ?succ | ||
+ | } | ||
+ | GROUP BY ?event ?eventId | ||
+ | </source> | ||
=== query to analyze multiplicity === | === query to analyze multiplicity === | ||
<source lang='sparql'> | <source lang='sparql'> | ||
Line 161: | Line 214: | ||
</source> | </source> | ||
− | == | + | == SPARQL Queries == |
− | <source lang=' | + | === entities and usage frequency === |
− | # | + | <source lang='SPARQL'> |
− | + | # get histogramm data of entities by | |
− | + | # usage frequency | |
− | + | # WF 2020-06-27 | |
− | + | PREFIX gnd: <https://d-nb.info/standards/elementset/gnd#> | |
− | |||
− | |||
− | |||
− | |||
− | |||
− | + | SELECT ?c (COUNT(?c) AS ?count) | |
− | + | WHERE { | |
− | + | ?subject a ?c | |
− | + | } | |
+ | GROUP BY ?c | ||
+ | HAVING (?count >100) | ||
+ | ORDER BY DESC(?count) | ||
+ | </source> | ||
+ | {|class="wikitable sortable" | ||
+ | |+ | ||
+ | !c!!count | ||
+ | |- | ||
+ | ||gnd#ConferenceOrEvent||713310 | ||
+ | |- | ||
+ | ||gnd#TerritorialCorporateBodyOrAdministrativeUnit||188246 | ||
+ | |- | ||
+ | ||gnd#SeriesOfConferenceOrEvent||122970 | ||
+ | |- | ||
+ | ||gnd#BuildingOrMemorial||67149 | ||
+ | |- | ||
+ | ||http://www.opengis.net/ont/sf#Point||57987 | ||
+ | |- | ||
+ | ||gnd#PlaceOrGeographicName||27771 | ||
+ | |- | ||
+ | ||gnd#NaturalGeographicUnit||20269 | ||
+ | |- | ||
+ | ||gnd#AdministrativeUnit||12846 | ||
+ | |- | ||
+ | ||gnd#WayBorderOrLine||4971 | ||
+ | |- | ||
+ | ||gnd#ReligiousTerritory||2646 | ||
+ | |- | ||
+ | ||gnd#NameOfSmallGeographicUnitLyingWithinAnotherGeographicUnit||2113 | ||
+ | |- | ||
+ | ||gnd#CorporateBody||559 | ||
+ | |- | ||
+ | ||gnd#MemberState||543 | ||
+ | |- | ||
+ | ||gnd#Country||307 | ||
+ | |- | ||
+ | ||gnd#ExtraterrestrialTerritory||282 | ||
+ | |- | ||
+ | ||gnd#Language||193 | ||
+ | |- | ||
+ | ||gnd#ReligiousCorporateBody||155 | ||
+ | |- | ||
+ | ||gnd#ReligiousAdministrativeUnit||134 | ||
+ | |- | ||
+ | ||gnd#HistoricSingleEventOrEra||128 | ||
+ | |} | ||
− | + | === relevance of fields === | |
− | + | <source lang='SPARQL'> | |
− | + | # get histogramm data of properties by | |
− | + | # usage frequency | |
− | + | # WF 2020-07-12 | |
− | + | PREFIX gnd: <https://d-nb.info/standards/elementset/gnd#> | |
+ | PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> | ||
+ | PREFIX owl: <http://www.w3.org/2002/07/owl#> | ||
+ | PREFIX dc: <http://purl.org/dc/terms/> | ||
+ | PREFIX wdrs: <http://www.w3.org/2007/05/powder-s#> | ||
− | + | SELECT ?property (COUNT(?property) AS ?propTotal) | |
− | + | WHERE { ?s ?property ?o . } | |
− | + | GROUP BY ?property | |
− | + | HAVING (?propTotal >1000) | |
− | + | ORDER BY DESC(?propTotal) | |
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
</source> | </source> | ||
− | = | + | {|class="wikitable sortable" |
− | + | |+ | |
− | + | !property!!propTotal | |
− | + | |- | |
− | + | ||http://www.w3.org/2002/07/owl#sameAs||1359115 | |
− | + | |- | |
− | + | ||http://www.w3.org/1999/02/22-rdf-syntax-ns#type||1222895 | |
− | + | |- | |
− | + | ||gnd#geographicAreaCode||1195102 | |
− | + | |- | |
− | + | ||http://purl.org/dc/terms/license||1149468 | |
− | | | + | |- |
+ | ||http://purl.org/dc/terms/modified||1149468 | ||
+ | |- | ||
+ | ||http://www.w3.org/2007/05/powder-s#describedby||1149468 | ||
+ | |- | ||
+ | ||gnd#gndIdentifier||1149468 | ||
+ | |- | ||
+ | ||gnd#oldAuthorityNumber||940550 | ||
+ | |- | ||
+ | ||gnd#preferredNameForTheConferenceOrEvent||836397 | ||
+ | |- | ||
+ | ||gnd#variantNameForTheConferenceOrEvent||720355 | ||
+ | |- | ||
+ | ||gnd#dateOfConferenceOrEvent||693196 | ||
+ | |- | ||
+ | ||gnd#placeOfConferenceOrEvent||650165 | ||
+ | |- | ||
+ | ||gnd#preferredNameForThePlaceOrGeographicName||313058 | ||
+ | |- | ||
+ | ||gnd#variantNameForThePlaceOrGeographicName||298383 | ||
+ | |- | ||
+ | ||gnd#gndSubjectCategory||176395 | ||
+ | |- | ||
+ | ||gnd#definition||157695 | ||
+ | |- | ||
+ | ||gnd#broaderTermInstantial||117334 | ||
+ | |- | ||
+ | ||gnd#place||82345 | ||
+ | |- | ||
+ | ||http://xmlns.com/foaf/0.1/page||74542 | ||
+ | |- | ||
+ | ||http://www.opengis.net/ont/geosparql#asWKT||58083 | ||
+ | |- | ||
+ | ||http://www.opengis.net/ont/geosparql#hasGeometry||58083 | ||
+ | |- | ||
+ | ||https://d-nb.info/standards/elementset/dnb#deprecatedUri||51576 | ||
+ | |- | ||
+ | ||gnd#biographicalOrHistoricalInformation||34190 | ||
+ | |- | ||
+ | ||gnd#organizerOrHost||34167 | ||
+ | |- | ||
+ | ||gnd#relatedDdcWithDegreeOfDeterminacy2||25452 | ||
+ | |- | ||
+ | ||gnd#succeedingPlaceOrGeographicName||25208 | ||
+ | |- | ||
+ | ||gnd#broaderTermPartitive||25111 | ||
+ | |- | ||
+ | ||gnd#dateOfEstablishment||24772 | ||
+ | |- | ||
+ | ||gnd#homepage||24439 | ||
+ | |- | ||
+ | ||gnd#precedingPlaceOrGeographicName||22490 | ||
+ | |- | ||
+ | ||gnd#dateOfTermination||21582 | ||
+ | |- | ||
+ | ||gnd#hierarchicalSuperiorOfPlaceOrGeographicName||20522 | ||
+ | |- | ||
+ | ||gnd#precedingConferenceOrEvent||17164 | ||
+ | |- | ||
+ | ||gnd#succeedingConferenceOrEvent||16963 | ||
+ | |- | ||
+ | ||gnd#dateOfProduction||15106 | ||
+ | |- | ||
+ | ||gnd#spatialAreaOfActivity||13593 | ||
+ | |- | ||
+ | ||gnd#hierarchicalSuperiorOfTheConferenceOrEvent||10126 | ||
+ | |- | ||
+ | ||gnd#architect||8206 | ||
+ | |- | ||
+ | ||gnd#topic||6088 | ||
|- | |- | ||
− | + | ||gnd#relatedPlaceOrGeographicName||5812 | |
|- | |- | ||
− | | | + | ||gnd#abbreviatedNameForTheConferenceOrEvent||5312 |
|- | |- | ||
− | | | + | ||gnd#complexSeeReferenceSubject||3927 |
|- | |- | ||
− | | | + | ||gnd#startingOrFinalPointOfADistance||3531 |
|- | |- | ||
− | | | + | ||gnd#relatedConferenceOrEvent||3124 |
|- | |- | ||
− | | | + | ||gnd#relatedDdcWithDegreeOfDeterminacy4||2709 |
|- | |- | ||
− | | | + | ||gnd#relatedCorporateBody||2354 |
|- | |- | ||
− | | | + | ||gnd#relatedTerm||1728 |
|- | |- | ||
− | | | + | ||gnd#sponsorOrPatron||1281 |
|- | |- | ||
− | | | + | ||gnd#relatedDdcWithDegreeOfDeterminacy3||1200 |
|- | |- | ||
− | | | + | ||gnd#exhibitor||1044 |
|} | |} | ||
− | }} | + | {{Fixme|todo=Unfortunately the headlines and the forms seem to be mixed up.|done=2020-06-25|by=--[[User:Wf|Wf]] ([[User talk:Wf|talk]]) 14:59, 25 June 2020 (CEST)}} |
+ | |||
+ | == events with most often used fields and seldom but useful fields == | ||
+ | <source lang='sparql'> | ||
+ | # get events with most often used columns from GND | ||
+ | # plus acronym, topic, homepage (seldom but useful) | ||
+ | # WF 2020-07-12 | ||
+ | PREFIX gndi: <https://d-nb.info/gnd> | ||
+ | PREFIX gnd: <https://d-nb.info/standards/elementset/gnd#> | ||
+ | PREFIX gndo: <https://d-nb.info/standards/vocab/gnd/> | ||
+ | PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> | ||
+ | PREFIX owl: <http://www.w3.org/2002/07/owl#> | ||
+ | PREFIX dc: <http://purl.org/dc/terms/> | ||
+ | PREFIX wdrs: <http://www.w3.org/2007/05/powder-s#> | ||
+ | |||
+ | SELECT ?event ?eventId ?acronym ?variant ?name ?date ?areaCode ?place ?topic ?homepage | ||
+ | WHERE { | ||
+ | ?event gnd:gndIdentifier ?eventId. | ||
+ | OPTIONAL { ?event gnd:abbreviatedNameForTheConferenceOrEvent ?acronym. } | ||
+ | OPTIONAL { ?event gnd:variantNameForTheConferenceOrEvent ?variant.} | ||
+ | OPTIONAL { ?event gnd:preferredNameForTheConferenceOrEvent ?name.} | ||
+ | OPTIONAL { ?event gnd:dateOfConferenceOrEvent ?date. } | ||
+ | OPTIONAL { ?event gnd:geographicAreaCode ?areaCode. } | ||
+ | OPTIONAL { ?event gnd:placeOfConferenceOrEvent ?place. } | ||
+ | OPTIONAL { ?event gnd:topic ?topic. } | ||
+ | OPTIONAL { ?event gnd:homepage ?homepage. } | ||
+ | } | ||
+ | #LIMIT 10000 | ||
+ | </source> | ||
+ | |||
+ | == Script to setup a Jena instance with GND data extract == | ||
+ | === gnd2jena === | ||
+ | <source lang='bash'> | ||
+ | #!/bin/bash | ||
+ | # WF 2020-05-10 | ||
+ | |||
+ | # global settings | ||
+ | jena=apache-jena-4.4.0 | ||
+ | tgz=$jena.tar.gz | ||
+ | #mirror=https://downloads.apache.org/jena/binaries | ||
+ | mirror=https://archive.apache.org/dist/jena/binaries | ||
+ | jenaurl=$mirror/$tgz | ||
+ | base=/hd/seel/gnd | ||
+ | #base=/hd/torterra/gnd | ||
+ | #base=/hd/luxio/gnd | ||
+ | data=$base/data | ||
+ | tdbloader=$jena/bin/tdb2.tdbloader | ||
+ | |||
+ | getjena() { | ||
+ | # download | ||
+ | if [ ! -f $tgz ] | ||
+ | then | ||
+ | echo "downloading $tgz from $jenaurl" | ||
+ | wget $jenaurl | ||
+ | else | ||
+ | echo "$tgz already downloaded" | ||
+ | fi | ||
+ | # unpack | ||
+ | if [ ! -d $jena ] | ||
+ | then | ||
+ | echo "unpacking $jena from $tgz" | ||
+ | tar xvzf $tgz | ||
+ | else | ||
+ | echo "$jena already unpacked" | ||
+ | fi | ||
+ | # create data directory | ||
+ | if [ ! -d $data ] | ||
+ | then | ||
+ | echo "creating $data directory" | ||
+ | mkdir -p $data | ||
+ | else | ||
+ | echo "$data directory already created" | ||
+ | fi | ||
+ | } | ||
+ | |||
+ | # | ||
+ | # show the given timestamp | ||
+ | # | ||
+ | timestamp() { | ||
+ | local msg="$1" | ||
+ | local ts=$(date -u +"%Y-%m-%dT%H:%M:%SZ") | ||
+ | echo "$msg at $ts" | ||
+ | } | ||
+ | |||
+ | # | ||
+ | # load data for the given data dir and input | ||
+ | # | ||
+ | loaddata() { | ||
+ | local data="$1" | ||
+ | local input="$2" | ||
+ | timestamp "start loading $input to $data" | ||
+ | $tdbloader --loc "$data" "$input" > tdb2-$phase-out.log 2> tdb2-$phase-err.log | ||
+ | timestamp "finished loading $input to $data" | ||
+ | } | ||
+ | |||
+ | getjena | ||
+ | export TMPDIR=$base/tmp | ||
+ | if [ ! -d $TMPDIR ] | ||
+ | then | ||
+ | echo "creating temporary directory $TMPDIR" | ||
+ | mkdir $TMPDIR | ||
+ | else | ||
+ | echo "using temporary directory $TMPDIR" | ||
+ | fi | ||
+ | for d in kongress geografikum | ||
+ | do | ||
+ | file=authorities-${d}_lds.ttl | ||
+ | if [ ! -f $file ] | ||
+ | then | ||
+ | wget https://data.dnb.de/opendata/$file.gz | ||
+ | gunzip $file.gz | ||
+ | else | ||
+ | echo "$file already downloaded" | ||
+ | fi | ||
+ | loaddata $data $file | ||
+ | done | ||
+ | </source> | ||
+ | |||
+ | == try on confident23 server == | ||
+ | === 2020-07-19 === | ||
+ | <source lang='bash'> | ||
+ | wf@confident23:/usr/local/src$ sudo ./getjena | ||
+ | apache-jena-3.16.0.tar.gz already downloaded | ||
+ | apache-jena-3.16.0 already unpacked | ||
+ | creating /var/data/gnd/data directory | ||
+ | creating temporary directory /var/data/gnd/tmp | ||
+ | start loading authorities-kongress_lds.ttl to /var/data/gnd/data at 2020-07-19T07:08:15Z | ||
+ | finished loading authorities-kongress_lds.ttl to /var/data/gnd/data at 2020-07-19T07:11:16Z | ||
+ | </source> | ||
+ | === 2020-09-15 === | ||
+ | <source lang='bash'> | ||
+ | ./getjena | ||
+ | apache-jena-3.16.0.tar.gz already downloaded | ||
+ | apache-jena-3.16.0 already unpacked | ||
+ | creating /var/data/gnd/data directory | ||
+ | creating temporary directory /var/data/gnd/tmp | ||
+ | --2020-09-15 11:46:37-- https://data.dnb.de/opendata/authorities-kongress_lds.ttl.gz | ||
+ | Resolving data.dnb.de (data.dnb.de)... 193.175.100.140 | ||
+ | Connecting to data.dnb.de (data.dnb.de)|193.175.100.140|:443... connected. | ||
+ | HTTP request sent, awaiting response... 200 OK | ||
+ | Length: 80381713 (77M) [application/x-gzip] | ||
+ | Saving to: ‘authorities-kongress_lds.ttl.gz’ | ||
+ | |||
+ | authorities-kongres 100%[===================>] 76.66M 66.3MB/s in 1.2s | ||
+ | |||
+ | 2020-09-15 11:46:38 (66.3 MB/s) - ‘authorities-kongress_lds.ttl.gz’ saved [80381713/80381713] | ||
+ | |||
+ | start loading authorities-kongress_lds.ttl to /var/data/gnd/data at 2020-09-15T09:46:42Z | ||
+ | finished loading authorities-kongress_lds.ttl to /var/data/gnd/data at 2020-09-15T09:50:25Z | ||
+ | --2020-09-15 11:50:25-- https://data.dnb.de/opendata/authorities-geografikum_lds.ttl.gz | ||
+ | Resolving data.dnb.de (data.dnb.de)... 193.175.100.140 | ||
+ | Connecting to data.dnb.de (data.dnb.de)|193.175.100.140|:443... connected. | ||
+ | HTTP request sent, awaiting response... 200 OK | ||
+ | Length: 33842092 (32M) [application/x-gzip] | ||
+ | Saving to: ‘authorities-geografikum_lds.ttl.gz’ | ||
+ | |||
+ | authorities-geograf 100%[===================>] 32.27M 53.4MB/s in 0.6s | ||
+ | |||
+ | 2020-09-15 11:50:26 (53.4 MB/s) - ‘authorities-geografikum_lds.ttl.gz’ saved [33842092/33842092] | ||
+ | |||
+ | start loading authorities-geografikum_lds.ttl to /var/data/gnd/data at 2020-09-15T09:50:27Z | ||
+ | finished loading authorities-geografikum_lds.ttl to /var/data/gnd/data at 2020-09-15T09:52:40Z | ||
+ | |||
+ | </source> | ||
+ | |||
+ | == start fuseki == | ||
+ | <source lang='bash'> | ||
+ | #!/bin/bash | ||
+ | # WF 2020-06-25 | ||
+ | # Jena Fuseki server installation | ||
+ | # see https://jena.apache.org/documentation/fuseki2/fuseki-run.html | ||
+ | version=3.16.0 | ||
+ | fuseki=apache-jena-fuseki-$version | ||
+ | if [ ! -d $fuseki ] | ||
+ | then | ||
+ | if [ ! -f $fuseki.tar.gz ] | ||
+ | then | ||
+ | wget http://archive.apache.org/dist/jena/binaries/$fuseki.tar.gz | ||
+ | else | ||
+ | echo $fuseki.tar.gz already downloaded | ||
+ | fi | ||
+ | echo "unpacking $fuseki.tar.gz" | ||
+ | tar xvfz $fuseki.tar.gz | ||
+ | else | ||
+ | echo $fuseki already downloaded and unpacked | ||
+ | fi | ||
+ | cd $fuseki | ||
+ | gnddata=/var/data/gnd/data | ||
+ | java -jar fuseki-server.jar --tdb2 --loc=$gnddata /gnd | ||
+ | wf@confident23:/usr/local/src$ | ||
+ | </source> |
Latest revision as of 09:24, 5 August 2022
GlossaryEntry | |
---|---|
edit | |
responsible | https://www.dnb.de/DE/Professionell/Standardisierung/GND/gnd.html |
state | |
since | 2012 |
description | Gemeinsame Normdatei |
references | |
lang | de |
master | GND |
Wikibase
Report "GND meets Wikibase" 2 Barbara Fischer
Links
- https://d-nb.info/standards/elementset/gnd
- https://d-nb.info/standards/elementset/gnd#SeriesOfConferenceOrEvent
- https://data.dnb.de/opendata/
- https://blog.lobid.org/2018/08/27/openrefine.html
GND property multiplicity
Based on analysis of https://data.dnb.de/opendata/authorities-kongress_lds.ttl.gz as of 2021-12
property | gnd | total | unique | min | max | avg |
---|---|---|---|---|---|---|
eventId | gnd:gndIdentifier | 731651 | 731651 | 1 | 1 | 1 |
title | gnd:preferredNameForTheConferenceOrEvent | 731645 | 731645 | 0 | 1 | 0.999991799 |
acronym | gnd:abbreviatedNameForTheConferenceOrEvent | 3537 | 3206 | 0 | 4 | 0.00483 |
sameAs | owl:sameAs | 769120 | 693077 | 0 | 20 | 1.05 |
variant | gnd:variantNameForTheConferenceOrEvent | 632368 | 229268 | 0 | 41 | 0.86 |
date | gnd:dateOfConferenceOrEvent | 710819 | 704949 | 0 | 9 | 0.971 |
areaCode | gnd:geographicAreaCode | 797037 | 612631 | 0 | 11 | 1.089 |
place | gnd:placeOfConferenceOrEvent | 659305 | 624667 | 0 | 18 | 0.901 |
topic | gnd:topic | 5061 | 3520 | 0 | 6 | 0.00691 |
homepage | gnd:homepage | 19011 | 18702 | 0 | 3 | 0.026 |
prec | gnd:precedingConferenceOrEvent | 12182 | 12106 | 0 | 3 | 0.0166 |
succ | gnd:succeedingConferenceOrEvent | 11974 | 11929 | 0 | 3 | 0.0163 |
performance optimized query of GND event details
# GND event details, one result row per event (performance optimized)
# every multi-valued property is aggregated into up to three columns:
# a single value (MIN), a distinct count and a "| " separated list
# WF 2021-12-05
PREFIX gndi: <https://d-nb.info/gnd>
PREFIX gnd: <https://d-nb.info/standards/elementset/gnd#>
PREFIX gndo: <https://d-nb.info/standards/vocab/gnd/>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX dc: <http://purl.org/dc/terms/>
PREFIX wdrs: <http://www.w3.org/2007/05/powder-s#>

SELECT
  ?event
  ?eventId
  (MIN(?eventTitle) AS ?title)

  # date: distinct count and smallest value
  (COUNT(DISTINCT ?eventDate) AS ?dateCount)
  (MIN(?eventDate) AS ?date)

  # acronym: smallest value, distinct count, list
  (MIN(?eventAcronym) AS ?acronym)
  (COUNT(DISTINCT ?eventAcronym) AS ?acronymCount)
  (GROUP_CONCAT(DISTINCT ?eventAcronym; SEPARATOR="| ") AS ?acronyms)

  # variant name: smallest value, distinct count, list
  (MIN(?eventVariant) AS ?variant)
  (COUNT(DISTINCT ?eventVariant) AS ?variantCount)
  (GROUP_CONCAT(DISTINCT ?eventVariant; SEPARATOR="| ") AS ?variants)

  # place: smallest value, distinct count, list
  (MIN(?eventPlace) AS ?place)
  (COUNT(DISTINCT ?eventPlace) AS ?placeCount)
  (GROUP_CONCAT(DISTINCT ?eventPlace; SEPARATOR="| ") AS ?places)

  (MIN(?eventHomepage) AS ?homepage)
WHERE {
  # required: type, GND identifier and preferred name
  ?event a gnd:ConferenceOrEvent ;
         gnd:gndIdentifier ?eventId ;
         gnd:preferredNameForTheConferenceOrEvent ?eventTitle .
  # optional properties - each match multiplies the rows of an event,
  # which is why the SELECT clause aggregates with DISTINCT
  OPTIONAL { ?event gnd:abbreviatedNameForTheConferenceOrEvent ?eventAcronym }
  OPTIONAL { ?event gnd:homepage ?eventHomepage }
  OPTIONAL { ?event gnd:variantNameForTheConferenceOrEvent ?eventVariant }
  OPTIONAL { ?event gnd:dateOfConferenceOrEvent ?eventDate }
  OPTIONAL { ?event gnd:placeOfConferenceOrEvent ?eventPlace }
  # only available 3520 times 2021-12
  # ?event gnd:topic ?topic.
  # only available 12106 times 2021-12
  # ?event gnd:precedingConferenceOrEvent ?prec
  # only available 11929 times 2021-12
  #?event gnd:succeedingConferenceOrEvent ?succ
}
GROUP BY ?event ?eventId
query to analyze multiplicity
# aggregate statistics (sum, min, max, avg) of how often a single
# property occurs per event
# the query has to be adapted property by property (activate exactly one
# pattern inside the OPTIONAL and count its variable) and run twice:
# once without the HAVING clause and once with the HAVING clause
# the results feed the property multiplicity table of this wiki page
# WF 2021-12-05
PREFIX gndi: <https://d-nb.info/gnd>
PREFIX gnd: <https://d-nb.info/standards/elementset/gnd#>
PREFIX gndo: <https://d-nb.info/standards/vocab/gnd/>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX dc: <http://purl.org/dc/terms/>
PREFIX wdrs: <http://www.w3.org/2007/05/powder-s#>

SELECT
  (SUM(?itemCount) AS ?sum)
  (MIN(?itemCount) AS ?min)
  (MAX(?itemCount) AS ?max)
  (AVG(?itemCount) AS ?avg)
WHERE {
  # inner query: number of values of the active property per event
  SELECT ?event ?eventId (COUNT(?title) AS ?itemCount)
  WHERE {
    ?event a gnd:ConferenceOrEvent ;
           gnd:gndIdentifier ?eventId .
    OPTIONAL {
      # ?event gnd:abbreviatedNameForTheConferenceOrEvent ?acronym.
      # ?event owl:sameAs ?sameAs.
      # ?event gnd:variantNameForTheConferenceOrEvent ?variant.
      ?event gnd:preferredNameForTheConferenceOrEvent ?title .
      # ?event gnd:dateOfConferenceOrEvent ?date
      # ?event gnd:geographicAreaCode ?areaCode.
      # ?event gnd:placeOfConferenceOrEvent ?place.
      # ?event gnd:topic ?topic.
      # ?event gnd:homepage ?homepage.
      # ?event gnd:precedingConferenceOrEvent ?prec
      # ?event gnd:succeedingConferenceOrEvent ?succ
    }
  }
  GROUP BY ?event ?eventId
  #HAVING(COUNT(?title) = 1)
}
SPARQL Queries
entities and usage frequency
# get histogram data of entities (rdf:type values) by
# usage frequency; only types used more than 100 times are listed,
# most frequent first
# WF 2020-06-27
PREFIX gnd: <https://d-nb.info/standards/elementset/gnd#>

SELECT ?c (COUNT(?c) AS ?count)
WHERE {
  ?subject a ?c
}
GROUP BY ?c
# repeat the aggregate here instead of using the ?count alias:
# in standard SPARQL 1.1 the SELECT alias is assigned only after HAVING
# has been evaluated; accepting the alias in HAVING is an engine extension
HAVING (COUNT(?c) > 100)
ORDER BY DESC(?count)
c | count |
---|---|
gnd#ConferenceOrEvent | 713310 |
gnd#TerritorialCorporateBodyOrAdministrativeUnit | 188246 |
gnd#SeriesOfConferenceOrEvent | 122970 |
gnd#BuildingOrMemorial | 67149 |
http://www.opengis.net/ont/sf#Point | 57987 |
gnd#PlaceOrGeographicName | 27771 |
gnd#NaturalGeographicUnit | 20269 |
gnd#AdministrativeUnit | 12846 |
gnd#WayBorderOrLine | 4971 |
gnd#ReligiousTerritory | 2646 |
gnd#NameOfSmallGeographicUnitLyingWithinAnotherGeographicUnit | 2113 |
gnd#CorporateBody | 559 |
gnd#MemberState | 543 |
gnd#Country | 307 |
gnd#ExtraterrestrialTerritory | 282 |
gnd#Language | 193 |
gnd#ReligiousCorporateBody | 155 |
gnd#ReligiousAdministrativeUnit | 134 |
gnd#HistoricSingleEventOrEra | 128 |
relevance of fields
# get histogram data of properties by
# usage frequency; only properties used more than 1000 times are listed,
# most frequent first
# WF 2020-07-12
PREFIX gnd: <https://d-nb.info/standards/elementset/gnd#>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX dc: <http://purl.org/dc/terms/>
PREFIX wdrs: <http://www.w3.org/2007/05/powder-s#>

SELECT ?property (COUNT(?property) AS ?propTotal)
WHERE { ?s ?property ?o . }
GROUP BY ?property
# repeat the aggregate here instead of using the ?propTotal alias:
# in standard SPARQL 1.1 the SELECT alias is assigned only after HAVING
# has been evaluated; accepting the alias in HAVING is an engine extension
HAVING (COUNT(?property) > 1000)
ORDER BY DESC(?propTotal)
property | propTotal |
---|---|
http://www.w3.org/2002/07/owl#sameAs | 1359115 |
http://www.w3.org/1999/02/22-rdf-syntax-ns#type | 1222895 |
gnd#geographicAreaCode | 1195102 |
http://purl.org/dc/terms/license | 1149468 |
http://purl.org/dc/terms/modified | 1149468 |
http://www.w3.org/2007/05/powder-s#describedby | 1149468 |
gnd#gndIdentifier | 1149468 |
gnd#oldAuthorityNumber | 940550 |
gnd#preferredNameForTheConferenceOrEvent | 836397 |
gnd#variantNameForTheConferenceOrEvent | 720355 |
gnd#dateOfConferenceOrEvent | 693196 |
gnd#placeOfConferenceOrEvent | 650165 |
gnd#preferredNameForThePlaceOrGeographicName | 313058 |
gnd#variantNameForThePlaceOrGeographicName | 298383 |
gnd#gndSubjectCategory | 176395 |
gnd#definition | 157695 |
gnd#broaderTermInstantial | 117334 |
gnd#place | 82345 |
http://xmlns.com/foaf/0.1/page | 74542 |
http://www.opengis.net/ont/geosparql#asWKT | 58083 |
http://www.opengis.net/ont/geosparql#hasGeometry | 58083 |
https://d-nb.info/standards/elementset/dnb#deprecatedUri | 51576 |
gnd#biographicalOrHistoricalInformation | 34190 |
gnd#organizerOrHost | 34167 |
gnd#relatedDdcWithDegreeOfDeterminacy2 | 25452 |
gnd#succeedingPlaceOrGeographicName | 25208 |
gnd#broaderTermPartitive | 25111 |
gnd#dateOfEstablishment | 24772 |
gnd#homepage | 24439 |
gnd#precedingPlaceOrGeographicName | 22490 |
gnd#dateOfTermination | 21582 |
gnd#hierarchicalSuperiorOfPlaceOrGeographicName | 20522 |
gnd#precedingConferenceOrEvent | 17164 |
gnd#succeedingConferenceOrEvent | 16963 |
gnd#dateOfProduction | 15106 |
gnd#spatialAreaOfActivity | 13593 |
gnd#hierarchicalSuperiorOfTheConferenceOrEvent | 10126 |
gnd#architect | 8206 |
gnd#topic | 6088 |
gnd#relatedPlaceOrGeographicName | 5812 |
gnd#abbreviatedNameForTheConferenceOrEvent | 5312 |
gnd#complexSeeReferenceSubject | 3927 |
gnd#startingOrFinalPointOfADistance | 3531 |
gnd#relatedConferenceOrEvent | 3124 |
gnd#relatedDdcWithDegreeOfDeterminacy4 | 2709 |
gnd#relatedCorporateBody | 2354 |
gnd#relatedTerm | 1728 |
gnd#sponsorOrPatron | 1281 |
gnd#relatedDdcWithDegreeOfDeterminacy3 | 1200 |
gnd#exhibitor | 1044 |
done | 2020-06-25"2020-06-25" is not recognized as a Boolean (true/false) value. |
---|---|
todo | Unfortunately the headlines and the forms seem to be mixed up. |
events with most often used fields and seldom but useful fields
# events with the most frequently used GND columns
# plus acronym, topic and homepage (rarely filled but useful)
# the result is not aggregated: an entry with several values for a
# property yields one row per value combination
# NOTE(review): there is no rdf:type restriction - every subject with a
# gnd:gndIdentifier matches; confirm that this is intended
# WF 2020-07-12
PREFIX gndi: <https://d-nb.info/gnd>
PREFIX gnd: <https://d-nb.info/standards/elementset/gnd#>
PREFIX gndo: <https://d-nb.info/standards/vocab/gnd/>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX dc: <http://purl.org/dc/terms/>
PREFIX wdrs: <http://www.w3.org/2007/05/powder-s#>

SELECT ?event ?eventId ?acronym ?variant ?name ?date ?areaCode ?place ?topic ?homepage
WHERE {
  # the identifier is the only mandatory property
  ?event gnd:gndIdentifier ?eventId .
  OPTIONAL { ?event gnd:abbreviatedNameForTheConferenceOrEvent ?acronym }
  OPTIONAL { ?event gnd:variantNameForTheConferenceOrEvent ?variant }
  OPTIONAL { ?event gnd:preferredNameForTheConferenceOrEvent ?name }
  OPTIONAL { ?event gnd:dateOfConferenceOrEvent ?date }
  OPTIONAL { ?event gnd:geographicAreaCode ?areaCode }
  OPTIONAL { ?event gnd:placeOfConferenceOrEvent ?place }
  OPTIONAL { ?event gnd:topic ?topic }
  OPTIONAL { ?event gnd:homepage ?homepage }
}
#LIMIT 10000
Script to setup a Jena instance with GND data extract
gnd2jena
#!/bin/bash
# WF 2020-05-10
# global settings shared by all functions of this script

# Apache Jena release and the name of its archive
jena="apache-jena-4.4.0"
tgz="${jena}.tar.gz"
# location the archive is downloaded from
#mirror=https://downloads.apache.org/jena/binaries
mirror="https://archive.apache.org/dist/jena/binaries"
jenaurl="${mirror}/${tgz}"
# base directory - pick the line matching the machine
base="/hd/seel/gnd"
#base=/hd/torterra/gnd
#base=/hd/luxio/gnd
# directory handed to the loader as database location
data="${base}/data"
# loader executable inside the unpacked Jena release
tdbloader="${jena}/bin/tdb2.tdbloader"
#
# make sure Apache Jena is available and the data directory exists
# uses the global settings tgz, jenaurl, jena and data
# steps: download the archive, unpack it, create the data directory;
# each step is skipped when its result already exists
# returns 1 as soon as one of the steps fails
#
getjena() {
  # download
  if [ ! -f "$tgz" ]
  then
    echo "downloading $tgz from $jenaurl"
    # download to a temporary name first so that an interrupted download
    # is never mistaken for a complete archive on the next run
    if ! wget -O "$tgz.part" "$jenaurl"
    then
      echo "download of $jenaurl failed" 1>&2
      rm -f "$tgz.part"
      return 1
    fi
    mv "$tgz.part" "$tgz" || return 1
  else
    echo "$tgz already downloaded"
  fi
  # unpack
  if [ ! -d "$jena" ]
  then
    echo "unpacking $jena from $tgz"
    if ! tar xvzf "$tgz"
    then
      echo "unpacking $tgz failed" 1>&2
      return 1
    fi
  else
    echo "$jena already unpacked"
  fi
  # create data directory
  if [ ! -d "$data" ]
  then
    echo "creating $data directory"
    mkdir -p "$data" || return 1
  else
    echo "$data directory already created"
  fi
}
#
# print the given message followed by " at " and the current
# UTC time in ISO 8601 format (e.g. 2020-07-19T07:08:15Z)
# param 1: the message to show
#
timestamp() {
  local text="$1"
  local now
  now=$(date -u +"%Y-%m-%dT%H:%M:%SZ")
  printf '%s at %s\n' "$text" "$now"
}
#
# load data for the given data dir and input
# param 1: database directory handed to the loader via --loc
# param 2: input file to load
# the loader's stdout and stderr are written to
# tdb2-<phase>-out.log and tdb2-<phase>-err.log in the current directory
# a start and a finish message with timestamps are printed
#
loaddata() {
  local data="$1"
  local input="$2"
  # $phase is not set anywhere in the visible script, so the log files used
  # to be named tdb2--out.log / tdb2--err.log and each load overwrote the
  # logs of the previous one. Keep honouring a caller supplied $phase and
  # otherwise derive a distinct name from the input file.
  local logphase="${phase:-$(basename "$input" .ttl)}"
  timestamp "start loading $input to $data"
  "$tdbloader" --loc "$data" "$input" > "tdb2-$logphase-out.log" 2> "tdb2-$logphase-err.log"
  timestamp "finished loading $input to $data"
}
# main part: install Jena, prepare the temporary directory, then
# download, unpack and load each GND dump
getjena
# exported so that child processes see it
# NOTE(review): presumably used by the loader for temporary files - confirm
export TMPDIR=$base/tmp
if [ ! -d "$TMPDIR" ]
then
  echo "creating temporary directory $TMPDIR"
  mkdir -p "$TMPDIR"
else
  echo "using temporary directory $TMPDIR"
fi
for d in kongress geografikum
do
  file=authorities-${d}_lds.ttl
  if [ ! -f "$file" ]
  then
    # do not try to load a file that could not be downloaded or unpacked
    if ! wget "https://data.dnb.de/opendata/$file.gz" || ! gunzip "$file.gz"
    then
      echo "could not download/unpack $file.gz - skipping" 1>&2
      continue
    fi
  else
    echo "$file already downloaded"
  fi
  loaddata "$data" "$file"
done
try on confident23 server
2020-07-19
wf@confident23:/usr/local/src$ sudo ./getjena
apache-jena-3.16.0.tar.gz already downloaded
apache-jena-3.16.0 already unpacked
creating /var/data/gnd/data directory
creating temporary directory /var/data/gnd/tmp
start loading authorities-kongress_lds.ttl to /var/data/gnd/data at 2020-07-19T07:08:15Z
finished loading authorities-kongress_lds.ttl to /var/data/gnd/data at 2020-07-19T07:11:16Z
2020-09-15
./getjena
apache-jena-3.16.0.tar.gz already downloaded
apache-jena-3.16.0 already unpacked
creating /var/data/gnd/data directory
creating temporary directory /var/data/gnd/tmp
--2020-09-15 11:46:37-- https://data.dnb.de/opendata/authorities-kongress_lds.ttl.gz
Resolving data.dnb.de (data.dnb.de)... 193.175.100.140
Connecting to data.dnb.de (data.dnb.de)|193.175.100.140|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 80381713 (77M) [application/x-gzip]
Saving to: ‘authorities-kongress_lds.ttl.gz’
authorities-kongres 100%[===================>] 76.66M 66.3MB/s in 1.2s
2020-09-15 11:46:38 (66.3 MB/s) - ‘authorities-kongress_lds.ttl.gz’ saved [80381713/80381713]
start loading authorities-kongress_lds.ttl to /var/data/gnd/data at 2020-09-15T09:46:42Z
finished loading authorities-kongress_lds.ttl to /var/data/gnd/data at 2020-09-15T09:50:25Z
--2020-09-15 11:50:25-- https://data.dnb.de/opendata/authorities-geografikum_lds.ttl.gz
Resolving data.dnb.de (data.dnb.de)... 193.175.100.140
Connecting to data.dnb.de (data.dnb.de)|193.175.100.140|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 33842092 (32M) [application/x-gzip]
Saving to: ‘authorities-geografikum_lds.ttl.gz’
authorities-geograf 100%[===================>] 32.27M 53.4MB/s in 0.6s
2020-09-15 11:50:26 (53.4 MB/s) - ‘authorities-geografikum_lds.ttl.gz’ saved [33842092/33842092]
start loading authorities-geografikum_lds.ttl to /var/data/gnd/data at 2020-09-15T09:50:27Z
finished loading authorities-geografikum_lds.ttl to /var/data/gnd/data at 2020-09-15T09:52:40Z
start fuseki
#!/bin/bash
# WF 2020-06-25
# Jena Fuseki server installation
# see https://jena.apache.org/documentation/fuseki2/fuseki-run.html
# downloads and unpacks Fuseki when it is not available yet and then
# starts the server for the database at $gnddata under the name /gnd
# NOTE(review): Fuseki 3.16.0 is used here while the gnd2jena loader script
# on this page is configured for apache-jena-4.4.0 - confirm that the
# versions are compatible
version=3.16.0
fuseki=apache-jena-fuseki-$version
if [ ! -d "$fuseki" ]
then
  if [ ! -f "$fuseki.tar.gz" ]
  then
    # https, consistent with the Jena download of the loader script;
    # stop when the download fails instead of unpacking a missing archive
    wget "https://archive.apache.org/dist/jena/binaries/$fuseki.tar.gz" || exit 1
  else
    echo $fuseki.tar.gz already downloaded
  fi
  echo "unpacking $fuseki.tar.gz"
  tar xvfz "$fuseki.tar.gz" || exit 1
else
  echo $fuseki already downloaded and unpacked
fi
# never start java from the wrong directory
cd "$fuseki" || exit 1
gnddata=/var/data/gnd/data
java -jar fuseki-server.jar --tdb2 --loc=$gnddata /gnd