Difference between revisions of "Truly Tabular RDF/GND"

From BITPlan Wiki
Jump to navigation Jump to search
Line 20: Line 20:
 
* https://blog.lobid.org/2018/08/27/openrefine.html
 
* https://blog.lobid.org/2018/08/27/openrefine.html
 
== GND property multiplicity ==
 
== GND property multiplicity ==
The following table shows the cardinality/multiplicity of properties per conference record. E.g. there are up to four different acronyms for a conference.
 
 
{| class="wikitable"
 
{| class="wikitable"
 
|+ Property multiplicity
 
|+ Property multiplicity
Line 122: Line 121:
 
| style="                "| 0.0163
 
| style="                "| 0.0163
 
|}
 
|}
 +
=== performance optimized query of GND event details ===
 +
<source lang='sparql'>
 +
# performance optimized query of GND event details
 +
# with aggregated properties as single, count and | separated list column
 +
# WF 2021-12-05
 +
PREFIX gndi:  <https://d-nb.info/gnd>
 +
PREFIX gnd:  <https://d-nb.info/standards/elementset/gnd#>
 +
PREFIX gndo: <https://d-nb.info/standards/vocab/gnd/>
 +
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
 +
PREFIX owl: <http://www.w3.org/2002/07/owl#>
 +
PREFIX dc: <http://purl.org/dc/terms/>
 +
PREFIX wdrs: <http://www.w3.org/2007/05/powder-s#>
 +
 +
SELECT 
 +
  ?event
 +
  ?eventId 
 +
  (MIN(?eventTitle) as ?title)
 +
 +
  (COUNT (DISTINCT ?eventDate) as ?dateCount)
 +
  (MIN(?eventDate) as ?date)
 +
 +
  (MIN(?eventAcronym) as ?acronym)
 +
  (COUNT (DISTINCT ?eventAcronym) as ?acronymCount)
 +
  (GROUP_CONCAT(DISTINCT ?eventAcronym; SEPARATOR="| ") AS ?acronyms)
 +
 +
  (MIN(?eventVariant) as ?variant)
 +
  (COUNT (DISTINCT ?eventVariant) as ?variantCount)
 +
  (GROUP_CONCAT(DISTINCT ?eventVariant; SEPARATOR="| ") AS ?variants)
 +
 +
  (MIN(?eventPlace) as ?place)
 +
  (COUNT (DISTINCT ?eventPlace) as ?placeCount)
 +
  (GROUP_CONCAT(DISTINCT ?eventPlace; SEPARATOR="| ") AS ?places)
 +
 +
  (MIN(?eventHomepage) as ?homepage)
 +
WHERE {
 +
  ?event a gnd:ConferenceOrEvent.
 +
  ?event gnd:gndIdentifier ?eventId.
 +
  ?event gnd:preferredNameForTheConferenceOrEvent ?eventTitle.
 +
  OPTIONAL { ?event gnd:abbreviatedNameForTheConferenceOrEvent ?eventAcronym. }
 +
  OPTIONAL { ?event gnd:homepage ?eventHomepage. }
 +
  OPTIONAL { ?event gnd:variantNameForTheConferenceOrEvent ?eventVariant. }
 +
  OPTIONAL { ?event gnd:dateOfConferenceOrEvent ?eventDate. }
 +
  OPTIONAL { ?event gnd:placeOfConferenceOrEvent ?eventPlace }
 +
  # only available 3520 times 2021-12
 +
  # ?event gnd:topic ?topic.
 +
  # only available 12106 times 2021-12
 +
  # ?event gnd:precedingConferenceOrEvent ?prec
 +
  # only available 11929 times 2021-12
 +
  #?event gnd:succeedingConferenceOrEvent ?succ
 +
}
 +
GROUP BY ?event ?eventId
 +
</source>
 
=== query to analyze multiplicity ===
 
=== query to analyze multiplicity ===
 
<source lang='sparql'>
 
<source lang='sparql'>
Line 161: Line 212:
 
</source>
 
</source>
  
== GND Query ==
+
== SPARQL Queries ==
<source lang='sparql'>
+
=== entities and usage frequency ===
# performance optimized query of GND event details
+
<source lang='SPARQL'>
        # with aggregated properties as single, count and | separated list column
+
# get histogramm data of entities by
        # WF 2021-12-05
+
# usage frequency
        PREFIX gndi:  <https://d-nb.info/gnd>
+
# WF 2020-06-27
        PREFIX gnd: <https://d-nb.info/standards/elementset/gnd#>
+
PREFIX gnd: <https://d-nb.info/standards/elementset/gnd#>
        PREFIX gndo: <https://d-nb.info/standards/vocab/gnd/>
 
        PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
 
        PREFIX owl: <http://www.w3.org/2002/07/owl#>
 
        PREFIX dc: <http://purl.org/dc/terms/>
 
        PREFIX wdrs: <http://www.w3.org/2007/05/powder-s#>
 
  
        SELECT   
+
SELECT ?c (COUNT(?c) AS ?count)
          ?event
+
WHERE {
          ?eventId 
+
  ?subject a  ?c
          (MIN(?eventTitle) as ?fulltitle)
+
}
 +
GROUP BY ?c
 +
HAVING (?count >100)
 +
ORDER BY DESC(?count)
 +
</source>
 +
{|class="wikitable sortable"
 +
|+
 +
!c!!count
 +
|-
 +
||gnd#ConferenceOrEvent||713310
 +
|-
 +
||gnd#TerritorialCorporateBodyOrAdministrativeUnit||188246
 +
|-
 +
||gnd#SeriesOfConferenceOrEvent||122970
 +
|-
 +
||gnd#BuildingOrMemorial||67149
 +
|-
 +
||http://www.opengis.net/ont/sf#Point||57987
 +
|-
 +
||gnd#PlaceOrGeographicName||27771
 +
|-
 +
||gnd#NaturalGeographicUnit||20269
 +
|-
 +
||gnd#AdministrativeUnit||12846
 +
|-
 +
||gnd#WayBorderOrLine||4971
 +
|-
 +
||gnd#ReligiousTerritory||2646
 +
|-
 +
||gnd#NameOfSmallGeographicUnitLyingWithinAnotherGeographicUnit||2113
 +
|-
 +
||gnd#CorporateBody||559
 +
|-
 +
||gnd#MemberState||543
 +
|-
 +
||gnd#Country||307
 +
|-
 +
||gnd#ExtraterrestrialTerritory||282
 +
|-
 +
||gnd#Language||193
 +
|-
 +
||gnd#ReligiousCorporateBody||155
 +
|-
 +
||gnd#ReligiousAdministrativeUnit||134
 +
|-
 +
||gnd#HistoricSingleEventOrEra||128
 +
|}
  
          (COUNT (DISTINCT ?eventDate) as ?dateCount)
+
=== relevance of fields ===
          (MIN(?eventDate) as ?date)
+
<source lang='SPARQL'>
 +
# get histogramm data of properties by
 +
# usage frequency
 +
# WF 2020-07-12
 +
PREFIX gnd: <https://d-nb.info/standards/elementset/gnd#>
 +
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
 +
PREFIX owl: <http://www.w3.org/2002/07/owl#>
 +
PREFIX dc: <http://purl.org/dc/terms/>
 +
PREFIX wdrs: <http://www.w3.org/2007/05/powder-s#>
  
          (MIN(?eventAcronym) as ?acronym)
+
SELECT ?property (COUNT(?property) AS ?propTotal)
          (COUNT (DISTINCT ?eventAcronym) as ?acronymCount)
+
WHERE { ?s ?property ?o . }
          (GROUP_CONCAT(DISTINCT ?eventAcronym; SEPARATOR="| ") AS ?acronyms)
+
GROUP BY ?property
 
+
HAVING (?propTotal >1000)
          (MIN(?eventVariant) as ?variant)
+
ORDER BY DESC(?propTotal)
          (COUNT (DISTINCT ?eventVariant) as ?variantCount)
 
          (GROUP_CONCAT(DISTINCT ?eventVariant; SEPARATOR="| ") AS ?variants)
 
 
 
          (MIN(?eventPlace) as ?place)
 
          (COUNT (DISTINCT ?eventPlace) as ?placeCount)
 
          (GROUP_CONCAT(DISTINCT ?eventPlace; SEPARATOR="| ") AS ?places)  
 
 
 
          (MIN(?eventHomepage) as ?homepage)
 
        WHERE {
 
          ?event a gnd:ConferenceOrEvent.
 
          ?event gnd:gndIdentifier ?eventId.
 
          ?event gnd:preferredNameForTheConferenceOrEvent ?eventTitle.
 
          OPTIONAL { ?event gnd:abbreviatedNameForTheConferenceOrEvent ?eventAcronym. }
 
          OPTIONAL { ?event gnd:homepage ?eventHomepage. }
 
          OPTIONAL { ?event gnd:variantNameForTheConferenceOrEvent ?eventVariant. }
 
          OPTIONAL { ?event gnd:dateOfConferenceOrEvent ?eventDate. }
 
          OPTIONAL { ?event gnd:placeOfConferenceOrEvent ?eventPlace }
 
          # only available 3520 times 2021-12
 
          # ?event gnd:topic ?topic.
 
          # only available 12106 times 2021-12
 
          # ?event gnd:precedingConferenceOrEvent ?prec
 
          # only available 11929 times 2021-12
 
          #?event gnd:succeedingConferenceOrEvent ?succ
 
        }
 
        GROUP BY ?event ?eventId
 
 
</source>
 
</source>
== dateCardinality after import to relational database ==
+
{|class="wikitable sortable"
=== query ===
+
|+
<source lang='sql'>
+
!property!!propTotal
select count(dateCount)
+
|-
from event_gnd
+
||http://www.w3.org/2002/07/owl#sameAs||1359115
group by dateCount
+
|-
order by 1 desc
+
||http://www.w3.org/1999/02/22-rdf-syntax-ns#type||1222895
</source>
+
|-
 
+
||gnd#geographicAreaCode||1195102
=== result ===
+
|-
{| class="wikitable" style="text-align: left;"
+
||http://purl.org/dc/terms/license||1149468
|+ <!-- caption -->
+
|-
 +
||http://purl.org/dc/terms/modified||1149468
 +
|-
 +
||http://www.w3.org/2007/05/powder-s#describedby||1149468
 +
|-
 +
||gnd#gndIdentifier||1149468
 +
|-
 +
||gnd#oldAuthorityNumber||940550
 +
|-
 +
||gnd#preferredNameForTheConferenceOrEvent||836397
 +
|-
 +
||gnd#variantNameForTheConferenceOrEvent||720355
 +
|-
 +
||gnd#dateOfConferenceOrEvent||693196
 +
|-
 +
||gnd#placeOfConferenceOrEvent||650165
 +
|-
 +
||gnd#preferredNameForThePlaceOrGeographicName||313058
 +
|-
 +
||gnd#variantNameForThePlaceOrGeographicName||298383
 +
|-
 +
||gnd#gndSubjectCategory||176395
 +
|-
 +
||gnd#definition||157695
 +
|-
 +
||gnd#broaderTermInstantial||117334
 +
|-
 +
||gnd#place||82345
 +
|-
 +
||http://xmlns.com/foaf/0.1/page||74542
 +
|-
 +
||http://www.opengis.net/ont/geosparql#asWKT||58083
 +
|-
 +
||http://www.opengis.net/ont/geosparql#hasGeometry||58083
 +
|-
 +
||https://d-nb.info/standards/elementset/dnb#deprecatedUri||51576
 +
|-
 +
||gnd#biographicalOrHistoricalInformation||34190
 +
|-
 +
||gnd#organizerOrHost||34167
 +
|-
 +
||gnd#relatedDdcWithDegreeOfDeterminacy2||25452
 +
|-
 +
||gnd#succeedingPlaceOrGeographicName||25208
 +
|-
 +
||gnd#broaderTermPartitive||25111
 +
|-
 +
||gnd#dateOfEstablishment||24772
 +
|-
 +
||gnd#homepage||24439
 +
|-
 +
||gnd#precedingPlaceOrGeographicName||22490
 +
|-
 +
||gnd#dateOfTermination||21582
 +
|-
 +
||gnd#hierarchicalSuperiorOfPlaceOrGeographicName||20522
 +
|-
 +
||gnd#precedingConferenceOrEvent||17164
 +
|-
 +
||gnd#succeedingConferenceOrEvent||16963
 +
|-
 +
||gnd#dateOfProduction||15106
 +
|-
 +
||gnd#spatialAreaOfActivity||13593
 +
|-
 +
||gnd#hierarchicalSuperiorOfTheConferenceOrEvent||10126
 +
|-
 +
||gnd#architect||8206
 +
|-
 +
||gnd#topic||6088
 
|-
 
|-
! align="right"|   count(dateCount)
+
||gnd#relatedPlaceOrGeographicName||5812
 
|-
 
|-
| align="right"|             715987
+
||gnd#abbreviatedNameForTheConferenceOrEvent||5312
 
|-
 
|-
| align="right"|             23948
+
||gnd#complexSeeReferenceSubject||3927
 
|-
 
|-
| align="right"|               2815
+
||gnd#startingOrFinalPointOfADistance||3531
 
|-
 
|-
| align="right"|               141
+
||gnd#relatedConferenceOrEvent||3124
 
|-
 
|-
| align="right"|                 30
+
||gnd#relatedDdcWithDegreeOfDeterminacy4||2709
 
|-
 
|-
| align="right"|                 5
+
||gnd#relatedCorporateBody||2354
 
|-
 
|-
| align="right"|                 4
+
||gnd#relatedTerm||1728
 
|-
 
|-
| align="right"|                 3
+
||gnd#sponsorOrPatron||1281
 
|-
 
|-
| align="right"|                 1
+
||gnd#relatedDdcWithDegreeOfDeterminacy3||1200
 
|-
 
|-
| align="right"|                 1
+
||gnd#exhibitor||1044
 
|}
 
|}
}}
+
{{Fixme|todo=Unfortunately the headlines and the forms seem to be mixed up.|done=2020-06-25|by=--[[User:Wf|Wf]] ([[User talk:Wf|talk]]) 14:59, 25 June 2020 (CEST)}}
 +
 
 +
== events with most often used fields and seldom but useful fields ==
 +
<source lang='sparql'>
 +
# get events with most often used columns from GND
 +
# plus acronym, topic, homepage (seldom but useful)
 +
# WF 2020-07-12
 +
PREFIX gndi:  <https://d-nb.info/gnd>
 +
PREFIX gnd:  <https://d-nb.info/standards/elementset/gnd#>
 +
PREFIX gndo: <https://d-nb.info/standards/vocab/gnd/>
 +
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
 +
PREFIX owl: <http://www.w3.org/2002/07/owl#>
 +
PREFIX dc: <http://purl.org/dc/terms/>
 +
PREFIX wdrs: <http://www.w3.org/2007/05/powder-s#>
 +
 
 +
SELECT  ?event ?eventId ?acronym  ?variant ?name ?date ?areaCode ?place ?topic ?homepage
 +
WHERE {
 +
  ?event gnd:gndIdentifier ?eventId.
 +
  OPTIONAL { ?event gnd:abbreviatedNameForTheConferenceOrEvent ?acronym. }
 +
  OPTIONAL { ?event gnd:variantNameForTheConferenceOrEvent ?variant.}
 +
  OPTIONAL { ?event gnd:preferredNameForTheConferenceOrEvent ?name.}
 +
  OPTIONAL { ?event gnd:dateOfConferenceOrEvent ?date. }
 +
  OPTIONAL { ?event gnd:geographicAreaCode ?areaCode. }
 +
  OPTIONAL { ?event gnd:placeOfConferenceOrEvent ?place. }
 +
  OPTIONAL { ?event gnd:topic ?topic. }
 +
  OPTIONAL { ?event gnd:homepage ?homepage. }
 +
}
 +
#LIMIT 10000
 +
</source>
 +
 
 +
== Script to setup a Jena instance with GND data extract ==
 +
=== gnd2jena ===
 +
<source lang='bash'>
 +
#!/bin/bash
 +
# WF 2020-05-10
 +
 
 +
# global settings
 +
jena=apache-jena-4.4.0
 +
tgz=$jena.tar.gz
 +
#mirror=https://downloads.apache.org/jena/binaries
 +
mirror=https://archive.apache.org/dist/jena/binaries
 +
jenaurl=$mirror/$tgz
 +
base=/hd/seel/gnd
 +
#base=/hd/torterra/gnd
 +
#base=/hd/luxio/gnd
 +
data=$base/data
 +
tdbloader=$jena/bin/tdb2.tdbloader
 +
 
 +
getjena() {
 +
# download
 +
if [ ! -f $tgz ]
 +
then
 +
  echo "downloading $tgz from $jenaurl"
 +
  wget $jenaurl
 +
else
 +
  echo "$tgz already downloaded"
 +
fi
 +
# unpack
 +
if [ ! -d $jena ]
 +
then
 +
  echo "unpacking $jena from $tgz"
 +
  tar xvzf $tgz
 +
else
 +
  echo "$jena already unpacked"
 +
fi
 +
# create data directory
 +
if [ ! -d $data ]
 +
then
 +
  echo "creating $data directory"
 +
  mkdir -p $data
 +
else
 +
  echo "$data directory already created"
 +
fi
 +
}
 +
 
 +
#
 +
# show the given timestamp
 +
#
 +
timestamp() {
 +
local msg="$1"
 +
local ts=$(date -u +"%Y-%m-%dT%H:%M:%SZ")
 +
echo "$msg at $ts"
 +
}
 +
 
 +
#
 +
# load data for the given data dir and input
 +
#
 +
loaddata() {
 +
  local data="$1"
 +
  local input="$2"
 +
  timestamp "start loading $input to $data"
 +
  $tdbloader --loc "$data" "$input" > tdb2-$phase-out.log 2> tdb2-$phase-err.log
 +
  timestamp "finished loading $input to $data"
 +
}
 +
 
 +
getjena
 +
export TMPDIR=$base/tmp
 +
if [ ! -d $TMPDIR ]
 +
then
 +
  echo "creating temporary directory $TMPDIR"
 +
  mkdir $TMPDIR
 +
else
 +
  echo "using temporary directory $TMPDIR"
 +
fi
 +
for d in kongress geografikum
 +
do
 +
  file=authorities-${d}_lds.ttl
 +
  if [ ! -f $file ]
 +
  then
 +
    wget https://data.dnb.de/opendata/$file.gz
 +
    gunzip $file.gz
 +
  else
 +
    echo "$file already downloaded"
 +
  fi
 +
  loaddata $data $file
 +
done
 +
</source>
 +
 
 +
== try on confident23 server ==
 +
=== 2020-07-19 ===
 +
<source lang='bash'>
 +
wf@confident23:/usr/local/src$ sudo ./getjena
 +
apache-jena-3.16.0.tar.gz already downloaded
 +
apache-jena-3.16.0 already unpacked
 +
creating /var/data/gnd/data directory
 +
creating temporary directory /var/data/gnd/tmp
 +
start loading authorities-kongress_lds.ttl to /var/data/gnd/data at 2020-07-19T07:08:15Z
 +
finished loading authorities-kongress_lds.ttl to /var/data/gnd/data at 2020-07-19T07:11:16Z
 +
</source>
 +
=== 2020-09-15 ===
 +
<source lang='bash'>
 +
./getjena
 +
apache-jena-3.16.0.tar.gz already downloaded
 +
apache-jena-3.16.0 already unpacked
 +
creating /var/data/gnd/data directory
 +
creating temporary directory /var/data/gnd/tmp
 +
--2020-09-15 11:46:37--  https://data.dnb.de/opendata/authorities-kongress_lds.ttl.gz
 +
Resolving data.dnb.de (data.dnb.de)... 193.175.100.140
 +
Connecting to data.dnb.de (data.dnb.de)|193.175.100.140|:443... connected.
 +
HTTP request sent, awaiting response... 200 OK
 +
Length: 80381713 (77M) [application/x-gzip]
 +
Saving to: ‘authorities-kongress_lds.ttl.gz’
 +
 
 +
authorities-kongres 100%[===================>]  76.66M  66.3MB/s    in 1.2s   
 +
 
 +
2020-09-15 11:46:38 (66.3 MB/s) - ‘authorities-kongress_lds.ttl.gz’ saved [80381713/80381713]
 +
 
 +
start loading authorities-kongress_lds.ttl to /var/data/gnd/data at 2020-09-15T09:46:42Z
 +
finished loading authorities-kongress_lds.ttl to /var/data/gnd/data at 2020-09-15T09:50:25Z
 +
--2020-09-15 11:50:25--  https://data.dnb.de/opendata/authorities-geografikum_lds.ttl.gz
 +
Resolving data.dnb.de (data.dnb.de)... 193.175.100.140
 +
Connecting to data.dnb.de (data.dnb.de)|193.175.100.140|:443... connected.
 +
HTTP request sent, awaiting response... 200 OK
 +
Length: 33842092 (32M) [application/x-gzip]
 +
Saving to: ‘authorities-geografikum_lds.ttl.gz’
 +
 
 +
authorities-geograf 100%[===================>]  32.27M  53.4MB/s    in 0.6s   
 +
 
 +
2020-09-15 11:50:26 (53.4 MB/s) - ‘authorities-geografikum_lds.ttl.gz’ saved [33842092/33842092]
 +
 
 +
start loading authorities-geografikum_lds.ttl to /var/data/gnd/data at 2020-09-15T09:50:27Z
 +
finished loading authorities-geografikum_lds.ttl to /var/data/gnd/data at 2020-09-15T09:52:40Z
 +
 
 +
</source>
 +
 
 +
== start fuseki ==
 +
<source lang='bash'>
 +
#!/bin/bash
 +
# WF 2020-06-25
 +
# Jena Fuseki server installation
 +
# see https://jena.apache.org/documentation/fuseki2/fuseki-run.html
 +
version=3.16.0
 +
fuseki=apache-jena-fuseki-$version
 +
if [ ! -d $fuseki ]
 +
then
 +
if [ ! -f $fuseki.tar.gz ]
 +
then
 +
wget http://archive.apache.org/dist/jena/binaries/$fuseki.tar.gz
 +
else
 +
echo $fuseki.tar.gz already downloaded
 +
fi
 +
echo "unpacking $fuseki.tar.gz"
 +
tar xvfz $fuseki.tar.gz
 +
else
 +
echo $fuseki already downloaded and unpacked
 +
fi
 +
cd $fuseki
 +
gnddata=/var/data/gnd/data
 +
java -jar fuseki-server.jar --tdb2 --loc=$gnddata /gnd
 +
wf@confident23:/usr/local/src$
 +
</source>

Revision as of 08:37, 5 August 2022

GND de

GlossaryEntry
edit
responsible  https://www.dnb.de/DE/Professionell/Standardisierung/GND/gnd.html
state  
since  2012
description  Gemeinsame Normdatei
references  
lang  de
master  GND

Wikibase

Report "GND meets Wikibase" 2 Barbara Fischer

Confluence

Links

GND property multiplicity

Property multiplicity
property gnd total unique min max avg
eventId gnd:gndIdentifier 731651 731651 1 1 1
title gnd:preferredNameForTheConferenceOrEvent 731645 731645 0 1 0.999991799
acronym gnd:abbreviatedNameForTheConferenceOrEvent 3537 3206 0 4 0.00483
sameAs owl:sameAs 769120 693077 0 20 1.05
variant gnd:variantNameForTheConferenceOrEvent 632368 229268 0 41 0.86
date gnd:dateOfConferenceOrEvent 710819 704949 0 9 0.971
areaCode gnd:geographicAreaCode 797037 612631 0 11 1.089
place gnd:placeOfConferenceOrEvent 659305 624667 0 18 0.901
topic gnd:topic 5061 3520 0 6 0.00691
homepage gnd:homepage 19011 18702 0 3 0.026
prec gnd:precedingConferenceOrEvent 12182 12106 0 3 0.0166
succ gnd:succeedingConferenceOrEvent 11974 11929 0 3 0.0163

performance optimized query of GND event details

# GND conference/event details, one result row per event (performance optimized).
# Multi-valued properties are collapsed per event into
#   - one representative value (MIN),
#   - the number of distinct values (COUNT DISTINCT) and
#   - a "| " separated list of the distinct values (GROUP_CONCAT).
# WF 2021-12-05
PREFIX gndi:  <https://d-nb.info/gnd>
PREFIX gnd:   <https://d-nb.info/standards/elementset/gnd#>
PREFIX gndo:  <https://d-nb.info/standards/vocab/gnd/>
PREFIX rdf:   <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX owl:   <http://www.w3.org/2002/07/owl#>
PREFIX dc:    <http://purl.org/dc/terms/>
PREFIX wdrs:  <http://www.w3.org/2007/05/powder-s#>

SELECT
  ?event
  ?eventId
  (MIN(?eventTitle) AS ?title)

  # date: count and single value only, no list column
  (COUNT(DISTINCT ?eventDate) AS ?dateCount)
  (MIN(?eventDate) AS ?date)

  (MIN(?eventAcronym) AS ?acronym)
  (COUNT(DISTINCT ?eventAcronym) AS ?acronymCount)
  (GROUP_CONCAT(DISTINCT ?eventAcronym; SEPARATOR="| ") AS ?acronyms)

  (MIN(?eventVariant) AS ?variant)
  (COUNT(DISTINCT ?eventVariant) AS ?variantCount)
  (GROUP_CONCAT(DISTINCT ?eventVariant; SEPARATOR="| ") AS ?variants)

  (MIN(?eventPlace) AS ?place)
  (COUNT(DISTINCT ?eventPlace) AS ?placeCount)
  (GROUP_CONCAT(DISTINCT ?eventPlace; SEPARATOR="| ") AS ?places)

  (MIN(?eventHomepage) AS ?homepage)
WHERE {
  # mandatory: type, GND identifier and preferred name
  ?event a gnd:ConferenceOrEvent ;
         gnd:gndIdentifier ?eventId ;
         gnd:preferredNameForTheConferenceOrEvent ?eventTitle .

  # optional properties, listed in the order of the result columns
  OPTIONAL { ?event gnd:dateOfConferenceOrEvent ?eventDate . }
  OPTIONAL { ?event gnd:abbreviatedNameForTheConferenceOrEvent ?eventAcronym . }
  OPTIONAL { ?event gnd:variantNameForTheConferenceOrEvent ?eventVariant . }
  OPTIONAL { ?event gnd:placeOfConferenceOrEvent ?eventPlace . }
  OPTIONAL { ?event gnd:homepage ?eventHomepage . }

  # not queried - availability as of 2021-12:
  #   ?event gnd:topic ?topic                         (only 3520 times)
  #   ?event gnd:precedingConferenceOrEvent ?prec     (only 12106 times)
  #   ?event gnd:succeedingConferenceOrEvent ?succ    (only 11929 times)
}
GROUP BY ?event ?eventId

query to analyze multiplicity

# get aggregate counts of property usage
# this query needs to be modified property by property and run twice
# once without the having clause and once with the having clause
# to create the "Property multiplicity" table on this wiki page
# NOTE(review): presumably the run with the HAVING clause yields the
# "unique" column of that table - confirm
# WF 2021-12-05
PREFIX gndi:  <https://d-nb.info/gnd>
PREFIX gnd:  <https://d-nb.info/standards/elementset/gnd#>
PREFIX gndo: <https://d-nb.info/standards/vocab/gnd/>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX dc: <http://purl.org/dc/terms/>
PREFIX wdrs: <http://www.w3.org/2007/05/powder-s#>

# outer query: condense the per-event counts of the subquery into
# a single row with sum, minimum, maximum and average
SELECT (sum (?itemCount) as ?sum) (min (?itemCount) as ?min) (max (?itemCount) as ?max) (avg (?itemCount) as ?avg)  {
  # inner query: one row per event with the number of values bound for
  # the analyzed property; the count is 0 when the OPTIONAL does not match
  SELECT  ?event ?eventId (count(?title) as ?itemCount)
  WHERE {
    ?event a gnd:ConferenceOrEvent.
    ?event gnd:gndIdentifier ?eventId.
    OPTIONAL {
      # exactly one of the following patterns is meant to be active;
      # when switching the property, the variable in count(...) above and
      # in the HAVING clause below has to be changed accordingly
      # ?event gnd:abbreviatedNameForTheConferenceOrEvent ?acronym. 
      # ?event owl:sameAs ?sameAs.
      # ?event gnd:variantNameForTheConferenceOrEvent ?variant.
      ?event gnd:preferredNameForTheConferenceOrEvent ?title.
      # ?event gnd:dateOfConferenceOrEvent ?date
      # ?event gnd:geographicAreaCode ?areaCode.
      # ?event gnd:placeOfConferenceOrEvent ?place.
      # ?event gnd:topic ?topic.
      # ?event gnd:homepage ?homepage. 
      # ?event gnd:precedingConferenceOrEvent ?prec
      # ?event gnd:succeedingConferenceOrEvent ?succ
    }
  }
  GROUP BY ?event ?eventId
  # second run: uncomment to keep only the events with exactly one value
  #HAVING(COUNT(?title) = 1)
}

SPARQL Queries

entities and usage frequency

# get histogram data of entities (the classes used as rdf:type)
# by usage frequency, most frequent class first;
# only classes with more than 100 instances are listed
# WF 2020-06-27
PREFIX gnd: <https://d-nb.info/standards/elementset/gnd#>

SELECT ?c (COUNT(?c) AS ?count)
WHERE {
  ?subject a ?c
}
GROUP BY ?c
# the aggregate is repeated instead of referring to ?count: in standard
# SPARQL 1.1 a SELECT alias is bound only after HAVING has been evaluated
# (it is available to ORDER BY, though)
HAVING (COUNT(?c) > 100)
ORDER BY DESC(?count)
c count
gnd#ConferenceOrEvent 713310
gnd#TerritorialCorporateBodyOrAdministrativeUnit 188246
gnd#SeriesOfConferenceOrEvent 122970
gnd#BuildingOrMemorial 67149
http://www.opengis.net/ont/sf#Point 57987
gnd#PlaceOrGeographicName 27771
gnd#NaturalGeographicUnit 20269
gnd#AdministrativeUnit 12846
gnd#WayBorderOrLine 4971
gnd#ReligiousTerritory 2646
gnd#NameOfSmallGeographicUnitLyingWithinAnotherGeographicUnit 2113
gnd#CorporateBody 559
gnd#MemberState 543
gnd#Country 307
gnd#ExtraterrestrialTerritory 282
gnd#Language 193
gnd#ReligiousCorporateBody 155
gnd#ReligiousAdministrativeUnit 134
gnd#HistoricSingleEventOrEra 128

relevance of fields

# get histogram data of properties by usage frequency,
# most frequent property first;
# only properties used in more than 1000 triples are listed
# WF 2020-07-12
PREFIX gnd: <https://d-nb.info/standards/elementset/gnd#>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX dc: <http://purl.org/dc/terms/>
PREFIX wdrs: <http://www.w3.org/2007/05/powder-s#>

SELECT ?property (COUNT(?property) AS ?propTotal)
WHERE { ?s ?property ?o . }
GROUP BY ?property
# the aggregate is repeated instead of referring to ?propTotal: in standard
# SPARQL 1.1 a SELECT alias is bound only after HAVING has been evaluated
# (it is available to ORDER BY, though)
HAVING (COUNT(?property) > 1000)
ORDER BY DESC(?propTotal)
property propTotal
http://www.w3.org/2002/07/owl#sameAs 1359115
http://www.w3.org/1999/02/22-rdf-syntax-ns#type 1222895
gnd#geographicAreaCode 1195102
http://purl.org/dc/terms/license 1149468
http://purl.org/dc/terms/modified 1149468
http://www.w3.org/2007/05/powder-s#describedby 1149468
gnd#gndIdentifier 1149468
gnd#oldAuthorityNumber 940550
gnd#preferredNameForTheConferenceOrEvent 836397
gnd#variantNameForTheConferenceOrEvent 720355
gnd#dateOfConferenceOrEvent 693196
gnd#placeOfConferenceOrEvent 650165
gnd#preferredNameForThePlaceOrGeographicName 313058
gnd#variantNameForThePlaceOrGeographicName 298383
gnd#gndSubjectCategory 176395
gnd#definition 157695
gnd#broaderTermInstantial 117334
gnd#place 82345
http://xmlns.com/foaf/0.1/page 74542
http://www.opengis.net/ont/geosparql#asWKT 58083
http://www.opengis.net/ont/geosparql#hasGeometry 58083
https://d-nb.info/standards/elementset/dnb#deprecatedUri 51576
gnd#biographicalOrHistoricalInformation 34190
gnd#organizerOrHost 34167
gnd#relatedDdcWithDegreeOfDeterminacy2 25452
gnd#succeedingPlaceOrGeographicName 25208
gnd#broaderTermPartitive 25111
gnd#dateOfEstablishment 24772
gnd#homepage 24439
gnd#precedingPlaceOrGeographicName 22490
gnd#dateOfTermination 21582
gnd#hierarchicalSuperiorOfPlaceOrGeographicName 20522
gnd#precedingConferenceOrEvent 17164
gnd#succeedingConferenceOrEvent 16963
gnd#dateOfProduction 15106
gnd#spatialAreaOfActivity 13593
gnd#hierarchicalSuperiorOfTheConferenceOrEvent 10126
gnd#architect 8206
gnd#topic 6088
gnd#relatedPlaceOrGeographicName 5812
gnd#abbreviatedNameForTheConferenceOrEvent 5312
gnd#complexSeeReferenceSubject 3927
gnd#startingOrFinalPointOfADistance 3531
gnd#relatedConferenceOrEvent 3124
gnd#relatedDdcWithDegreeOfDeterminacy4 2709
gnd#relatedCorporateBody 2354
gnd#relatedTerm 1728
gnd#sponsorOrPatron 1281
gnd#relatedDdcWithDegreeOfDeterminacy3 1200
gnd#exhibitor 1044
done 2020-06-25
todo Unfortunately the headlines and the forms seem to be mixed up.


events with most often used fields and seldom but useful fields

# get events with most often used columns from GND
# plus acronym, topic, homepage (seldom but useful)
# WF 2020-07-12
#
# Only the GND identifier is mandatory, every other column is OPTIONAL.
# The result is not aggregated: an event with several values for a property
# gets one row per value, and several multi-valued properties multiply
# the number of rows for that event.
# NOTE(review): there is no "?event a gnd:ConferenceOrEvent" pattern, so
# every resource with a gnd:gndIdentifier is returned - confirm whether
# a restriction to events is intended.
PREFIX gndi:  <https://d-nb.info/gnd>
PREFIX gnd:  <https://d-nb.info/standards/elementset/gnd#>
PREFIX gndo: <https://d-nb.info/standards/vocab/gnd/>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX dc: <http://purl.org/dc/terms/>
PREFIX wdrs: <http://www.w3.org/2007/05/powder-s#>

SELECT  ?event ?eventId ?acronym  ?variant ?name ?date ?areaCode ?place ?topic ?homepage 
WHERE {
  ?event gnd:gndIdentifier ?eventId.
  OPTIONAL { ?event gnd:abbreviatedNameForTheConferenceOrEvent ?acronym. }
  OPTIONAL { ?event gnd:variantNameForTheConferenceOrEvent ?variant.}
  OPTIONAL { ?event gnd:preferredNameForTheConferenceOrEvent ?name.}
  OPTIONAL { ?event gnd:dateOfConferenceOrEvent ?date. }
  OPTIONAL { ?event gnd:geographicAreaCode ?areaCode. }
  OPTIONAL { ?event gnd:placeOfConferenceOrEvent ?place. }
  OPTIONAL { ?event gnd:topic ?topic. }
  OPTIONAL { ?event gnd:homepage ?homepage. }
}
# remove the comment sign to restrict the size of the result
#LIMIT 10000

Script to setup a Jena instance with GND data extract

gnd2jena

#!/bin/bash
# WF 2020-05-10

# global settings
# name of the Jena distribution; also the directory the archive unpacks to
jena=apache-jena-4.4.0
# file name of the distribution archive
tgz=$jena.tar.gz
# download location of the archive
#mirror=https://downloads.apache.org/jena/binaries
mirror=https://archive.apache.org/dist/jena/binaries
jenaurl=$mirror/$tgz
# base directory for the data and tmp directories;
# the commented lines are alternative locations
base=/hd/seel/gnd
#base=/hd/torterra/gnd
#base=/hd/luxio/gnd
# directory handed to the loader via --loc
data=$base/data
# loader command inside the unpacked distribution
tdbloader=$jena/bin/tdb2.tdbloader

#
# download and unpack Apache Jena and create the data directory
#
# works on the globals tgz, jenaurl, jena and data
# each step is skipped when its result already exists
# returns non-zero as soon as one of the steps fails
#
getjena() {
  # download
  if [ ! -f "$tgz" ]
  then
    echo "downloading $tgz from $jenaurl"
    # without the archive there is nothing to unpack
    wget "$jenaurl" || return 1
  else
    echo "$tgz already downloaded"
  fi
  # unpack
  if [ ! -d "$jena" ]
  then
    echo "unpacking $jena from $tgz"
    tar xvzf "$tgz" || return 1
  else
    echo "$jena already unpacked"
  fi
  # create data directory
  # the expansions are quoted so that paths containing blanks work
  if [ ! -d "$data" ]
  then
    echo "creating $data directory"
    mkdir -p "$data" || return 1
  else
    echo "$data directory already created"
  fi
}

#
# print the given message followed by " at " and the current
# UTC time formatted as YYYY-MM-DDTHH:MM:SSZ
#   $1: the message to print
#
timestamp() {
  local text="$1"
  local now
  now=$(date -u +"%Y-%m-%dT%H:%M:%SZ")
  printf '%s\n' "$text at $now"
}

#
# load data for the given data dir and input
#   $1: database directory, handed to the loader via --loc
#   $2: input file to load
#
# stdout and stderr of the loader are written to tdb2-<phase>-out.log and
# tdb2-<phase>-err.log in the current directory. <phase> is the global
# $phase when the caller has set one, otherwise the name of the input file
# without directory and .ttl extension.
# returns the exit status of the loader
#
loaddata() {
  local data="$1"
  local input="$2"
  # $phase is not assigned anywhere in this script; with an empty value all
  # inputs shared tdb2--out.log/tdb2--err.log and each load overwrote the
  # logs of the previous one
  local logtag="${phase:-$(basename "$input" .ttl)}"
  local status
  timestamp "start loading $input to $data"
  $tdbloader --loc "$data" "$input" > "tdb2-$logtag-out.log" 2> "tdb2-$logtag-err.log"
  status=$?
  timestamp "finished loading $input to $data"
  return $status
}

# main part: install Jena, prepare the temporary directory,
# then download and load the GND dumps one after the other
getjena
# exported so that the processes started below inherit the setting
export TMPDIR="$base/tmp"
if [ ! -d "$TMPDIR" ]
then
  echo "creating temporary directory $TMPDIR"
  mkdir -p "$TMPDIR"
else
  echo "using temporary directory $TMPDIR"
fi
for d in kongress geografikum
do
  file=authorities-${d}_lds.ttl
  if [ ! -f "$file" ]
  then
    # do not start the loader on a file that could not be
    # downloaded or decompressed
    if ! { wget "https://data.dnb.de/opendata/$file.gz" && gunzip "$file.gz"; }
    then
      echo "could not download and unpack $file.gz - skipping $file" 1>&2
      continue
    fi
  else
    echo "$file already downloaded"
  fi
  loaddata "$data" "$file"
done

try on confident23 server

2020-07-19

wf@confident23:/usr/local/src$ sudo ./getjena 
apache-jena-3.16.0.tar.gz already downloaded
apache-jena-3.16.0 already unpacked
creating /var/data/gnd/data directory
creating temporary directory /var/data/gnd/tmp
start loading authorities-kongress_lds.ttl to /var/data/gnd/data at 2020-07-19T07:08:15Z
finished loading authorities-kongress_lds.ttl to /var/data/gnd/data at 2020-07-19T07:11:16Z

2020-09-15

./getjena 
apache-jena-3.16.0.tar.gz already downloaded
apache-jena-3.16.0 already unpacked
creating /var/data/gnd/data directory
creating temporary directory /var/data/gnd/tmp
--2020-09-15 11:46:37--  https://data.dnb.de/opendata/authorities-kongress_lds.ttl.gz
Resolving data.dnb.de (data.dnb.de)... 193.175.100.140
Connecting to data.dnb.de (data.dnb.de)|193.175.100.140|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 80381713 (77M) [application/x-gzip]
Saving to: ‘authorities-kongress_lds.ttl.gz’

authorities-kongres 100%[===================>]  76.66M  66.3MB/s    in 1.2s    

2020-09-15 11:46:38 (66.3 MB/s) - ‘authorities-kongress_lds.ttl.gz’ saved [80381713/80381713]

start loading authorities-kongress_lds.ttl to /var/data/gnd/data at 2020-09-15T09:46:42Z
finished loading authorities-kongress_lds.ttl to /var/data/gnd/data at 2020-09-15T09:50:25Z
--2020-09-15 11:50:25--  https://data.dnb.de/opendata/authorities-geografikum_lds.ttl.gz
Resolving data.dnb.de (data.dnb.de)... 193.175.100.140
Connecting to data.dnb.de (data.dnb.de)|193.175.100.140|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 33842092 (32M) [application/x-gzip]
Saving to: ‘authorities-geografikum_lds.ttl.gz’

authorities-geograf 100%[===================>]  32.27M  53.4MB/s    in 0.6s    

2020-09-15 11:50:26 (53.4 MB/s) - ‘authorities-geografikum_lds.ttl.gz’ saved [33842092/33842092]

start loading authorities-geografikum_lds.ttl to /var/data/gnd/data at 2020-09-15T09:50:27Z
finished loading authorities-geografikum_lds.ttl to /var/data/gnd/data at 2020-09-15T09:52:40Z

start fuseki

#!/bin/bash
# WF 2020-06-25
# Jena Fuseki server installation
# see https://jena.apache.org/documentation/fuseki2/fuseki-run.html
#
# downloads and unpacks the Fuseki distribution into the current directory
# (each step is skipped when its result already exists), then starts the
# server with the TDB2 database in $gnddata under the dataset name /gnd
version=3.16.0
fuseki=apache-jena-fuseki-$version
if [ ! -d "$fuseki" ]
then
  if [ ! -f "$fuseki.tar.gz" ]
  then
    # stop here on failure: there would be nothing to unpack
    wget "https://archive.apache.org/dist/jena/binaries/$fuseki.tar.gz" || exit 1
  else
    echo "$fuseki.tar.gz already downloaded"
  fi
  echo "unpacking $fuseki.tar.gz"
  tar xvfz "$fuseki.tar.gz" || exit 1
else
  echo "$fuseki already downloaded and unpacked"
fi
# fuseki-server.jar is addressed relative to the distribution directory,
# so the server must not be started when the directory change fails
cd "$fuseki" || exit 1
gnddata=/var/data/gnd/data
java -jar fuseki-server.jar --tdb2 --loc="$gnddata" /gnd