Truly Tabular RDF/GND: Difference between revisions

From BITPlan Wiki
Jump to navigation Jump to search
No edit summary
No edit summary
Line 20: Line 20:
* https://blog.lobid.org/2018/08/27/openrefine.html
* https://blog.lobid.org/2018/08/27/openrefine.html
== GND property multiplicity ==
== GND property multiplicity ==
The following table shows the cardinality/multiplicity of properties per conference record. E.g. there are up to four different acronyms for a conference.
{| class="wikitable"
{| class="wikitable"
|+ Property multiplicity
|+ Property multiplicity
Line 122: Line 121:
| style="                "| 0.0163
| style="                "| 0.0163
|}
|}
=== performance optimized query of GND event details ===
<source lang='sparql'>
# performance optimized query of GND event details
# with aggregated properties as single, count and | separated list column
# WF 2021-12-05
PREFIX gndi:  <https://d-nb.info/gnd>
PREFIX gnd:  <https://d-nb.info/standards/elementset/gnd#>
PREFIX gndo: <https://d-nb.info/standards/vocab/gnd/>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX dc: <http://purl.org/dc/terms/>
PREFIX wdrs: <http://www.w3.org/2007/05/powder-s#>
SELECT 
  ?event
  ?eventId 
  (MIN(?eventTitle) as ?title)
  (COUNT (DISTINCT ?eventDate) as ?dateCount)
  (MIN(?eventDate) as ?date)
  (MIN(?eventAcronym) as ?acronym)
  (COUNT (DISTINCT ?eventAcronym) as ?acronymCount)
  (GROUP_CONCAT(DISTINCT ?eventAcronym; SEPARATOR="| ") AS ?acronyms)
  (MIN(?eventVariant) as ?variant)
  (COUNT (DISTINCT ?eventVariant) as ?variantCount)
  (GROUP_CONCAT(DISTINCT ?eventVariant; SEPARATOR="| ") AS ?variants)
  (MIN(?eventPlace) as ?place)
  (COUNT (DISTINCT ?eventPlace) as ?placeCount)
  (GROUP_CONCAT(DISTINCT ?eventPlace; SEPARATOR="| ") AS ?places)
  (MIN(?eventHomepage) as ?homepage)
WHERE {
  ?event a gnd:ConferenceOrEvent.
  ?event gnd:gndIdentifier ?eventId.
  ?event gnd:preferredNameForTheConferenceOrEvent ?eventTitle.
  OPTIONAL { ?event gnd:abbreviatedNameForTheConferenceOrEvent ?eventAcronym. }
  OPTIONAL { ?event gnd:homepage ?eventHomepage. }
  OPTIONAL { ?event gnd:variantNameForTheConferenceOrEvent ?eventVariant. }
  OPTIONAL { ?event gnd:dateOfConferenceOrEvent ?eventDate. }
  OPTIONAL { ?event gnd:placeOfConferenceOrEvent ?eventPlace }
  # only available 3520 times 2021-12
  # ?event gnd:topic ?topic.
  # only available 12106 times 2021-12
  # ?event gnd:precedingConferenceOrEvent ?prec
  # only available 11929 times 2021-12
  #?event gnd:succeedingConferenceOrEvent ?succ
}
GROUP BY ?event ?eventId
</source>
=== query to analyze multiplicity ===
=== query to analyze multiplicity ===
<source lang='sparql'>
<source lang='sparql'>
Line 161: Line 212:
</source>
</source>


== GND Query ==
== SPARQL Queries ==
<source lang='sparql'>
=== entities and usage frequency ===
# performance optimized query of GND event details
<source lang='SPARQL'>
        # with aggregated properties as single, count and | separated list column
# get histogram data of entities by
        # WF 2021-12-05
# usage frequency
        PREFIX gndi:  <https://d-nb.info/gnd>
# WF 2020-06-27
        PREFIX gnd: <https://d-nb.info/standards/elementset/gnd#>
PREFIX gnd: <https://d-nb.info/standards/elementset/gnd#>
        PREFIX gndo: <https://d-nb.info/standards/vocab/gnd/>
        PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
        PREFIX owl: <http://www.w3.org/2002/07/owl#>
        PREFIX dc: <http://purl.org/dc/terms/>
        PREFIX wdrs: <http://www.w3.org/2007/05/powder-s#>


        SELECT   
SELECT ?c (COUNT(?c) AS ?count)
          ?event
WHERE {
          ?eventId 
  ?subject a  ?c
          (MIN(?eventTitle) as ?fulltitle)
}
GROUP BY ?c
HAVING (?count >100)
ORDER BY DESC(?count)
</source>
{|class="wikitable sortable"
|+
!c!!count
|-
||gnd#ConferenceOrEvent||713310
|-
||gnd#TerritorialCorporateBodyOrAdministrativeUnit||188246
|-
||gnd#SeriesOfConferenceOrEvent||122970
|-
||gnd#BuildingOrMemorial||67149
|-
||http://www.opengis.net/ont/sf#Point||57987
|-
||gnd#PlaceOrGeographicName||27771
|-
||gnd#NaturalGeographicUnit||20269
|-
||gnd#AdministrativeUnit||12846
|-
||gnd#WayBorderOrLine||4971
|-
||gnd#ReligiousTerritory||2646
|-
||gnd#NameOfSmallGeographicUnitLyingWithinAnotherGeographicUnit||2113
|-
||gnd#CorporateBody||559
|-
||gnd#MemberState||543
|-
||gnd#Country||307
|-
||gnd#ExtraterrestrialTerritory||282
|-
||gnd#Language||193
|-
||gnd#ReligiousCorporateBody||155
|-
||gnd#ReligiousAdministrativeUnit||134
|-
||gnd#HistoricSingleEventOrEra||128
|}


          (COUNT (DISTINCT ?eventDate) as ?dateCount)
=== relevance of fields ===
          (MIN(?eventDate) as ?date)
<source lang='SPARQL'>
# get histogram data of properties by
# usage frequency
# WF 2020-07-12
PREFIX gnd: <https://d-nb.info/standards/elementset/gnd#>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX dc: <http://purl.org/dc/terms/>
PREFIX wdrs: <http://www.w3.org/2007/05/powder-s#>


          (MIN(?eventAcronym) as ?acronym)
SELECT ?property (COUNT(?property) AS ?propTotal)
          (COUNT (DISTINCT ?eventAcronym) as ?acronymCount)
WHERE { ?s ?property ?o . }
          (GROUP_CONCAT(DISTINCT ?eventAcronym; SEPARATOR="| ") AS ?acronyms)
GROUP BY ?property
 
HAVING (?propTotal >1000)
          (MIN(?eventVariant) as ?variant)
ORDER BY DESC(?propTotal)
          (COUNT (DISTINCT ?eventVariant) as ?variantCount)
          (GROUP_CONCAT(DISTINCT ?eventVariant; SEPARATOR="| ") AS ?variants)
 
          (MIN(?eventPlace) as ?place)
          (COUNT (DISTINCT ?eventPlace) as ?placeCount)
          (GROUP_CONCAT(DISTINCT ?eventPlace; SEPARATOR="| ") AS ?places)  
 
          (MIN(?eventHomepage) as ?homepage)
        WHERE {
          ?event a gnd:ConferenceOrEvent.
          ?event gnd:gndIdentifier ?eventId.
          ?event gnd:preferredNameForTheConferenceOrEvent ?eventTitle.
          OPTIONAL { ?event gnd:abbreviatedNameForTheConferenceOrEvent ?eventAcronym. }
          OPTIONAL { ?event gnd:homepage ?eventHomepage. }
          OPTIONAL { ?event gnd:variantNameForTheConferenceOrEvent ?eventVariant. }
          OPTIONAL { ?event gnd:dateOfConferenceOrEvent ?eventDate. }
          OPTIONAL { ?event gnd:placeOfConferenceOrEvent ?eventPlace }
          # only available 3520 times 2021-12
          # ?event gnd:topic ?topic.
          # only available 12106 times 2021-12
          # ?event gnd:precedingConferenceOrEvent ?prec
          # only available 11929 times 2021-12
          #?event gnd:succeedingConferenceOrEvent ?succ
        }
        GROUP BY ?event ?eventId
</source>
</source>
== dateCardinality after import to relational database ==
{|class="wikitable sortable"
=== query ===
|+
<source lang='sql'>
!property!!propTotal
select count(dateCount)
|-
from event_gnd
||http://www.w3.org/2002/07/owl#sameAs||1359115
group by dateCount
|-
order by 1 desc
||http://www.w3.org/1999/02/22-rdf-syntax-ns#type||1222895
</source>
|-
 
||gnd#geographicAreaCode||1195102
=== result ===
|-
{| class="wikitable" style="text-align: left;"
||http://purl.org/dc/terms/license||1149468
|+ <!-- caption -->
|-
||http://purl.org/dc/terms/modified||1149468
|-
||http://www.w3.org/2007/05/powder-s#describedby||1149468
|-
||gnd#gndIdentifier||1149468
|-
||gnd#oldAuthorityNumber||940550
|-
||gnd#preferredNameForTheConferenceOrEvent||836397
|-
||gnd#variantNameForTheConferenceOrEvent||720355
|-
||gnd#dateOfConferenceOrEvent||693196
|-
||gnd#placeOfConferenceOrEvent||650165
|-
||gnd#preferredNameForThePlaceOrGeographicName||313058
|-
||gnd#variantNameForThePlaceOrGeographicName||298383
|-
||gnd#gndSubjectCategory||176395
|-
||gnd#definition||157695
|-
||gnd#broaderTermInstantial||117334
|-
||gnd#place||82345
|-
||http://xmlns.com/foaf/0.1/page||74542
|-
||http://www.opengis.net/ont/geosparql#asWKT||58083
|-
||http://www.opengis.net/ont/geosparql#hasGeometry||58083
|-
||https://d-nb.info/standards/elementset/dnb#deprecatedUri||51576
|-
||gnd#biographicalOrHistoricalInformation||34190
|-
||gnd#organizerOrHost||34167
|-
||gnd#relatedDdcWithDegreeOfDeterminacy2||25452
|-
||gnd#succeedingPlaceOrGeographicName||25208
|-
||gnd#broaderTermPartitive||25111
|-
||gnd#dateOfEstablishment||24772
|-
||gnd#homepage||24439
|-
||gnd#precedingPlaceOrGeographicName||22490
|-
||gnd#dateOfTermination||21582
|-
||gnd#hierarchicalSuperiorOfPlaceOrGeographicName||20522
|-
||gnd#precedingConferenceOrEvent||17164
|-
||gnd#succeedingConferenceOrEvent||16963
|-
||gnd#dateOfProduction||15106
|-
||gnd#spatialAreaOfActivity||13593
|-
||gnd#hierarchicalSuperiorOfTheConferenceOrEvent||10126
|-
||gnd#architect||8206
|-
||gnd#topic||6088
|-
|-
! align="right"|   count(dateCount)
||gnd#relatedPlaceOrGeographicName||5812
|-
|-
| align="right"|             715987
||gnd#abbreviatedNameForTheConferenceOrEvent||5312
|-
|-
| align="right"|             23948
||gnd#complexSeeReferenceSubject||3927
|-
|-
| align="right"|               2815
||gnd#startingOrFinalPointOfADistance||3531
|-
|-
| align="right"|               141
||gnd#relatedConferenceOrEvent||3124
|-
|-
| align="right"|                 30
||gnd#relatedDdcWithDegreeOfDeterminacy4||2709
|-
|-
| align="right"|                 5
||gnd#relatedCorporateBody||2354
|-
|-
| align="right"|                 4
||gnd#relatedTerm||1728
|-
|-
| align="right"|                 3
||gnd#sponsorOrPatron||1281
|-
|-
| align="right"|                 1
||gnd#relatedDdcWithDegreeOfDeterminacy3||1200
|-
|-
| align="right"|                 1
||gnd#exhibitor||1044
|}
|}
}}
{{Fixme|todo=Unfortunately the headlines and the forms seem to be mixed up.|done=2020-06-25|by=--[[User:Wf|Wf]] ([[User talk:Wf|talk]]) 14:59, 25 June 2020 (CEST)}}
 
== events with most often used fields and seldom but useful fields ==
<source lang='sparql'>
# get events with most often used columns from GND
# plus acronym, topic, homepage (seldom but useful)
# WF 2020-07-12
PREFIX gndi:  <https://d-nb.info/gnd>
PREFIX gnd:  <https://d-nb.info/standards/elementset/gnd#>
PREFIX gndo: <https://d-nb.info/standards/vocab/gnd/>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX dc: <http://purl.org/dc/terms/>
PREFIX wdrs: <http://www.w3.org/2007/05/powder-s#>
 
SELECT  ?event ?eventId ?acronym  ?variant ?name ?date ?areaCode ?place ?topic ?homepage
WHERE {
  ?event gnd:gndIdentifier ?eventId.
  OPTIONAL { ?event gnd:abbreviatedNameForTheConferenceOrEvent ?acronym. }
  OPTIONAL { ?event gnd:variantNameForTheConferenceOrEvent ?variant.}
  OPTIONAL { ?event gnd:preferredNameForTheConferenceOrEvent ?name.}
  OPTIONAL { ?event gnd:dateOfConferenceOrEvent ?date. }
  OPTIONAL { ?event gnd:geographicAreaCode ?areaCode. }
  OPTIONAL { ?event gnd:placeOfConferenceOrEvent ?place. }
  OPTIONAL { ?event gnd:topic ?topic. }
  OPTIONAL { ?event gnd:homepage ?homepage. }
}
#LIMIT 10000
</source>
 
== Script to setup a Jena instance with GND data extract ==
=== gnd2jena ===
<source lang='bash'>
#!/bin/bash
# WF 2020-05-10
 
# global settings
jena=apache-jena-4.4.0
tgz=$jena.tar.gz
#mirror=https://downloads.apache.org/jena/binaries
mirror=https://archive.apache.org/dist/jena/binaries
jenaurl=$mirror/$tgz
base=/hd/seel/gnd
#base=/hd/torterra/gnd
#base=/hd/luxio/gnd
data=$base/data
tdbloader=$jena/bin/tdb2.tdbloader
 
#
# download and unpack Apache Jena and create the data directory
# uses the global settings tgz, jenaurl, jena and data
# returns 1 if the download or the unpacking fails
#
getjena() {
  # download
  if [ ! -f "$tgz" ]
  then
    echo "downloading $tgz from $jenaurl"
    if ! wget "$jenaurl"
    then
      echo "download of $jenaurl failed" 1>&2
      # do not keep a partial archive - the next run would
      # report it as already downloaded
      rm -f "$tgz"
      return 1
    fi
  else
    echo "$tgz already downloaded"
  fi
  # unpack
  if [ ! -d "$jena" ]
  then
    echo "unpacking $jena from $tgz"
    if ! tar xvzf "$tgz"
    then
      echo "unpacking $tgz failed" 1>&2
      return 1
    fi
  else
    echo "$jena already unpacked"
  fi
  # create data directory
  if [ ! -d "$data" ]
  then
    echo "creating $data directory"
    mkdir -p "$data"
  else
    echo "$data directory already created"
  fi
}
 
#
# show the given timestamp
#
timestamp() {
  # $1: message to print in front of the current UTC time
  local now
  now=$(date -u +"%Y-%m-%dT%H:%M:%SZ")
  echo "$1 at $now"
}
 
#
# load data for the given data dir and input
#
loaddata() {
  # $1: TDB2 location to load into
  # $2: input file to load
  local data="$1"
  local input="$2"
  # the log files are named after the load phase; nothing in this script
  # sets $phase, so fall back to the input file name without its .ttl
  # suffix - otherwise every load overwrites tdb2--out.log/tdb2--err.log
  local logtag="${phase:-$(basename "$input" .ttl)}"
  timestamp "start loading $input to $data"
  "$tdbloader" --loc "$data" "$input" > "tdb2-$logtag-out.log" 2> "tdb2-$logtag-err.log"
  timestamp "finished loading $input to $data"
}
 
# install Jena first - stop if that did not work
getjena || exit 1
# exported so that child processes see the temporary directory setting
export TMPDIR="$base/tmp"
if [ ! -d "$TMPDIR" ]
then
  echo "creating temporary directory $TMPDIR"
  mkdir -p "$TMPDIR"
else
  echo "using temporary directory $TMPDIR"
fi
# download (if necessary) and load the GND extracts
for d in kongress geografikum
do
  file=authorities-${d}_lds.ttl
  if [ ! -f "$file" ]
  then
    # only unpack after a successful download and never try to load
    # a file that could not be fetched
    if ! { wget "https://data.dnb.de/opendata/$file.gz" && gunzip "$file.gz"; }
    then
      echo "could not download and unpack $file.gz" 1>&2
      exit 1
    fi
  else
    echo "$file already downloaded"
  fi
  loaddata "$data" "$file"
done
</source>
 
== try on confident23 server ==
=== 2020-07-19 ===
<source lang='bash'>
wf@confident23:/usr/local/src$ sudo ./getjena
apache-jena-3.16.0.tar.gz already downloaded
apache-jena-3.16.0 already unpacked
creating /var/data/gnd/data directory
creating temporary directory /var/data/gnd/tmp
start loading authorities-kongress_lds.ttl to /var/data/gnd/data at 2020-07-19T07:08:15Z
finished loading authorities-kongress_lds.ttl to /var/data/gnd/data at 2020-07-19T07:11:16Z
</source>
=== 2020-09-15 ===
<source lang='bash'>
./getjena
apache-jena-3.16.0.tar.gz already downloaded
apache-jena-3.16.0 already unpacked
creating /var/data/gnd/data directory
creating temporary directory /var/data/gnd/tmp
--2020-09-15 11:46:37--  https://data.dnb.de/opendata/authorities-kongress_lds.ttl.gz
Resolving data.dnb.de (data.dnb.de)... 193.175.100.140
Connecting to data.dnb.de (data.dnb.de)|193.175.100.140|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 80381713 (77M) [application/x-gzip]
Saving to: ‘authorities-kongress_lds.ttl.gz’
 
authorities-kongres 100%[===================>]  76.66M  66.3MB/s    in 1.2s   
 
2020-09-15 11:46:38 (66.3 MB/s) - ‘authorities-kongress_lds.ttl.gz’ saved [80381713/80381713]
 
start loading authorities-kongress_lds.ttl to /var/data/gnd/data at 2020-09-15T09:46:42Z
finished loading authorities-kongress_lds.ttl to /var/data/gnd/data at 2020-09-15T09:50:25Z
--2020-09-15 11:50:25--  https://data.dnb.de/opendata/authorities-geografikum_lds.ttl.gz
Resolving data.dnb.de (data.dnb.de)... 193.175.100.140
Connecting to data.dnb.de (data.dnb.de)|193.175.100.140|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 33842092 (32M) [application/x-gzip]
Saving to: ‘authorities-geografikum_lds.ttl.gz’
 
authorities-geograf 100%[===================>]  32.27M  53.4MB/s    in 0.6s   
 
2020-09-15 11:50:26 (53.4 MB/s) - ‘authorities-geografikum_lds.ttl.gz’ saved [33842092/33842092]
 
start loading authorities-geografikum_lds.ttl to /var/data/gnd/data at 2020-09-15T09:50:27Z
finished loading authorities-geografikum_lds.ttl to /var/data/gnd/data at 2020-09-15T09:52:40Z
 
</source>
 
== start fuseki ==
<source lang='bash'>
#!/bin/bash
# WF 2020-06-25
# Jena Fuseki server installation
# see https://jena.apache.org/documentation/fuseki2/fuseki-run.html
# NOTE(review): the gnd2jena script on this page installs apache-jena-4.4.0 -
# confirm that this Fuseki version can read the TDB2 store it creates
version=3.16.0
fuseki=apache-jena-fuseki-$version
if [ ! -d "$fuseki" ]
then
  if [ ! -f "$fuseki.tar.gz" ]
  then
    # abort on a failed download instead of trying to unpack a missing archive
    wget "https://archive.apache.org/dist/jena/binaries/$fuseki.tar.gz" || exit 1
  else
    echo "$fuseki.tar.gz already downloaded"
  fi
  echo "unpacking $fuseki.tar.gz"
  tar xvfz "$fuseki.tar.gz" || exit 1
else
  echo "$fuseki already downloaded and unpacked"
fi
# do not try to start the server from the wrong directory
cd "$fuseki" || exit 1
gnddata=/var/data/gnd/data
# start the server with the TDB2 location $gnddata under the name /gnd
java -jar fuseki-server.jar --tdb2 --loc="$gnddata" /gnd
</source>

Revision as of 06:37, 5 August 2022

GND de

GlossaryEntry
edit
responsible  https://www.dnb.de/DE/Professionell/Standardisierung/GND/gnd.html
state  
since  2012
description  Gemeinsame Normdatei
references  
lang  de
master  GND

Wikibase

Report "GND meets Wikibase" 2 Barbara Fischer

Confluence

Links

GND property multiplicity

Property multiplicity
property gnd total unique min max avg
eventId gnd:gndIdentifier 731651 731651 1 1 1
title gnd:preferredNameForTheConferenceOrEvent 731645 731645 0 1 0.999991799
acronym gnd:abbreviatedNameForTheConferenceOrEvent 3537 3206 0 4 0.00483
sameAs owl:sameAs 769120 693077 0 20 1.05
variant gnd:variantNameForTheConferenceOrEvent 632368 229268 0 41 0.86
date gnd:dateOfConferenceOrEvent 710819 704949 0 9 0.971
areaCode gnd:geographicAreaCode 797037 612631 0 11 1.089
place gnd:placeOfConferenceOrEvent 659305 624667 0 18 0.901
topic gnd:topic 5061 3520 0 6 0.00691
homepage gnd:homepage 19011 18702 0 3 0.026
prec gnd:precedingConferenceOrEvent 12182 12106 0 3 0.0166
succ gnd:succeedingConferenceOrEvent 11974 11929 0 3 0.0163

performance optimized query of GND event details

# performance optimized query of GND event details
# with aggregated properties as single, count and | separated list column
# WF 2021-12-05
PREFIX gndi:  <https://d-nb.info/gnd>
PREFIX gnd:  <https://d-nb.info/standards/elementset/gnd#>
PREFIX gndo: <https://d-nb.info/standards/vocab/gnd/>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX dc: <http://purl.org/dc/terms/>
PREFIX wdrs: <http://www.w3.org/2007/05/powder-s#>

SELECT  
   ?event 
   ?eventId  
   (MIN(?eventTitle) as ?title)

   (COUNT (DISTINCT ?eventDate) as ?dateCount)
   (MIN(?eventDate) as ?date)

   (MIN(?eventAcronym) as ?acronym)
   (COUNT (DISTINCT ?eventAcronym) as ?acronymCount)
   (GROUP_CONCAT(DISTINCT ?eventAcronym; SEPARATOR="| ") AS ?acronyms)

   (MIN(?eventVariant) as ?variant)
   (COUNT (DISTINCT ?eventVariant) as ?variantCount)
   (GROUP_CONCAT(DISTINCT ?eventVariant; SEPARATOR="| ") AS ?variants) 

   (MIN(?eventPlace) as ?place)
   (COUNT (DISTINCT ?eventPlace) as ?placeCount)
   (GROUP_CONCAT(DISTINCT ?eventPlace; SEPARATOR="| ") AS ?places) 

   (MIN(?eventHomepage) as ?homepage)
WHERE {
  ?event a gnd:ConferenceOrEvent.
  ?event gnd:gndIdentifier ?eventId.
  ?event gnd:preferredNameForTheConferenceOrEvent ?eventTitle.
  OPTIONAL { ?event gnd:abbreviatedNameForTheConferenceOrEvent ?eventAcronym. }
  OPTIONAL { ?event gnd:homepage ?eventHomepage. }
  OPTIONAL { ?event gnd:variantNameForTheConferenceOrEvent ?eventVariant. }
  OPTIONAL { ?event gnd:dateOfConferenceOrEvent ?eventDate. }
  OPTIONAL { ?event gnd:placeOfConferenceOrEvent ?eventPlace }
  # only available 3520 times 2021-12
  # ?event gnd:topic ?topic.
  # only available 12106 times 2021-12
  # ?event gnd:precedingConferenceOrEvent ?prec
  # only available 11929 times 2021-12
  #?event gnd:succeedingConferenceOrEvent ?succ
}
GROUP BY ?event ?eventId

query to analyze multiplicity

# get aggregate counts of property usage 
# this query needs to be modified property by property and run twice
# once without the having clause and once with the having clause 
# to create the table further down in this wikipage
# WF 2021-12-05
PREFIX gndi:  <https://d-nb.info/gnd>
PREFIX gnd:  <https://d-nb.info/standards/elementset/gnd#>
PREFIX gndo: <https://d-nb.info/standards/vocab/gnd/>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX dc: <http://purl.org/dc/terms/>
PREFIX wdrs: <http://www.w3.org/2007/05/powder-s#>

SELECT (sum (?itemCount) as ?sum) (min (?itemCount) as ?min) (max (?itemCount) as ?max) (avg (?itemCount) as ?avg)  {
  SELECT  ?event ?eventId (count(?title) as ?itemCount)
  WHERE {
    ?event a gnd:ConferenceOrEvent.
    ?event gnd:gndIdentifier ?eventId.
    OPTIONAL {
      # ?event gnd:abbreviatedNameForTheConferenceOrEvent ?acronym. 
      # ?event owl:sameAs ?sameAs.
      # ?event gnd:variantNameForTheConferenceOrEvent ?variant.
      ?event gnd:preferredNameForTheConferenceOrEvent ?title.
      # ?event gnd:dateOfConferenceOrEvent ?date
      # ?event gnd:geographicAreaCode ?areaCode.
      # ?event gnd:placeOfConferenceOrEvent ?place.
      # ?event gnd:topic ?topic.
      # ?event gnd:homepage ?homepage. 
      # ?event gnd:precedingConferenceOrEvent ?prec
      # ?event gnd:succeedingConferenceOrEvent ?succ
    }
  }
  GROUP BY ?event ?eventId
  #HAVING(COUNT(?title) = 1)
}

SPARQL Queries

entities and usage frequency

# get histogram data of entities by
# usage frequency
# WF 2020-06-27
PREFIX gnd: <https://d-nb.info/standards/elementset/gnd#>

SELECT ?c  (COUNT(?c) AS ?count)
WHERE {
  ?subject a  ?c
}
GROUP BY ?c
HAVING (?count >100)
ORDER BY DESC(?count)
c count
gnd#ConferenceOrEvent 713310
gnd#TerritorialCorporateBodyOrAdministrativeUnit 188246
gnd#SeriesOfConferenceOrEvent 122970
gnd#BuildingOrMemorial 67149
http://www.opengis.net/ont/sf#Point 57987
gnd#PlaceOrGeographicName 27771
gnd#NaturalGeographicUnit 20269
gnd#AdministrativeUnit 12846
gnd#WayBorderOrLine 4971
gnd#ReligiousTerritory 2646
gnd#NameOfSmallGeographicUnitLyingWithinAnotherGeographicUnit 2113
gnd#CorporateBody 559
gnd#MemberState 543
gnd#Country 307
gnd#ExtraterrestrialTerritory 282
gnd#Language 193
gnd#ReligiousCorporateBody 155
gnd#ReligiousAdministrativeUnit 134
gnd#HistoricSingleEventOrEra 128

relevance of fields

# get histogram data of properties by
# usage frequency
# WF 2020-07-12
PREFIX gnd: <https://d-nb.info/standards/elementset/gnd#>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX dc: <http://purl.org/dc/terms/>
PREFIX wdrs: <http://www.w3.org/2007/05/powder-s#>

SELECT ?property (COUNT(?property) AS ?propTotal)
WHERE { ?s ?property ?o . }
GROUP BY ?property
HAVING (?propTotal >1000)
ORDER BY DESC(?propTotal)
property propTotal
http://www.w3.org/2002/07/owl#sameAs 1359115
http://www.w3.org/1999/02/22-rdf-syntax-ns#type 1222895
gnd#geographicAreaCode 1195102
http://purl.org/dc/terms/license 1149468
http://purl.org/dc/terms/modified 1149468
http://www.w3.org/2007/05/powder-s#describedby 1149468
gnd#gndIdentifier 1149468
gnd#oldAuthorityNumber 940550
gnd#preferredNameForTheConferenceOrEvent 836397
gnd#variantNameForTheConferenceOrEvent 720355
gnd#dateOfConferenceOrEvent 693196
gnd#placeOfConferenceOrEvent 650165
gnd#preferredNameForThePlaceOrGeographicName 313058
gnd#variantNameForThePlaceOrGeographicName 298383
gnd#gndSubjectCategory 176395
gnd#definition 157695
gnd#broaderTermInstantial 117334
gnd#place 82345
http://xmlns.com/foaf/0.1/page 74542
http://www.opengis.net/ont/geosparql#asWKT 58083
http://www.opengis.net/ont/geosparql#hasGeometry 58083
https://d-nb.info/standards/elementset/dnb#deprecatedUri 51576
gnd#biographicalOrHistoricalInformation 34190
gnd#organizerOrHost 34167
gnd#relatedDdcWithDegreeOfDeterminacy2 25452
gnd#succeedingPlaceOrGeographicName 25208
gnd#broaderTermPartitive 25111
gnd#dateOfEstablishment 24772
gnd#homepage 24439
gnd#precedingPlaceOrGeographicName 22490
gnd#dateOfTermination 21582
gnd#hierarchicalSuperiorOfPlaceOrGeographicName 20522
gnd#precedingConferenceOrEvent 17164
gnd#succeedingConferenceOrEvent 16963
gnd#dateOfProduction 15106
gnd#spatialAreaOfActivity 13593
gnd#hierarchicalSuperiorOfTheConferenceOrEvent 10126
gnd#architect 8206
gnd#topic 6088
gnd#relatedPlaceOrGeographicName 5812
gnd#abbreviatedNameForTheConferenceOrEvent 5312
gnd#complexSeeReferenceSubject 3927
gnd#startingOrFinalPointOfADistance 3531
gnd#relatedConferenceOrEvent 3124
gnd#relatedDdcWithDegreeOfDeterminacy4 2709
gnd#relatedCorporateBody 2354
gnd#relatedTerm 1728
gnd#sponsorOrPatron 1281
gnd#relatedDdcWithDegreeOfDeterminacy3 1200
gnd#exhibitor 1044
done 2020-06-25"2020-06-25" is not recognized as a Boolean (true/false) value.
todo Unfortunately the headlines and the forms seem to be mixed up.


events with most often used fields and seldom but useful fields

# get events with most often used columns from GND
# plus acronym, topic, homepage (seldom but useful)
# WF 2020-07-12
PREFIX gndi:  <https://d-nb.info/gnd>
PREFIX gnd:  <https://d-nb.info/standards/elementset/gnd#>
PREFIX gndo: <https://d-nb.info/standards/vocab/gnd/>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX dc: <http://purl.org/dc/terms/>
PREFIX wdrs: <http://www.w3.org/2007/05/powder-s#>

SELECT  ?event ?eventId ?acronym  ?variant ?name ?date ?areaCode ?place ?topic ?homepage 
WHERE {
  ?event gnd:gndIdentifier ?eventId.
  OPTIONAL { ?event gnd:abbreviatedNameForTheConferenceOrEvent ?acronym. }
  OPTIONAL { ?event gnd:variantNameForTheConferenceOrEvent ?variant.}
  OPTIONAL { ?event gnd:preferredNameForTheConferenceOrEvent ?name.}
  OPTIONAL { ?event gnd:dateOfConferenceOrEvent ?date. }
  OPTIONAL { ?event gnd:geographicAreaCode ?areaCode. }
  OPTIONAL { ?event gnd:placeOfConferenceOrEvent ?place. }
  OPTIONAL { ?event gnd:topic ?topic. }
  OPTIONAL { ?event gnd:homepage ?homepage. }
}
#LIMIT 10000

Script to setup a Jena instance with GND data extract

gnd2jena

#!/bin/bash
# WF 2020-05-10

# global settings
jena=apache-jena-4.4.0
tgz=$jena.tar.gz
#mirror=https://downloads.apache.org/jena/binaries
mirror=https://archive.apache.org/dist/jena/binaries
jenaurl=$mirror/$tgz
base=/hd/seel/gnd
#base=/hd/torterra/gnd
#base=/hd/luxio/gnd
data=$base/data
tdbloader=$jena/bin/tdb2.tdbloader

#
# download and unpack Apache Jena and create the data directory
# uses the global settings tgz, jenaurl, jena and data
# returns 1 if the download or the unpacking fails
#
getjena() {
  # download
  if [ ! -f "$tgz" ]
  then
    echo "downloading $tgz from $jenaurl"
    if ! wget "$jenaurl"
    then
      echo "download of $jenaurl failed" 1>&2
      # do not keep a partial archive - the next run would
      # report it as already downloaded
      rm -f "$tgz"
      return 1
    fi
  else
    echo "$tgz already downloaded"
  fi
  # unpack
  if [ ! -d "$jena" ]
  then
    echo "unpacking $jena from $tgz"
    if ! tar xvzf "$tgz"
    then
      echo "unpacking $tgz failed" 1>&2
      return 1
    fi
  else
    echo "$jena already unpacked"
  fi
  # create data directory
  if [ ! -d "$data" ]
  then
    echo "creating $data directory"
    mkdir -p "$data"
  else
    echo "$data directory already created"
  fi
}

#
# show the given timestamp
#
timestamp() {
  # $1: message to print in front of the current UTC time
  local now
  now=$(date -u +"%Y-%m-%dT%H:%M:%SZ")
  echo "$1 at $now"
}

#
# load data for the given data dir and input
#
loaddata() {
  # $1: TDB2 location to load into
  # $2: input file to load
  local data="$1"
  local input="$2"
  # the log files are named after the load phase; nothing in this script
  # sets $phase, so fall back to the input file name without its .ttl
  # suffix - otherwise every load overwrites tdb2--out.log/tdb2--err.log
  local logtag="${phase:-$(basename "$input" .ttl)}"
  timestamp "start loading $input to $data"
  "$tdbloader" --loc "$data" "$input" > "tdb2-$logtag-out.log" 2> "tdb2-$logtag-err.log"
  timestamp "finished loading $input to $data"
}

# install Jena first - stop if that did not work
getjena || exit 1
# exported so that child processes see the temporary directory setting
export TMPDIR="$base/tmp"
if [ ! -d "$TMPDIR" ]
then
  echo "creating temporary directory $TMPDIR"
  mkdir -p "$TMPDIR"
else
  echo "using temporary directory $TMPDIR"
fi
# download (if necessary) and load the GND extracts
for d in kongress geografikum
do
  file=authorities-${d}_lds.ttl
  if [ ! -f "$file" ]
  then
    # only unpack after a successful download and never try to load
    # a file that could not be fetched
    if ! { wget "https://data.dnb.de/opendata/$file.gz" && gunzip "$file.gz"; }
    then
      echo "could not download and unpack $file.gz" 1>&2
      exit 1
    fi
  else
    echo "$file already downloaded"
  fi
  loaddata "$data" "$file"
done

try on confident23 server

2020-07-19

wf@confident23:/usr/local/src$ sudo ./getjena 
apache-jena-3.16.0.tar.gz already downloaded
apache-jena-3.16.0 already unpacked
creating /var/data/gnd/data directory
creating temporary directory /var/data/gnd/tmp
start loading authorities-kongress_lds.ttl to /var/data/gnd/data at 2020-07-19T07:08:15Z
finished loading authorities-kongress_lds.ttl to /var/data/gnd/data at 2020-07-19T07:11:16Z

2020-09-15

./getjena 
apache-jena-3.16.0.tar.gz already downloaded
apache-jena-3.16.0 already unpacked
creating /var/data/gnd/data directory
creating temporary directory /var/data/gnd/tmp
--2020-09-15 11:46:37--  https://data.dnb.de/opendata/authorities-kongress_lds.ttl.gz
Resolving data.dnb.de (data.dnb.de)... 193.175.100.140
Connecting to data.dnb.de (data.dnb.de)|193.175.100.140|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 80381713 (77M) [application/x-gzip]
Saving to: ‘authorities-kongress_lds.ttl.gz’

authorities-kongres 100%[===================>]  76.66M  66.3MB/s    in 1.2s    

2020-09-15 11:46:38 (66.3 MB/s) - ‘authorities-kongress_lds.ttl.gz’ saved [80381713/80381713]

start loading authorities-kongress_lds.ttl to /var/data/gnd/data at 2020-09-15T09:46:42Z
finished loading authorities-kongress_lds.ttl to /var/data/gnd/data at 2020-09-15T09:50:25Z
--2020-09-15 11:50:25--  https://data.dnb.de/opendata/authorities-geografikum_lds.ttl.gz
Resolving data.dnb.de (data.dnb.de)... 193.175.100.140
Connecting to data.dnb.de (data.dnb.de)|193.175.100.140|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 33842092 (32M) [application/x-gzip]
Saving to: ‘authorities-geografikum_lds.ttl.gz’

authorities-geograf 100%[===================>]  32.27M  53.4MB/s    in 0.6s    

2020-09-15 11:50:26 (53.4 MB/s) - ‘authorities-geografikum_lds.ttl.gz’ saved [33842092/33842092]

start loading authorities-geografikum_lds.ttl to /var/data/gnd/data at 2020-09-15T09:50:27Z
finished loading authorities-geografikum_lds.ttl to /var/data/gnd/data at 2020-09-15T09:52:40Z

start fuseki

#!/bin/bash
# WF 2020-06-25
# Jena Fuseki server installation
# see https://jena.apache.org/documentation/fuseki2/fuseki-run.html
# NOTE(review): the gnd2jena script on this page installs apache-jena-4.4.0 -
# confirm that this Fuseki version can read the TDB2 store it creates
version=3.16.0
fuseki=apache-jena-fuseki-$version
if [ ! -d "$fuseki" ]
then
  if [ ! -f "$fuseki.tar.gz" ]
  then
    # abort on a failed download instead of trying to unpack a missing archive
    wget "https://archive.apache.org/dist/jena/binaries/$fuseki.tar.gz" || exit 1
  else
    echo "$fuseki.tar.gz already downloaded"
  fi
  echo "unpacking $fuseki.tar.gz"
  tar xvfz "$fuseki.tar.gz" || exit 1
else
  echo "$fuseki already downloaded and unpacked"
fi
# do not try to start the server from the wrong directory
cd "$fuseki" || exit 1
gnddata=/var/data/gnd/data
# start the server with the TDB2 location $gnddata under the name /gnd
java -jar fuseki-server.jar --tdb2 --loc="$gnddata" /gnd