Truly Tabular RDF/GND

From BITPlan Wiki
Revision as of 09:24, 5 August 2022 by Wf (talk | contribs) (→‎GND property multiplicity)
(diff) ← Older revision | Latest revision (diff) | Newer revision → (diff)
Jump to navigation Jump to search

Truly Tabular RDF

GND de

GlossaryEntry
edit
responsible  https://www.dnb.de/DE/Professionell/Standardisierung/GND/gnd.html
state  
since  2012
description  Gemeinsame Normdatei
references  
lang  de
master  GND

Wikibase

Report "GND meets Wikibase" 2 Barbara Fischer

Confluence

Links

GND property multiplicity

Based on analysis of https://data.dnb.de/opendata/authorities-kongress_lds.ttl.gz as of 2021-12

Property multiplicity
property gnd total unique min max avg
eventId gnd:gndIdentifier 731651 731651 1 1 1
title gnd:preferredNameForTheConferenceOrEvent 731645 731645 0 1 0.999991799
acronym gnd:abbreviatedNameForTheConferenceOrEvent 3537 3206 0 4 0.00483
sameAs owl:sameAs 769120 693077 0 20 1.05
variant gnd:variantNameForTheConferenceOrEvent 632368 229268 0 41 0.86
date gnd:dateOfConferenceOrEvent 710819 704949 0 9 0.971
areaCode gnd:geographicAreaCode 797037 612631 0 11 1.089
place gnd:placeOfConferenceOrEvent 659305 624667 0 18 0.901
topic gnd:topic 5061 3520 0 6 0.00691
homepage gnd:homepage 19011 18702 0 3 0.026
prec gnd:precedingConferenceOrEvent 12182 12106 0 3 0.0166
succ gnd:succeedingConferenceOrEvent 11974 11929 0 3 0.0163

performance optimized query of GND event details

# performance optimized query of GND event details
# result: one row per event (?event, ?eventId)
#   title            - MIN of the preferred names
#   date             - distinct count and MIN sample
#   acronym, variant,
#   place            - MIN sample, distinct count and "| " separated list
#   homepage         - MIN sample only
# WF 2021-12-05
# NOTE(review): only the gnd: prefix is referenced by this query; the gndi:
# namespace has no trailing "/" - confirm before using gndi: in a pattern
PREFIX gndi:  <https://d-nb.info/gnd>
PREFIX gnd:   <https://d-nb.info/standards/elementset/gnd#>
PREFIX gndo:  <https://d-nb.info/standards/vocab/gnd/>
PREFIX rdf:   <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX owl:   <http://www.w3.org/2002/07/owl#>
PREFIX dc:    <http://purl.org/dc/terms/>
PREFIX wdrs:  <http://www.w3.org/2007/05/powder-s#>

SELECT
  ?event
  ?eventId
  (MIN(?eventTitle) AS ?title)

  (COUNT(DISTINCT ?eventDate) AS ?dateCount)
  (MIN(?eventDate) AS ?date)

  (MIN(?eventAcronym) AS ?acronym)
  (COUNT(DISTINCT ?eventAcronym) AS ?acronymCount)
  (GROUP_CONCAT(DISTINCT ?eventAcronym; SEPARATOR="| ") AS ?acronyms)

  (MIN(?eventVariant) AS ?variant)
  (COUNT(DISTINCT ?eventVariant) AS ?variantCount)
  (GROUP_CONCAT(DISTINCT ?eventVariant; SEPARATOR="| ") AS ?variants)

  (MIN(?eventPlace) AS ?place)
  (COUNT(DISTINCT ?eventPlace) AS ?placeCount)
  (GROUP_CONCAT(DISTINCT ?eventPlace; SEPARATOR="| ") AS ?places)

  (MIN(?eventHomepage) AS ?homepage)
WHERE {
  # required: type, identifier and preferred name
  ?event a gnd:ConferenceOrEvent ;
         gnd:gndIdentifier ?eventId ;
         gnd:preferredNameForTheConferenceOrEvent ?eventTitle .

  # optional properties - an event without them still yields a row
  OPTIONAL { ?event gnd:abbreviatedNameForTheConferenceOrEvent ?eventAcronym }
  OPTIONAL { ?event gnd:homepage ?eventHomepage }
  OPTIONAL { ?event gnd:variantNameForTheConferenceOrEvent ?eventVariant }
  OPTIONAL { ?event gnd:dateOfConferenceOrEvent ?eventDate }
  OPTIONAL { ?event gnd:placeOfConferenceOrEvent ?eventPlace }

  # not joined - availability as of 2021-12:
  #   ?event gnd:topic ?topic                       only available 3520 times
  #   ?event gnd:precedingConferenceOrEvent ?prec   only available 12106 times
  #   ?event gnd:succeedingConferenceOrEvent ?succ  only available 11929 times
}
GROUP BY ?event ?eventId

query to analyze multiplicity

# aggregate counts of property usage per event:
# the inner query counts the values of ONE property for every event,
# the outer query reduces these per-event counts to sum, min, max and avg
#
# to analyze another property enable its triple pattern in the OPTIONAL
# block (and disable the active one) and use its variable in COUNT() and
# in the HAVING clause
# the query is run twice per property - once without and once with the
# HAVING clause - to fill the "Property multiplicity" table on this wikipage
# WF 2021-12-05
PREFIX gndi:  <https://d-nb.info/gnd>
PREFIX gnd:   <https://d-nb.info/standards/elementset/gnd#>
PREFIX gndo:  <https://d-nb.info/standards/vocab/gnd/>
PREFIX rdf:   <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX owl:   <http://www.w3.org/2002/07/owl#>
PREFIX dc:    <http://purl.org/dc/terms/>
PREFIX wdrs:  <http://www.w3.org/2007/05/powder-s#>

SELECT
  (SUM(?itemCount) AS ?sum)
  (MIN(?itemCount) AS ?min)
  (MAX(?itemCount) AS ?max)
  (AVG(?itemCount) AS ?avg)
WHERE {
  SELECT ?event ?eventId (COUNT(?title) AS ?itemCount)
  WHERE {
    ?event a gnd:ConferenceOrEvent ;
           gnd:gndIdentifier ?eventId .
    # OPTIONAL keeps events without the property (they count as 0)
    OPTIONAL {
      # ?event gnd:abbreviatedNameForTheConferenceOrEvent ?acronym.
      # ?event owl:sameAs ?sameAs.
      # ?event gnd:variantNameForTheConferenceOrEvent ?variant.
      ?event gnd:preferredNameForTheConferenceOrEvent ?title .
      # ?event gnd:dateOfConferenceOrEvent ?date
      # ?event gnd:geographicAreaCode ?areaCode.
      # ?event gnd:placeOfConferenceOrEvent ?place.
      # ?event gnd:topic ?topic.
      # ?event gnd:homepage ?homepage.
      # ?event gnd:precedingConferenceOrEvent ?prec
      # ?event gnd:succeedingConferenceOrEvent ?succ
    }
  }
  GROUP BY ?event ?eventId
  # second run: keep only events with exactly one value
  #HAVING(COUNT(?title) = 1)
}

SPARQL Queries

entities and usage frequency

# get histogram data of entities by
# usage frequency: number of subjects per rdf:type,
# limited to types used more than 100 times, most frequent first
# WF 2020-06-27
PREFIX gnd: <https://d-nb.info/standards/elementset/gnd#>

SELECT ?c (COUNT(?c) AS ?count)
WHERE {
  ?subject a ?c
}
GROUP BY ?c
# the aggregate is repeated here: in the SPARQL 1.1 algebra HAVING is
# evaluated before the SELECT expression binds ?count, so filtering on the
# alias is not portable across engines
HAVING (COUNT(?c) > 100)
# ORDER BY runs after the SELECT expressions, so the alias is available
ORDER BY DESC(?count)
c count
gnd#ConferenceOrEvent 713310
gnd#TerritorialCorporateBodyOrAdministrativeUnit 188246
gnd#SeriesOfConferenceOrEvent 122970
gnd#BuildingOrMemorial 67149
http://www.opengis.net/ont/sf#Point 57987
gnd#PlaceOrGeographicName 27771
gnd#NaturalGeographicUnit 20269
gnd#AdministrativeUnit 12846
gnd#WayBorderOrLine 4971
gnd#ReligiousTerritory 2646
gnd#NameOfSmallGeographicUnitLyingWithinAnotherGeographicUnit 2113
gnd#CorporateBody 559
gnd#MemberState 543
gnd#Country 307
gnd#ExtraterrestrialTerritory 282
gnd#Language 193
gnd#ReligiousCorporateBody 155
gnd#ReligiousAdministrativeUnit 134
gnd#HistoricSingleEventOrEra 128

relevance of fields

# get histogram data of properties by
# usage frequency: number of triples per predicate,
# limited to predicates used more than 1000 times, most frequent first
# WF 2020-07-12
PREFIX gnd: <https://d-nb.info/standards/elementset/gnd#>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX dc: <http://purl.org/dc/terms/>
PREFIX wdrs: <http://www.w3.org/2007/05/powder-s#>

SELECT ?property (COUNT(?property) AS ?propTotal)
WHERE { ?s ?property ?o . }
GROUP BY ?property
# the aggregate is repeated here: in the SPARQL 1.1 algebra HAVING is
# evaluated before the SELECT expression binds ?propTotal, so filtering on
# the alias is not portable across engines
HAVING (COUNT(?property) > 1000)
# ORDER BY runs after the SELECT expressions, so the alias is available
ORDER BY DESC(?propTotal)
property propTotal
http://www.w3.org/2002/07/owl#sameAs 1359115
http://www.w3.org/1999/02/22-rdf-syntax-ns#type 1222895
gnd#geographicAreaCode 1195102
http://purl.org/dc/terms/license 1149468
http://purl.org/dc/terms/modified 1149468
http://www.w3.org/2007/05/powder-s#describedby 1149468
gnd#gndIdentifier 1149468
gnd#oldAuthorityNumber 940550
gnd#preferredNameForTheConferenceOrEvent 836397
gnd#variantNameForTheConferenceOrEvent 720355
gnd#dateOfConferenceOrEvent 693196
gnd#placeOfConferenceOrEvent 650165
gnd#preferredNameForThePlaceOrGeographicName 313058
gnd#variantNameForThePlaceOrGeographicName 298383
gnd#gndSubjectCategory 176395
gnd#definition 157695
gnd#broaderTermInstantial 117334
gnd#place 82345
http://xmlns.com/foaf/0.1/page 74542
http://www.opengis.net/ont/geosparql#asWKT 58083
http://www.opengis.net/ont/geosparql#hasGeometry 58083
https://d-nb.info/standards/elementset/dnb#deprecatedUri 51576
gnd#biographicalOrHistoricalInformation 34190
gnd#organizerOrHost 34167
gnd#relatedDdcWithDegreeOfDeterminacy2 25452
gnd#succeedingPlaceOrGeographicName 25208
gnd#broaderTermPartitive 25111
gnd#dateOfEstablishment 24772
gnd#homepage 24439
gnd#precedingPlaceOrGeographicName 22490
gnd#dateOfTermination 21582
gnd#hierarchicalSuperiorOfPlaceOrGeographicName 20522
gnd#precedingConferenceOrEvent 17164
gnd#succeedingConferenceOrEvent 16963
gnd#dateOfProduction 15106
gnd#spatialAreaOfActivity 13593
gnd#hierarchicalSuperiorOfTheConferenceOrEvent 10126
gnd#architect 8206
gnd#topic 6088
gnd#relatedPlaceOrGeographicName 5812
gnd#abbreviatedNameForTheConferenceOrEvent 5312
gnd#complexSeeReferenceSubject 3927
gnd#startingOrFinalPointOfADistance 3531
gnd#relatedConferenceOrEvent 3124
gnd#relatedDdcWithDegreeOfDeterminacy4 2709
gnd#relatedCorporateBody 2354
gnd#relatedTerm 1728
gnd#sponsorOrPatron 1281
gnd#relatedDdcWithDegreeOfDeterminacy3 1200
gnd#exhibitor 1044
done 2020-06-25"2020-06-25" is not recognized as a Boolean (true/false) value.
todo Unfortunately the headlines and the forms seem to be mixed up.


events with most often used fields and seldom but useful fields

# get events with most often used columns from GND
# plus acronym, topic, homepage (seldom but useful)
# no aggregation: a subject with several values for a property yields one
# row per combination of values
# WF 2020-07-12
# NOTE(review): the only required pattern is gnd:gndIdentifier - there is no
# rdf:type restriction, so every subject with an identifier matches; confirm
# whether a restriction to gnd:ConferenceOrEvent is intended
PREFIX gndi:  <https://d-nb.info/gnd>
PREFIX gnd:   <https://d-nb.info/standards/elementset/gnd#>
PREFIX gndo:  <https://d-nb.info/standards/vocab/gnd/>
PREFIX rdf:   <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX owl:   <http://www.w3.org/2002/07/owl#>
PREFIX dc:    <http://purl.org/dc/terms/>
PREFIX wdrs:  <http://www.w3.org/2007/05/powder-s#>

SELECT
  ?event
  ?eventId
  ?acronym
  ?variant
  ?name
  ?date
  ?areaCode
  ?place
  ?topic
  ?homepage
WHERE {
  ?event gnd:gndIdentifier ?eventId .

  OPTIONAL { ?event gnd:abbreviatedNameForTheConferenceOrEvent ?acronym }
  OPTIONAL { ?event gnd:variantNameForTheConferenceOrEvent     ?variant }
  OPTIONAL { ?event gnd:preferredNameForTheConferenceOrEvent   ?name }
  OPTIONAL { ?event gnd:dateOfConferenceOrEvent                ?date }
  OPTIONAL { ?event gnd:geographicAreaCode                     ?areaCode }
  OPTIONAL { ?event gnd:placeOfConferenceOrEvent               ?place }
  OPTIONAL { ?event gnd:topic                                  ?topic }
  OPTIONAL { ?event gnd:homepage                               ?homepage }
}
# enable to restrict the result size
#LIMIT 10000

Script to setup a Jena instance with GND data extract

gnd2jena

#!/bin/bash
# WF 2020-05-10

# global settings
# Jena release and the archive file that contains it
jena="apache-jena-4.4.0"
tgz="${jena}.tar.gz"
# download location of the archive (alternative mirror kept commented out)
#mirror=https://downloads.apache.org/jena/binaries
mirror="https://archive.apache.org/dist/jena/binaries"
jenaurl="${mirror}/${tgz}"
# root directory of the GND setup (alternative roots kept commented out)
base="/hd/seel/gnd"
#base=/hd/torterra/gnd
#base=/hd/luxio/gnd
# location handed to the loader via --loc
data="${base}/data"
# loader executable inside the unpacked Jena directory
tdbloader="${jena}/bin/tdb2.tdbloader"

#
# make sure the Jena archive is downloaded and unpacked and that the
# data directory exists - steps that are already done are skipped
#
# uses the global settings tgz, jenaurl, jena and data
# returns non-zero as soon as download, unpacking or mkdir fails
#
getjena() {
  # download
  if [ ! -f "$tgz" ]
  then
    echo "downloading $tgz from $jenaurl"
    # download to a temporary name first: an interrupted transfer must not
    # leave a partial $tgz that the -f check above accepts on the next run
    if ! wget -O "$tgz.part" "$jenaurl"
    then
      rm -f "$tgz.part"
      echo "download of $jenaurl failed" 1>&2
      return 1
    fi
    mv "$tgz.part" "$tgz" || return 1
  else
    echo "$tgz already downloaded"
  fi
  # unpack
  if [ ! -d "$jena" ]
  then
    echo "unpacking $jena from $tgz"
    tar xvzf "$tgz" || return 1
  else
    echo "$jena already unpacked"
  fi
  # create data directory
  if [ ! -d "$data" ]
  then
    echo "creating $data directory"
    mkdir -p "$data" || return 1
  else
    echo "$data directory already created"
  fi
}

#
# print the given message followed by " at " and the current UTC time
# formatted as YYYY-MM-DDTHH:MM:SSZ
#
# params:
#   1: the message to show
#
timestamp() {
  local now
  now=$(date -u +"%Y-%m-%dT%H:%M:%SZ")
  printf '%s at %s\n' "$1" "$now"
}

#
# load data for the given data dir and input
#
# params:
#   1: data dir - handed to the loader as --loc
#   2: input    - the file to load
#
# the loader output is written to tdb2-<tag>-out.log and tdb2-<tag>-err.log
# in the current directory. <tag> is $phase when that variable is set,
# otherwise the input file name without directory and .ttl suffix - so
# that loading several inputs does not overwrite the logs of earlier loads
#
# returns the exit status of the loader
#
loaddata() {
  local data="$1"
  local input="$2"
  local tag
  tag="${phase:-$(basename "$input" .ttl)}"
  timestamp "start loading $input to $data"
  "$tdbloader" --loc "$data" "$input" > "tdb2-$tag-out.log" 2> "tdb2-$tag-err.log"
  # remember the loader status - timestamp below would otherwise hide it
  local rc=$?
  timestamp "finished loading $input to $data"
  return $rc
}

# main: install Jena, prepare the temporary directory and then download,
# unpack and load the GND extracts
# without Jena and the data directory nothing can be loaded
getjena || exit 1
export TMPDIR="$base/tmp"
if [ ! -d "$TMPDIR" ]
then
  echo "creating temporary directory $TMPDIR"
  mkdir -p "$TMPDIR" || exit 1
else
  echo "using temporary directory $TMPDIR"
fi
# becomes 1 as soon as one extract could not be fetched or loaded
status=0
for d in kongress geografikum
do
  file="authorities-${d}_lds.ttl"
  if [ ! -f "$file" ]
  then
    # do not hand a missing file to the loader when download or unpacking
    # fails - skip this extract and carry on with the next one
    if ! wget "https://data.dnb.de/opendata/$file.gz" || ! gunzip "$file.gz"
    then
      echo "could not download and unpack $file.gz - skipping $file" 1>&2
      status=1
      continue
    fi
  else
    echo "$file already downloaded"
  fi
  loaddata "$data" "$file" || status=1
done
exit $status

try on confident23 server

2020-07-19

wf@confident23:/usr/local/src$ sudo ./getjena 
apache-jena-3.16.0.tar.gz already downloaded
apache-jena-3.16.0 already unpacked
creating /var/data/gnd/data directory
creating temporary directory /var/data/gnd/tmp
start loading authorities-kongress_lds.ttl to /var/data/gnd/data at 2020-07-19T07:08:15Z
finished loading authorities-kongress_lds.ttl to /var/data/gnd/data at 2020-07-19T07:11:16Z

2020-09-15

./getjena 
apache-jena-3.16.0.tar.gz already downloaded
apache-jena-3.16.0 already unpacked
creating /var/data/gnd/data directory
creating temporary directory /var/data/gnd/tmp
--2020-09-15 11:46:37--  https://data.dnb.de/opendata/authorities-kongress_lds.ttl.gz
Resolving data.dnb.de (data.dnb.de)... 193.175.100.140
Connecting to data.dnb.de (data.dnb.de)|193.175.100.140|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 80381713 (77M) [application/x-gzip]
Saving to: ‘authorities-kongress_lds.ttl.gz’

authorities-kongres 100%[===================>]  76.66M  66.3MB/s    in 1.2s    

2020-09-15 11:46:38 (66.3 MB/s) - ‘authorities-kongress_lds.ttl.gz’ saved [80381713/80381713]

start loading authorities-kongress_lds.ttl to /var/data/gnd/data at 2020-09-15T09:46:42Z
finished loading authorities-kongress_lds.ttl to /var/data/gnd/data at 2020-09-15T09:50:25Z
--2020-09-15 11:50:25--  https://data.dnb.de/opendata/authorities-geografikum_lds.ttl.gz
Resolving data.dnb.de (data.dnb.de)... 193.175.100.140
Connecting to data.dnb.de (data.dnb.de)|193.175.100.140|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 33842092 (32M) [application/x-gzip]
Saving to: ‘authorities-geografikum_lds.ttl.gz’

authorities-geograf 100%[===================>]  32.27M  53.4MB/s    in 0.6s    

2020-09-15 11:50:26 (53.4 MB/s) - ‘authorities-geografikum_lds.ttl.gz’ saved [33842092/33842092]

start loading authorities-geografikum_lds.ttl to /var/data/gnd/data at 2020-09-15T09:50:27Z
finished loading authorities-geografikum_lds.ttl to /var/data/gnd/data at 2020-09-15T09:52:40Z

start fuseki

#!/bin/bash
# WF 2020-06-25
# Jena Fuseki server installation
# see https://jena.apache.org/documentation/fuseki2/fuseki-run.html
#
# downloads and unpacks the Fuseki release if necessary and then starts
# the server on the TDB2 database at $gnddata, published as dataset /gnd
#
# NOTE(review): version and gnddata are set independently of the jena and
# base settings of the gnd2jena script - confirm that they match the
# installation that loaded the data
version=3.16.0
fuseki=apache-jena-fuseki-$version
if [ ! -d "$fuseki" ]
then
  if [ ! -f "$fuseki.tar.gz" ]
  then
    # https instead of http: the archive is executed later on, so it must
    # not be modifiable in transit
    wget "https://archive.apache.org/dist/jena/binaries/$fuseki.tar.gz" || exit 1
  else
    echo "$fuseki.tar.gz already downloaded"
  fi
  echo "unpacking $fuseki.tar.gz"
  tar xvfz "$fuseki.tar.gz" || exit 1
else
  echo "$fuseki already downloaded and unpacked"
fi
# never start java from the wrong directory
cd "$fuseki" || exit 1
gnddata=/var/data/gnd/data
java -jar fuseki-server.jar --tdb2 --loc="$gnddata" /gnd
wf@confident23:/usr/local/src$