Difference between revisions of "Truly Tabular RDF/GND"
Jump to navigation
Jump to search
Line 21: | Line 21: | ||
* https://blog.lobid.org/2018/08/27/openrefine.html | * https://blog.lobid.org/2018/08/27/openrefine.html | ||
== GND property multiplicity == | == GND property multiplicity == | ||
+ | Based on analysis of https://data.dnb.de/opendata/authorities-kongress_lds.ttl.gz as of 2021-12 | ||
{| class="wikitable" | {| class="wikitable" | ||
|+ Property multiplicity | |+ Property multiplicity |
Latest revision as of 09:24, 5 August 2022
GlossaryEntry | |
---|---|
edit | |
responsible | https://www.dnb.de/DE/Professionell/Standardisierung/GND/gnd.html |
state | |
since | 2012 |
description | Gemeinsame Normdatei |
references | |
lang | de |
master | GND |
Wikibase
Report "GND meets Wikibase" 2 Barbara Fischer
Links
- https://d-nb.info/standards/elementset/gnd
- https://d-nb.info/standards/elementset/gnd#SeriesOfConferenceOrEvent
- https://data.dnb.de/opendata/
- https://blog.lobid.org/2018/08/27/openrefine.html
GND property multiplicity
Based on analysis of https://data.dnb.de/opendata/authorities-kongress_lds.ttl.gz as of 2021-12
property | gnd | total | unique | min | max | avg |
---|---|---|---|---|---|---|
eventId | gnd:gndIdentifier | 731651 | 731651 | 1 | 1 | 1 |
title | gnd:preferredNameForTheConferenceOrEvent | 731645 | 731645 | 0 | 1 | 0.999991799 |
acronym | gnd:abbreviatedNameForTheConferenceOrEvent | 3537 | 3206 | 0 | 4 | 0.00483 |
sameAs | owl:sameAs | 769120 | 693077 | 0 | 20 | 1.05 |
variant | gnd:variantNameForTheConferenceOrEvent | 632368 | 229268 | 0 | 41 | 0.86 |
date | gnd:dateOfConferenceOrEvent | 710819 | 704949 | 0 | 9 | 0.971 |
areaCode | gnd:geographicAreaCode | 797037 | 612631 | 0 | 11 | 1.089 |
place | gnd:placeOfConferenceOrEvent | 659305 | 624667 | 0 | 18 | 0.901 |
topic | gnd:topic | 5061 | 3520 | 0 | 6 | 0.00691 |
homepage | gnd:homepage | 19011 | 18702 | 0 | 3 | 0.026 |
prec | gnd:precedingConferenceOrEvent | 12182 | 12106 | 0 | 3 | 0.0166 |
succ | gnd:succeedingConferenceOrEvent | 11974 | 11929 | 0 | 3 | 0.0163 |
performance optimized query of GND event details
# GND event details with one result row per event
# multi-valued properties are collapsed per event into a single
# value (MIN), a distinct count and a "| " separated list column
# WF 2021-12-05
PREFIX gnd: <https://d-nb.info/standards/elementset/gnd#>

SELECT
  ?event
  ?eventId
  (MIN(?eventTitle) AS ?title)
  (COUNT(DISTINCT ?eventDate) AS ?dateCount)
  (MIN(?eventDate) AS ?date)
  (MIN(?eventAcronym) AS ?acronym)
  (COUNT(DISTINCT ?eventAcronym) AS ?acronymCount)
  (GROUP_CONCAT(DISTINCT ?eventAcronym; SEPARATOR="| ") AS ?acronyms)
  (MIN(?eventVariant) AS ?variant)
  (COUNT(DISTINCT ?eventVariant) AS ?variantCount)
  (GROUP_CONCAT(DISTINCT ?eventVariant; SEPARATOR="| ") AS ?variants)
  (MIN(?eventPlace) AS ?place)
  (COUNT(DISTINCT ?eventPlace) AS ?placeCount)
  (GROUP_CONCAT(DISTINCT ?eventPlace; SEPARATOR="| ") AS ?places)
  (MIN(?eventHomepage) AS ?homepage)
WHERE {
  # mandatory: type, identifier and preferred name
  ?event a gnd:ConferenceOrEvent ;
         gnd:gndIdentifier ?eventId ;
         gnd:preferredNameForTheConferenceOrEvent ?eventTitle .

  # optional properties - an event without them is still returned
  OPTIONAL { ?event gnd:abbreviatedNameForTheConferenceOrEvent ?eventAcronym . }
  OPTIONAL { ?event gnd:homepage ?eventHomepage . }
  OPTIONAL { ?event gnd:variantNameForTheConferenceOrEvent ?eventVariant . }
  OPTIONAL { ?event gnd:dateOfConferenceOrEvent ?eventDate . }
  OPTIONAL { ?event gnd:placeOfConferenceOrEvent ?eventPlace . }

  # deactivated patterns with their availability as of 2021-12:
  #   topic: only available 3520 times
  #     ?event gnd:topic ?topic.
  #   preceding event: only available 12106 times
  #     ?event gnd:precedingConferenceOrEvent ?prec
  #   succeeding event: only available 11929 times
  #     ?event gnd:succeedingConferenceOrEvent ?succ
}
GROUP BY ?event ?eventId
query to analyze multiplicity
# multiplicity analysis: how often is a property set per event?
#   inner query: number of values of the selected property for each event
#   outer query: sum, minimum, maximum and average of these per-event counts
#
# usage: activate exactly one of the triple patterns in the OPTIONAL block,
# use its variable in COUNT(...) and in the HAVING clause and run the query
# twice - once without and once with the HAVING clause.
# The results are the input for the "Property multiplicity" table of this wiki page.
# WF 2021-12-05
PREFIX gnd: <https://d-nb.info/standards/elementset/gnd#>
PREFIX owl: <http://www.w3.org/2002/07/owl#>

SELECT
  (SUM(?itemCount) AS ?sum)
  (MIN(?itemCount) AS ?min)
  (MAX(?itemCount) AS ?max)
  (AVG(?itemCount) AS ?avg)
WHERE {
  SELECT ?event ?eventId (COUNT(?title) AS ?itemCount)
  WHERE {
    ?event a gnd:ConferenceOrEvent ;
           gnd:gndIdentifier ?eventId .
    # OPTIONAL keeps the events without a value so that they count as 0
    OPTIONAL {
      # ?event gnd:abbreviatedNameForTheConferenceOrEvent ?acronym.
      # ?event owl:sameAs ?sameAs.
      # ?event gnd:variantNameForTheConferenceOrEvent ?variant.
      ?event gnd:preferredNameForTheConferenceOrEvent ?title .
      # ?event gnd:dateOfConferenceOrEvent ?date
      # ?event gnd:geographicAreaCode ?areaCode.
      # ?event gnd:placeOfConferenceOrEvent ?place.
      # ?event gnd:topic ?topic.
      # ?event gnd:homepage ?homepage.
      # ?event gnd:precedingConferenceOrEvent ?prec
      # ?event gnd:succeedingConferenceOrEvent ?succ
    }
  }
  GROUP BY ?event ?eventId
  # second run: only events with exactly one value
  #HAVING(COUNT(?title) = 1)
}
SPARQL Queries
entities and usage frequency
# get histogram data of entities (rdf:type classes) by
# usage frequency
# only classes with more than 100 instances are listed,
# the most frequently used class comes first
# WF 2020-06-27
PREFIX gnd: <https://d-nb.info/standards/elementset/gnd#>
SELECT ?c (COUNT(?c) AS ?count)
WHERE {
  ?subject a ?c
}
GROUP BY ?c
# the aggregate is repeated here instead of using the ?count alias:
# in the SPARQL 1.1 algebra HAVING is evaluated before the SELECT
# expressions are bound, so the alias only works on lenient engines
HAVING (COUNT(?c) > 100)
ORDER BY DESC(?count)
c | count |
---|---|
gnd#ConferenceOrEvent | 713310 |
gnd#TerritorialCorporateBodyOrAdministrativeUnit | 188246 |
gnd#SeriesOfConferenceOrEvent | 122970 |
gnd#BuildingOrMemorial | 67149 |
http://www.opengis.net/ont/sf#Point | 57987 |
gnd#PlaceOrGeographicName | 27771 |
gnd#NaturalGeographicUnit | 20269 |
gnd#AdministrativeUnit | 12846 |
gnd#WayBorderOrLine | 4971 |
gnd#ReligiousTerritory | 2646 |
gnd#NameOfSmallGeographicUnitLyingWithinAnotherGeographicUnit | 2113 |
gnd#CorporateBody | 559 |
gnd#MemberState | 543 |
gnd#Country | 307 |
gnd#ExtraterrestrialTerritory | 282 |
gnd#Language | 193 |
gnd#ReligiousCorporateBody | 155 |
gnd#ReligiousAdministrativeUnit | 134 |
gnd#HistoricSingleEventOrEra | 128 |
relevance of fields
# get histogram data of properties by
# usage frequency
# only properties used in more than 1000 triples are listed,
# the most frequently used property comes first
# WF 2020-07-12
PREFIX gnd: <https://d-nb.info/standards/elementset/gnd#>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX dc: <http://purl.org/dc/terms/>
PREFIX wdrs: <http://www.w3.org/2007/05/powder-s#>
SELECT ?property (COUNT(?property) AS ?propTotal)
WHERE { ?s ?property ?o . }
GROUP BY ?property
# the aggregate is repeated here instead of using the ?propTotal alias:
# in the SPARQL 1.1 algebra HAVING is evaluated before the SELECT
# expressions are bound, so the alias only works on lenient engines
HAVING (COUNT(?property) > 1000)
ORDER BY DESC(?propTotal)
property | propTotal |
---|---|
http://www.w3.org/2002/07/owl#sameAs | 1359115 |
http://www.w3.org/1999/02/22-rdf-syntax-ns#type | 1222895 |
gnd#geographicAreaCode | 1195102 |
http://purl.org/dc/terms/license | 1149468 |
http://purl.org/dc/terms/modified | 1149468 |
http://www.w3.org/2007/05/powder-s#describedby | 1149468 |
gnd#gndIdentifier | 1149468 |
gnd#oldAuthorityNumber | 940550 |
gnd#preferredNameForTheConferenceOrEvent | 836397 |
gnd#variantNameForTheConferenceOrEvent | 720355 |
gnd#dateOfConferenceOrEvent | 693196 |
gnd#placeOfConferenceOrEvent | 650165 |
gnd#preferredNameForThePlaceOrGeographicName | 313058 |
gnd#variantNameForThePlaceOrGeographicName | 298383 |
gnd#gndSubjectCategory | 176395 |
gnd#definition | 157695 |
gnd#broaderTermInstantial | 117334 |
gnd#place | 82345 |
http://xmlns.com/foaf/0.1/page | 74542 |
http://www.opengis.net/ont/geosparql#asWKT | 58083 |
http://www.opengis.net/ont/geosparql#hasGeometry | 58083 |
https://d-nb.info/standards/elementset/dnb#deprecatedUri | 51576 |
gnd#biographicalOrHistoricalInformation | 34190 |
gnd#organizerOrHost | 34167 |
gnd#relatedDdcWithDegreeOfDeterminacy2 | 25452 |
gnd#succeedingPlaceOrGeographicName | 25208 |
gnd#broaderTermPartitive | 25111 |
gnd#dateOfEstablishment | 24772 |
gnd#homepage | 24439 |
gnd#precedingPlaceOrGeographicName | 22490 |
gnd#dateOfTermination | 21582 |
gnd#hierarchicalSuperiorOfPlaceOrGeographicName | 20522 |
gnd#precedingConferenceOrEvent | 17164 |
gnd#succeedingConferenceOrEvent | 16963 |
gnd#dateOfProduction | 15106 |
gnd#spatialAreaOfActivity | 13593 |
gnd#hierarchicalSuperiorOfTheConferenceOrEvent | 10126 |
gnd#architect | 8206 |
gnd#topic | 6088 |
gnd#relatedPlaceOrGeographicName | 5812 |
gnd#abbreviatedNameForTheConferenceOrEvent | 5312 |
gnd#complexSeeReferenceSubject | 3927 |
gnd#startingOrFinalPointOfADistance | 3531 |
gnd#relatedConferenceOrEvent | 3124 |
gnd#relatedDdcWithDegreeOfDeterminacy4 | 2709 |
gnd#relatedCorporateBody | 2354 |
gnd#relatedTerm | 1728 |
gnd#sponsorOrPatron | 1281 |
gnd#relatedDdcWithDegreeOfDeterminacy3 | 1200 |
gnd#exhibitor | 1044 |
done | 2020-06-25 |
---|---|
todo | Unfortunately the headlines and the forms seem to be mixed up. |
events with most often used fields and seldom but useful fields
# events with the most often used GND columns
# plus acronym, topic and homepage (seldom but useful)
# all properties except the identifier are optional; there is no
# rdf:type restriction, so every subject with a gnd:gndIdentifier
# is returned, and a subject with several values for a property
# yields several result rows
# WF 2020-07-12
PREFIX gnd: <https://d-nb.info/standards/elementset/gnd#>

SELECT
  ?event
  ?eventId
  ?acronym
  ?variant
  ?name
  ?date
  ?areaCode
  ?place
  ?topic
  ?homepage
WHERE {
  ?event gnd:gndIdentifier ?eventId .
  OPTIONAL { ?event gnd:abbreviatedNameForTheConferenceOrEvent ?acronym . }
  OPTIONAL { ?event gnd:variantNameForTheConferenceOrEvent ?variant . }
  OPTIONAL { ?event gnd:preferredNameForTheConferenceOrEvent ?name . }
  OPTIONAL { ?event gnd:dateOfConferenceOrEvent ?date . }
  OPTIONAL { ?event gnd:geographicAreaCode ?areaCode . }
  OPTIONAL { ?event gnd:placeOfConferenceOrEvent ?place . }
  OPTIONAL { ?event gnd:topic ?topic . }
  OPTIONAL { ?event gnd:homepage ?homepage . }
}
# activate to restrict the number of result rows
#LIMIT 10000
Script to setup a Jena instance with GND data extract
gnd2jena
#!/bin/bash
# WF 2020-05-10
# download Apache Jena and load the GND data extracts with the tdb2 loader

#
# global settings
#
# Jena release and the name of its binary archive
jena="apache-jena-4.4.0"
tgz="${jena}.tar.gz"
# download location of the archive (alternative mirror kept for reference)
#mirror=https://downloads.apache.org/jena/binaries
mirror="https://archive.apache.org/dist/jena/binaries"
jenaurl="${mirror}/${tgz}"
# base directory for the GND data (alternative locations kept for reference)
base="/hd/seel/gnd"
#base=/hd/torterra/gnd
#base=/hd/luxio/gnd
# directory passed to the loader as store location
data="${base}/data"
# loader command, relative to the directory Jena is unpacked in
tdbloader="${jena}/bin/tdb2.tdbloader"
#
# make sure Jena is available and the data directory exists
#
# uses the global settings tgz, jenaurl, jena and data
# each step is skipped if its result is already there;
# the script is terminated with exit code 1 if a step fails
#
getjena() {
  # download
  if [ ! -f "$tgz" ]
  then
    echo "downloading $tgz from $jenaurl"
    if ! wget "$jenaurl"
    then
      echo "download of $jenaurl failed" 1>&2
      # remove a partial download - otherwise the next run
      # would consider the archive "already downloaded"
      rm -f "$tgz"
      exit 1
    fi
  else
    echo "$tgz already downloaded"
  fi
  # unpack
  if [ ! -d "$jena" ]
  then
    echo "unpacking $jena from $tgz"
    if ! tar xvzf "$tgz"
    then
      echo "unpacking $tgz failed" 1>&2
      exit 1
    fi
  else
    echo "$jena already unpacked"
  fi
  # create data directory
  if [ ! -d "$data" ]
  then
    echo "creating $data directory"
    if ! mkdir -p "$data"
    then
      echo "could not create $data directory" 1>&2
      exit 1
    fi
  else
    echo "$data directory already created"
  fi
}
#
# print the given message followed by the current UTC time
# in the format <msg> at YYYY-MM-DDThh:mm:ssZ
#
# params
#   1: msg - the message to show
#
timestamp() {
  local msg="$1"
  local now
  now=$(date -u +"%Y-%m-%dT%H:%M:%SZ")
  printf '%s at %s\n' "$msg" "$now"
}
#
# load data for the given data dir and input
#
# params
#   1: data  - store location passed to the loader via --loc
#   2: input - the file to load
#
# the loader is taken from the global $tdbloader; its output goes to
# tdb2-<phase>-out.log and tdb2-<phase>-err.log in the current directory.
# <phase> is the global $phase if it is set, otherwise it is derived
# from the input file name (without directory and .ttl extension) so
# that each input gets its own pair of log files
#
loaddata() {
  local data="$1"
  local input="$2"
  local logphase="${phase:-$(basename "$input" .ttl)}"
  timestamp "start loading $input to $data"
  # $tdbloader is deliberately unquoted, as in the original call
  $tdbloader --loc "$data" "$input" > "tdb2-$logphase-out.log" 2> "tdb2-$logphase-err.log"
  timestamp "finished loading $input to $data"
}
# main part: make Jena available, prepare the temporary directory,
# then download (if necessary) and load each GND extract
getjena
# TMPDIR is exported for the commands started below
# NOTE(review): presumably the loader puts its temporary files here - confirm
export TMPDIR=$base/tmp
if [ ! -d "$TMPDIR" ]
then
  echo "creating temporary directory $TMPDIR"
  mkdir -p "$TMPDIR"
else
  echo "using temporary directory $TMPDIR"
fi
for d in kongress geografikum
do
  file=authorities-${d}_lds.ttl
  if [ ! -f "$file" ]
  then
    # only load what has been downloaded and unpacked successfully
    if ! { wget "https://data.dnb.de/opendata/$file.gz" && gunzip "$file.gz"; }
    then
      echo "download of $file.gz failed - $file will not be loaded" 1>&2
      continue
    fi
  else
    echo "$file already downloaded"
  fi
  loaddata "$data" "$file"
done
try on confident23 server
2020-07-19
wf@confident23:/usr/local/src$ sudo ./getjena
apache-jena-3.16.0.tar.gz already downloaded
apache-jena-3.16.0 already unpacked
creating /var/data/gnd/data directory
creating temporary directory /var/data/gnd/tmp
start loading authorities-kongress_lds.ttl to /var/data/gnd/data at 2020-07-19T07:08:15Z
finished loading authorities-kongress_lds.ttl to /var/data/gnd/data at 2020-07-19T07:11:16Z
2020-09-15
./getjena
apache-jena-3.16.0.tar.gz already downloaded
apache-jena-3.16.0 already unpacked
creating /var/data/gnd/data directory
creating temporary directory /var/data/gnd/tmp
--2020-09-15 11:46:37-- https://data.dnb.de/opendata/authorities-kongress_lds.ttl.gz
Resolving data.dnb.de (data.dnb.de)... 193.175.100.140
Connecting to data.dnb.de (data.dnb.de)|193.175.100.140|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 80381713 (77M) [application/x-gzip]
Saving to: ‘authorities-kongress_lds.ttl.gz’
authorities-kongres 100%[===================>] 76.66M 66.3MB/s in 1.2s
2020-09-15 11:46:38 (66.3 MB/s) - ‘authorities-kongress_lds.ttl.gz’ saved [80381713/80381713]
start loading authorities-kongress_lds.ttl to /var/data/gnd/data at 2020-09-15T09:46:42Z
finished loading authorities-kongress_lds.ttl to /var/data/gnd/data at 2020-09-15T09:50:25Z
--2020-09-15 11:50:25-- https://data.dnb.de/opendata/authorities-geografikum_lds.ttl.gz
Resolving data.dnb.de (data.dnb.de)... 193.175.100.140
Connecting to data.dnb.de (data.dnb.de)|193.175.100.140|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 33842092 (32M) [application/x-gzip]
Saving to: ‘authorities-geografikum_lds.ttl.gz’
authorities-geograf 100%[===================>] 32.27M 53.4MB/s in 0.6s
2020-09-15 11:50:26 (53.4 MB/s) - ‘authorities-geografikum_lds.ttl.gz’ saved [33842092/33842092]
start loading authorities-geografikum_lds.ttl to /var/data/gnd/data at 2020-09-15T09:50:27Z
finished loading authorities-geografikum_lds.ttl to /var/data/gnd/data at 2020-09-15T09:52:40Z
start fuseki
#!/bin/bash
# WF 2020-06-25
# Jena Fuseki server installation
# see https://jena.apache.org/documentation/fuseki2/fuseki-run.html
#
# downloads and unpacks Fuseki if necessary and then starts the server
# with the TDB2 store at $gnddata under the dataset name /gnd
# the script is terminated with exit code 1 if download, unpacking
# or changing into the Fuseki directory fails
#
# NOTE(review): the gnd2jena loader script on this page is set to
# apache-jena-4.4.0 while Fuseki 3.16.0 is used here - confirm that the
# two versions are meant to be used together
version=3.16.0
fuseki=apache-jena-fuseki-$version
if [ ! -d "$fuseki" ]
then
  if [ ! -f "$fuseki.tar.gz" ]
  then
    # https as in the loader script
    if ! wget "https://archive.apache.org/dist/jena/binaries/$fuseki.tar.gz"
    then
      echo "download of $fuseki.tar.gz failed" 1>&2
      # remove a partial download - otherwise the next run
      # would consider the archive "already downloaded"
      rm -f "$fuseki.tar.gz"
      exit 1
    fi
  else
    echo "$fuseki.tar.gz already downloaded"
  fi
  echo "unpacking $fuseki.tar.gz"
  if ! tar xvfz "$fuseki.tar.gz"
  then
    echo "unpacking $fuseki.tar.gz failed" 1>&2
    exit 1
  fi
else
  echo "$fuseki already downloaded and unpacked"
fi
# fuseki-server.jar is addressed relative to the Fuseki directory
if ! cd "$fuseki"
then
  echo "cannot change to directory $fuseki" 1>&2
  exit 1
fi
gnddata=/var/data/gnd/data
java -jar fuseki-server.jar --tdb2 --loc=$gnddata /gnd
wf@confident23:/usr/local/src$