Truly Tabular RDF
Wikibase
Report "GND meets Wikibase" 2 Barbara Fischer
Confluence
Links
GND property multiplicity
Based on analysis of https://data.dnb.de/opendata/authorities-kongress_lds.ttl.gz as of 2021-12
Property multiplicity
property |
gnd |
total |
unique |
min |
max |
avg
|
eventId
|
gnd:gndIdentifier
|
731651
|
731651
|
1
|
1
|
1
|
title
|
gnd:preferredNameForTheConferenceOrEvent
|
731645
|
731645
|
0
|
1
|
0.999991799
|
acronym
|
gnd:abbreviatedNameForTheConferenceOrEvent
|
3537
|
3206
|
0
|
4
|
0.00483
|
sameAs
|
owl:sameAs
|
769120
|
693077
|
0
|
20
|
1.05
|
variant
|
gnd:variantNameForTheConferenceOrEvent
|
632368
|
229268
|
0
|
41
|
0.86
|
date
|
gnd:dateOfConferenceOrEvent
|
710819
|
704949
|
0
|
9
|
0.971
|
areaCode
|
gnd:geographicAreaCode
|
797037
|
612631
|
0
|
11
|
1.089
|
place
|
gnd:placeOfConferenceOrEvent
|
659305
|
624667
|
0
|
18
|
0.901
|
topic
|
gnd:topic
|
5061
|
3520
|
0
|
6
|
0.00691
|
homepage
|
gnd:homepage
|
19011
|
18702
|
0
|
3
|
0.026
|
prec
|
gnd:precedingConferenceOrEvent
|
12182
|
12106
|
0
|
3
|
0.0166
|
succ
|
gnd:succeedingConferenceOrEvent
|
11974
|
11929
|
0
|
3
|
0.0163
|
performance optimized query of GND event details
# Performance-optimized query of GND event details.
# Multi-valued properties are collapsed to one row per event: a single
# representative value (MIN), a distinct count and a "| "-separated list.
# WF 2021-12-05
PREFIX gndi: <https://d-nb.info/gnd>
PREFIX gnd:  <https://d-nb.info/standards/elementset/gnd#>
PREFIX gndo: <https://d-nb.info/standards/vocab/gnd/>
PREFIX rdf:  <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX owl:  <http://www.w3.org/2002/07/owl#>
PREFIX dc:   <http://purl.org/dc/terms/>
PREFIX wdrs: <http://www.w3.org/2007/05/powder-s#>

SELECT
  ?event
  ?eventId
  (MIN(?eventTitle) AS ?title)
  (COUNT(DISTINCT ?eventDate) AS ?dateCount)
  (MIN(?eventDate) AS ?date)
  (MIN(?eventAcronym) AS ?acronym)
  (COUNT(DISTINCT ?eventAcronym) AS ?acronymCount)
  (GROUP_CONCAT(DISTINCT ?eventAcronym; SEPARATOR="| ") AS ?acronyms)
  (MIN(?eventVariant) AS ?variant)
  (COUNT(DISTINCT ?eventVariant) AS ?variantCount)
  (GROUP_CONCAT(DISTINCT ?eventVariant; SEPARATOR="| ") AS ?variants)
  (MIN(?eventPlace) AS ?place)
  (COUNT(DISTINCT ?eventPlace) AS ?placeCount)
  (GROUP_CONCAT(DISTINCT ?eventPlace; SEPARATOR="| ") AS ?places)
  (MIN(?eventHomepage) AS ?homepage)
WHERE {
  # Required: type, identifier and preferred name. Events lacking any
  # of these do not appear in the result.
  ?event a gnd:ConferenceOrEvent ;
         gnd:gndIdentifier ?eventId ;
         gnd:preferredNameForTheConferenceOrEvent ?eventTitle .

  # Optional properties; each may occur zero or more times per event.
  OPTIONAL { ?event gnd:abbreviatedNameForTheConferenceOrEvent ?eventAcronym . }
  OPTIONAL { ?event gnd:homepage ?eventHomepage . }
  OPTIONAL { ?event gnd:variantNameForTheConferenceOrEvent ?eventVariant . }
  OPTIONAL { ?event gnd:dateOfConferenceOrEvent ?eventDate . }
  OPTIONAL { ?event gnd:placeOfConferenceOrEvent ?eventPlace . }

  # Rarely populated properties, left disabled:
  # only available 3520 times 2021-12
  # ?event gnd:topic ?topic.
  # only available 12106 times 2021-12
  # ?event gnd:precedingConferenceOrEvent ?prec
  # only available 11929 times 2021-12
  # ?event gnd:succeedingConferenceOrEvent ?succ
}
GROUP BY ?event ?eventId
query to analyze multiplicity
# Aggregate usage counts for one GND property.
# To fill the property multiplicity table on this wiki page, enable exactly
# one pattern inside the OPTIONAL block (and count its variable), then run
# the query twice per property:
#   1. without the HAVING clause: sum / min / max / avg over all events
#   2. with the HAVING clause: only events with exactly one value
#      (presumably the "unique" column of the table)
# WF 2021-12-05
PREFIX gndi: <https://d-nb.info/gnd>
PREFIX gnd:  <https://d-nb.info/standards/elementset/gnd#>
PREFIX gndo: <https://d-nb.info/standards/vocab/gnd/>
PREFIX rdf:  <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX owl:  <http://www.w3.org/2002/07/owl#>
PREFIX dc:   <http://purl.org/dc/terms/>
PREFIX wdrs: <http://www.w3.org/2007/05/powder-s#>

SELECT
  (SUM(?itemCount) AS ?sum)
  (MIN(?itemCount) AS ?min)
  (MAX(?itemCount) AS ?max)
  (AVG(?itemCount) AS ?avg)
{
  # Inner query: number of values of the selected property per event.
  # The OPTIONAL keeps events without a value, counted as 0.
  SELECT ?event ?eventId (COUNT(?title) AS ?itemCount)
  WHERE {
    ?event a gnd:ConferenceOrEvent ;
           gnd:gndIdentifier ?eventId .
    OPTIONAL {
      # ?event gnd:abbreviatedNameForTheConferenceOrEvent ?acronym.
      # ?event owl:sameAs ?sameAs.
      # ?event gnd:variantNameForTheConferenceOrEvent ?variant.
      ?event gnd:preferredNameForTheConferenceOrEvent ?title .
      # ?event gnd:dateOfConferenceOrEvent ?date
      # ?event gnd:geographicAreaCode ?areaCode.
      # ?event gnd:placeOfConferenceOrEvent ?place.
      # ?event gnd:topic ?topic.
      # ?event gnd:homepage ?homepage.
      # ?event gnd:precedingConferenceOrEvent ?prec
      # ?event gnd:succeedingConferenceOrEvent ?succ
    }
  }
  GROUP BY ?event ?eventId
  #HAVING(COUNT(?title) = 1)
}
SPARQL Queries
entities and usage frequency
# Get histogram data of entity types (classes) by usage frequency.
# Only classes with more than 100 instances are reported, most frequent first.
# WF 2020-06-27
PREFIX gnd: <https://d-nb.info/standards/elementset/gnd#>
SELECT ?c (COUNT(?c) AS ?count)
WHERE {
  ?subject a ?c
}
GROUP BY ?c
# The aggregate is repeated here instead of referencing ?count: SPARQL 1.1
# evaluates HAVING before SELECT aliases are bound, so filtering on the
# alias only works on engines that support it as an extension.
HAVING (COUNT(?c) > 100)
ORDER BY DESC(?count)
c |
count
|
gnd#ConferenceOrEvent |
713310
|
gnd#TerritorialCorporateBodyOrAdministrativeUnit |
188246
|
gnd#SeriesOfConferenceOrEvent |
122970
|
gnd#BuildingOrMemorial |
67149
|
http://www.opengis.net/ont/sf#Point |
57987
|
gnd#PlaceOrGeographicName |
27771
|
gnd#NaturalGeographicUnit |
20269
|
gnd#AdministrativeUnit |
12846
|
gnd#WayBorderOrLine |
4971
|
gnd#ReligiousTerritory |
2646
|
gnd#NameOfSmallGeographicUnitLyingWithinAnotherGeographicUnit |
2113
|
gnd#CorporateBody |
559
|
gnd#MemberState |
543
|
gnd#Country |
307
|
gnd#ExtraterrestrialTerritory |
282
|
gnd#Language |
193
|
gnd#ReligiousCorporateBody |
155
|
gnd#ReligiousAdministrativeUnit |
134
|
gnd#HistoricSingleEventOrEra |
128
|
relevance of fields
# Get histogram data of properties by usage frequency.
# Only properties used more than 1000 times are reported, most frequent first.
# WF 2020-07-12
PREFIX gnd: <https://d-nb.info/standards/elementset/gnd#>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX dc: <http://purl.org/dc/terms/>
PREFIX wdrs: <http://www.w3.org/2007/05/powder-s#>
SELECT ?property (COUNT(?property) AS ?propTotal)
WHERE { ?s ?property ?o . }
GROUP BY ?property
# The aggregate is repeated here instead of referencing ?propTotal: SPARQL 1.1
# evaluates HAVING before SELECT aliases are bound, so filtering on the
# alias only works on engines that support it as an extension.
HAVING (COUNT(?property) > 1000)
ORDER BY DESC(?propTotal)
property |
propTotal
|
http://www.w3.org/2002/07/owl#sameAs |
1359115
|
http://www.w3.org/1999/02/22-rdf-syntax-ns#type |
1222895
|
gnd#geographicAreaCode |
1195102
|
http://purl.org/dc/terms/license |
1149468
|
http://purl.org/dc/terms/modified |
1149468
|
http://www.w3.org/2007/05/powder-s#describedby |
1149468
|
gnd#gndIdentifier |
1149468
|
gnd#oldAuthorityNumber |
940550
|
gnd#preferredNameForTheConferenceOrEvent |
836397
|
gnd#variantNameForTheConferenceOrEvent |
720355
|
gnd#dateOfConferenceOrEvent |
693196
|
gnd#placeOfConferenceOrEvent |
650165
|
gnd#preferredNameForThePlaceOrGeographicName |
313058
|
gnd#variantNameForThePlaceOrGeographicName |
298383
|
gnd#gndSubjectCategory |
176395
|
gnd#definition |
157695
|
gnd#broaderTermInstantial |
117334
|
gnd#place |
82345
|
http://xmlns.com/foaf/0.1/page |
74542
|
http://www.opengis.net/ont/geosparql#asWKT |
58083
|
http://www.opengis.net/ont/geosparql#hasGeometry |
58083
|
https://d-nb.info/standards/elementset/dnb#deprecatedUri |
51576
|
gnd#biographicalOrHistoricalInformation |
34190
|
gnd#organizerOrHost |
34167
|
gnd#relatedDdcWithDegreeOfDeterminacy2 |
25452
|
gnd#succeedingPlaceOrGeographicName |
25208
|
gnd#broaderTermPartitive |
25111
|
gnd#dateOfEstablishment |
24772
|
gnd#homepage |
24439
|
gnd#precedingPlaceOrGeographicName |
22490
|
gnd#dateOfTermination |
21582
|
gnd#hierarchicalSuperiorOfPlaceOrGeographicName |
20522
|
gnd#precedingConferenceOrEvent |
17164
|
gnd#succeedingConferenceOrEvent |
16963
|
gnd#dateOfProduction |
15106
|
gnd#spatialAreaOfActivity |
13593
|
gnd#hierarchicalSuperiorOfTheConferenceOrEvent |
10126
|
gnd#architect |
8206
|
gnd#topic |
6088
|
gnd#relatedPlaceOrGeographicName |
5812
|
gnd#abbreviatedNameForTheConferenceOrEvent |
5312
|
gnd#complexSeeReferenceSubject |
3927
|
gnd#startingOrFinalPointOfADistance |
3531
|
gnd#relatedConferenceOrEvent |
3124
|
gnd#relatedDdcWithDegreeOfDeterminacy4 |
2709
|
gnd#relatedCorporateBody |
2354
|
gnd#relatedTerm |
1728
|
gnd#sponsorOrPatron |
1281
|
gnd#relatedDdcWithDegreeOfDeterminacy3 |
1200
|
gnd#exhibitor |
1044
|
done
|
2020-06-25
|
todo
|
Unfortunately the headlines and the forms seem to be mixed up.
|
events with most often used fields and seldom but useful fields
# Get events with the most frequently used GND columns,
# plus acronym, topic and homepage (seldom but useful).
# The result is not aggregated: an entity with several values for a
# property yields one row per combination of values.
# NOTE(review): there is no type restriction, so every subject that has a
# gnd:gndIdentifier matches, not only conferences/events - confirm intent.
# WF 2020-07-12
PREFIX gndi: <https://d-nb.info/gnd>
PREFIX gnd:  <https://d-nb.info/standards/elementset/gnd#>
PREFIX gndo: <https://d-nb.info/standards/vocab/gnd/>
PREFIX rdf:  <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX owl:  <http://www.w3.org/2002/07/owl#>
PREFIX dc:   <http://purl.org/dc/terms/>
PREFIX wdrs: <http://www.w3.org/2007/05/powder-s#>

SELECT
  ?event ?eventId
  ?acronym ?variant ?name
  ?date ?areaCode ?place
  ?topic ?homepage
WHERE {
  # the identifier is the only mandatory property
  ?event gnd:gndIdentifier ?eventId .
  OPTIONAL { ?event gnd:abbreviatedNameForTheConferenceOrEvent ?acronym . }
  OPTIONAL { ?event gnd:variantNameForTheConferenceOrEvent ?variant . }
  OPTIONAL { ?event gnd:preferredNameForTheConferenceOrEvent ?name . }
  OPTIONAL { ?event gnd:dateOfConferenceOrEvent ?date . }
  OPTIONAL { ?event gnd:geographicAreaCode ?areaCode . }
  OPTIONAL { ?event gnd:placeOfConferenceOrEvent ?place . }
  OPTIONAL { ?event gnd:topic ?topic . }
  OPTIONAL { ?event gnd:homepage ?homepage . }
}
#LIMIT 10000
gnd2jena
#!/bin/bash
# WF 2020-05-10

# --- global settings ---
# Apache Jena release and the archive it is distributed as
jena="apache-jena-4.4.0"
tgz="${jena}.tar.gz"
# download location (alternative mirror kept for reference)
#mirror=https://downloads.apache.org/jena/binaries
mirror="https://archive.apache.org/dist/jena/binaries"
jenaurl="${mirror}/${tgz}"
# base directory of the GND store (alternatives kept for reference)
base="/hd/seel/gnd"
#base=/hd/torterra/gnd
#base=/hd/luxio/gnd
# directory passed to the loader as --loc
data="${base}/data"
# TDB2 bulk loader shipped with the Jena distribution
tdbloader="${jena}/bin/tdb2.tdbloader"
#
# make sure the Jena archive is downloaded and unpacked
# and that the data directory exists
#
getjena() {
  # download the archive unless it is already present
  if [[ -f "$tgz" ]]; then
    echo "$tgz already downloaded"
  else
    echo "downloading $tgz from $jenaurl"
    wget "$jenaurl"
  fi

  # unpack the archive unless the distribution directory exists
  if [[ -d "$jena" ]]; then
    echo "$jena already unpacked"
  else
    echo "unpacking $jena from $tgz"
    tar xvzf "$tgz"
  fi

  # create the data directory if it is missing
  if [[ -d "$data" ]]; then
    echo "$data directory already created"
  else
    echo "creating $data directory"
    mkdir -p "$data"
  fi
}
#
# print the given message followed by the current UTC time
# params
#   1: msg - the message to print
#
timestamp() {
  local msg="$1"
  local now
  # ISO 8601 style UTC timestamp, e.g. 2020-07-19T07:08:15Z
  now=$(date -u +"%Y-%m-%dT%H:%M:%SZ")
  echo "$msg at $now"
}
#
# load data for the given data dir and input
# params
#   1: data  - TDB2 location passed to the loader as --loc
#   2: input - RDF file to load
# stdout and stderr of the loader go to tdb2-<tag>-out.log and
# tdb2-<tag>-err.log in the current directory
#
loaddata() {
  local data="$1"
  local input="$2"
  local tag
  # The log names used to depend on $phase alone, which is not set anywhere
  # in this script, so every input wrote to tdb2--out.log / tdb2--err.log
  # and each load overwrote the logs of the previous one. An externally set
  # $phase is still honored; otherwise the input's base name is used.
  tag="${phase:-$(basename "$input" .ttl)}"
  timestamp "start loading $input to $data"
  "$tdbloader" --loc "$data" "$input" > "tdb2-$tag-out.log" 2> "tdb2-$tag-err.log"
  timestamp "finished loading $input to $data"
}
# main: install Jena, prepare a temporary directory, then download
# and load each GND dump
getjena
# use a temporary directory below $base
export TMPDIR=$base/tmp
if [ ! -d "$TMPDIR" ]
then
  echo "creating temporary directory $TMPDIR"
  # -p: do not fail if a parent directory is missing
  mkdir -p "$TMPDIR"
else
  echo "using temporary directory $TMPDIR"
fi
for d in kongress geografikum
do
  file=authorities-${d}_lds.ttl
  if [ ! -f "$file" ]
  then
    # Without this check a failed download or unpack was ignored and the
    # loader was started on a file that does not exist.
    if ! wget "https://data.dnb.de/opendata/$file.gz" || ! gunzip "$file.gz"
    then
      echo "could not download or unpack $file.gz - skipping $d" 1>&2
      continue
    fi
  else
    echo "$file already downloaded"
  fi
  loaddata "$data" "$file"
done
try on confident23 server
2020-07-19
wf@confident23:/usr/local/src$ sudo ./getjena
apache-jena-3.16.0.tar.gz already downloaded
apache-jena-3.16.0 already unpacked
creating /var/data/gnd/data directory
creating temporary directory /var/data/gnd/tmp
start loading authorities-kongress_lds.ttl to /var/data/gnd/data at 2020-07-19T07:08:15Z
finished loading authorities-kongress_lds.ttl to /var/data/gnd/data at 2020-07-19T07:11:16Z
2020-09-15
./getjena
apache-jena-3.16.0.tar.gz already downloaded
apache-jena-3.16.0 already unpacked
creating /var/data/gnd/data directory
creating temporary directory /var/data/gnd/tmp
--2020-09-15 11:46:37-- https://data.dnb.de/opendata/authorities-kongress_lds.ttl.gz
Resolving data.dnb.de (data.dnb.de)... 193.175.100.140
Connecting to data.dnb.de (data.dnb.de)|193.175.100.140|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 80381713 (77M) [application/x-gzip]
Saving to: ‘authorities-kongress_lds.ttl.gz’
authorities-kongres 100%[===================>] 76.66M 66.3MB/s in 1.2s
2020-09-15 11:46:38 (66.3 MB/s) - ‘authorities-kongress_lds.ttl.gz’ saved [80381713/80381713]
start loading authorities-kongress_lds.ttl to /var/data/gnd/data at 2020-09-15T09:46:42Z
finished loading authorities-kongress_lds.ttl to /var/data/gnd/data at 2020-09-15T09:50:25Z
--2020-09-15 11:50:25-- https://data.dnb.de/opendata/authorities-geografikum_lds.ttl.gz
Resolving data.dnb.de (data.dnb.de)... 193.175.100.140
Connecting to data.dnb.de (data.dnb.de)|193.175.100.140|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 33842092 (32M) [application/x-gzip]
Saving to: ‘authorities-geografikum_lds.ttl.gz’
authorities-geograf 100%[===================>] 32.27M 53.4MB/s in 0.6s
2020-09-15 11:50:26 (53.4 MB/s) - ‘authorities-geografikum_lds.ttl.gz’ saved [33842092/33842092]
start loading authorities-geografikum_lds.ttl to /var/data/gnd/data at 2020-09-15T09:50:27Z
finished loading authorities-geografikum_lds.ttl to /var/data/gnd/data at 2020-09-15T09:52:40Z
start fuseki
#!/bin/bash
# WF 2020-06-25
# Jena Fuseki server installation
# see https://jena.apache.org/documentation/fuseki2/fuseki-run.html
#
# Downloads and unpacks the Fuseki distribution if necessary, then starts
# the server on the TDB2 database in $gnddata under the service name /gnd.
version=3.16.0
fuseki=apache-jena-fuseki-$version
if [ ! -d "$fuseki" ]
then
  if [ ! -f "$fuseki.tar.gz" ]
  then
    # https instead of plain http; stop if the download fails so that
    # tar is not run on a missing or partial archive
    wget "https://archive.apache.org/dist/jena/binaries/$fuseki.tar.gz" || exit 1
  else
    echo "$fuseki.tar.gz already downloaded"
  fi
  echo "unpacking $fuseki.tar.gz"
  tar xvfz "$fuseki.tar.gz" || exit 1
else
  echo "$fuseki already downloaded and unpacked"
fi
# fuseki-server.jar is addressed relative to the distribution directory,
# so do not try to start the server if the directory change fails
cd "$fuseki" || exit 1
gnddata=/var/data/gnd/data
java -jar fuseki-server.jar --tdb2 --loc="$gnddata" /gnd
wf@confident23:/usr/local/src$