From 4ea729a6590ead40ac800d5592b22e5c7bc0a16a Mon Sep 17 00:00:00 2001 From: "Francesca.L.Bleken@sintef.no" Date: Tue, 16 Sep 2025 14:50:20 +0200 Subject: [PATCH 01/19] Add criteria with no object --- tests/datadoc/test_dataset.py | 28 ++++++++++++++++++++++++- tests/input/semdata.yaml | 11 ++++++++++ tripper/datadoc/dataset.py | 39 ++++++++++++++++++++++++++++------- 3 files changed, 69 insertions(+), 9 deletions(-) diff --git a/tests/datadoc/test_dataset.py b/tests/datadoc/test_dataset.py index d1bb2294..43f89fb9 100644 --- a/tests/datadoc/test_dataset.py +++ b/tests/datadoc/test_dataset.py @@ -538,6 +538,30 @@ def test_datadoc(): SEMDATA["SEM_cement_batch2/77600-23-001/77600-23-001_5kV_400x_m001"], } + # Filter on criterion, but without required value + assert set( + search( + ts, + criteria={"creator.name": ""}, + ) + ) == { + SEMDATA["SEM_cement_batch2/77600-23-001/77600-23-001_5kV_400x_m001"], + SEMDATA["SEM_cement_batch2/77600-23-001"], + SEMDATA["SEM_cement_batch2"], + } + + # Filter on criterion, but without required value + assert set( + search( + ts, + criteria={"creator.name": None}, + ) + ) == { + SEMDATA["SEM_cement_batch2/77600-23-001/77600-23-001_5kV_400x_m001"], + SEMDATA["SEM_cement_batch2/77600-23-001"], + SEMDATA["SEM_cement_batch2"], + } + with pytest.raises(NoSuchTypeError): search(ts, type="invalid-type") @@ -545,10 +569,12 @@ def test_datadoc(): assert set(search(ts, regex={"dcterms:title": "SEM images"})) == { SEMDATA.SEM_cement_batch2, SAMPLE["SEM_cement_batch2/77600-23-001"], + SEMDATA.SEM_cement_missingcreator, } assert set(search(ts, regex={"dcterms:title": "SEM i[^ ]*s"})) == { SEMDATA.SEM_cement_batch2, SAMPLE["SEM_cement_batch2/77600-23-001"], + SEMDATA.SEM_cement_missingcreator, } # Get individual with given IRI @@ -627,7 +653,7 @@ def test_validate(): def test_pipeline(): """Test creating OTEAPI pipeline.""" - pytest.skip() + # pytest.skip() from tripper import Triplestore diff --git a/tests/input/semdata.yaml 
b/tests/input/semdata.yaml index ce66d8d1..0835f1c7 100644 --- a/tests/input/semdata.yaml +++ b/tests/input/semdata.yaml @@ -73,6 +73,17 @@ Dataset: downloadURL: sftp://nas.aimen.es/P_MATCHMAKER_SHARE_SINTEF/SEM_cement_batch2 mediaType: inode/directory + - "@id": semdata:SEM_cement_missingcreator + "@type": sem:SEMImageSeries + title: Nested series of SEM images which is missing a creator + description: ... + + distribution: + downloadURL: sftp://nas.aimen.es/P_MATCHMAKER_SHARE_SINTEF/SEM_cement_batch_missingcreator + mediaType: inode/directory + + + Parser: - "@id": par:sem_hitachi diff --git a/tripper/datadoc/dataset.py b/tripper/datadoc/dataset.py index 9542f7d1..a81b1253 100644 --- a/tripper/datadoc/dataset.py +++ b/tripper/datadoc/dataset.py @@ -1336,15 +1336,19 @@ def make_query( if not isinstance(typ, str): typ = typ[0] crit.append(f"?iri rdf:type <{ts.expand_iri(typ)}> .") # type: ignore + print(filters) def add_crit(k, v, regex=False, s="iri"): """Add criteria to SPARQL query.""" nonlocal n + + print(k, v) key = f"@{k[1:]}" if k.startswith("_") else k if isinstance(v, list): for ele in v: add_crit(key, ele, regex=regex, s=s) return + print(key, k, v) if re.match(r"^[_a-zA-Z0.9]+\.", key): newkey, restkey = key.split(".", 1) if newkey in expanded: @@ -1369,12 +1373,18 @@ def add_crit(k, v, regex=False, s="iri"): n += 1 var = f"v{n}" crit.append(f"?{s} <{ts.expand_iri(key)}> ?{var} .") - if regex: - filters.append( - f"FILTER REGEX(STR(?{var}), {value}{flags_arg}) ." - ) - else: - filters.append(f"FILTER(STR(?{var}) = {value}) .") + if value not in ["", None, '""']: + print(f"value is: {value}") + + if regex: + filters.append( + f"FILTER REGEX(STR(?{var}), {value}{flags_arg}) ." 
+ ) + else: + filters.append(f"FILTER(STR(?{var}) = {value}) .") + print("-----------------") + print(filters) + print("----------------------") for k, v in criteria.items(): add_crit(k, v) @@ -1393,6 +1403,9 @@ def add_crit(k, v, regex=False, s="iri"): {where_statements} }} """ + print("====================") + print(query) + print("====================") return query @@ -1411,9 +1424,12 @@ def search( Arguments: ts: Triplestore to search. type: Either a [resource type] (ex: "Dataset", "Distribution", ...) - or the IRI of a class to limit the search to. + or the IRI of a class to limit the search to. Can also be given + as a list of resource types or IRIs. criteria: Exact match criteria. A dict of IRI, value pairs, where the - IRIs refer to data properties on the resource match. The IRIs + IRIs refer to data properties on the resource match. If more than + one value is desire for a given criteria, values can be provided + in a list. The IRIs may use any prefix defined in `ts`. E.g. if the prefix `dcterms` is in `ts`, it is expanded and the match criteria `dcterms:title` is correctly parsed. 
@@ -1442,10 +1458,17 @@ def search( search(ts, criteria={"contactPoint.hasName": "John Doe"}) + List IRIs of all resources with John Doe and Jane Doe as `contactPoint`: + + search(ts, criteria={"contactPoint.hasName": ["John Doe", "Jane Doe"]}) + List IRIs of all samples: search(ts, type=CHAMEO.Sample) + List IRIs of all samples that are liquids: + search(ts, type=[CHAMEO.Sample, EMMO.Liquid] ) + List IRIs of all datasets with John Doe as `contactPoint` AND are measured on a given sample: From b62fba73698043da74818e81d5328e4744692618 Mon Sep 17 00:00:00 2001 From: "Francesca.L.Bleken@sintef.no" Date: Tue, 16 Sep 2025 18:13:07 +0200 Subject: [PATCH 02/19] None in key for criteria --- tests/datadoc/test_dataset.py | 35 +++++++++++ tests/input/semdata.yaml | 6 +- tripper/datadoc/dataset.py | 111 ++++++++++++++++++++++++++++++---- 3 files changed, 139 insertions(+), 13 deletions(-) diff --git a/tests/datadoc/test_dataset.py b/tests/datadoc/test_dataset.py index 43f89fb9..12720903 100644 --- a/tests/datadoc/test_dataset.py +++ b/tests/datadoc/test_dataset.py @@ -562,6 +562,41 @@ def test_datadoc(): SEMDATA["SEM_cement_batch2"], } + # Filter on criterion, but with any predicate + assert set( + search( + ts, + criteria={None: ["Named Lab Assistant"]}, + ) + ) == { + SEMDATA["SEM_cement_batch2/77600-23-001/77600-23-001_5kV_400x_m001"], + SEMDATA["SEM_cement_missingcreator"], + } + + # Filter on criterion, but with any predicate + assert set( + search( + ts, + criteria={None: "Named Lab Assistant"}, + ) + ) == { + SEMDATA["SEM_cement_batch2/77600-23-001/77600-23-001_5kV_400x_m001"], + SEMDATA["SEM_cement_missingcreator"], + } + + # Filter on more criteria with any predicate, testlabel tests that + # indirect search through inSeries works. 
+ assert set( + search( + ts, + criteria={None: ["Named Lab Assistant", "testlabel"]}, + ) + ) == { + SEMDATA["SEM_cement_batch2/77600-23-001/77600-23-001_5kV_400x_m001"], + SEMDATA["SEM_cement_missingcreator"], + SEMDATA["SEM_cement_batch2/77600-23-001"], + } + with pytest.raises(NoSuchTypeError): search(ts, type="invalid-type") diff --git a/tests/input/semdata.yaml b/tests/input/semdata.yaml index 0835f1c7..176cf8f4 100644 --- a/tests/input/semdata.yaml +++ b/tests/input/semdata.yaml @@ -9,6 +9,7 @@ prefixes: dm: http://onto-ns.com/meta/characterisation/0.1/SEMImage# par: http://sintef.no/dlite/parser# gen: http://sintef.no/dlite/generator# + chameo: https://w3id.org/emmo/domain/characterisation-methodology/chameo# # List of documented datasets @@ -68,7 +69,7 @@ Dataset: contactPoint: hasName: Sigurd Wenner hasEmail: - + label: testlabel distribution: downloadURL: sftp://nas.aimen.es/P_MATCHMAKER_SHARE_SINTEF/SEM_cement_batch2 mediaType: inode/directory @@ -77,6 +78,8 @@ Dataset: "@type": sem:SEMImageSeries title: Nested series of SEM images which is missing a creator description: ... 
+ curator: + - name: Named Lab Assistant distribution: downloadURL: sftp://nas.aimen.es/P_MATCHMAKER_SHARE_SINTEF/SEM_cement_batch_missingcreator @@ -85,6 +88,7 @@ Dataset: + Parser: - "@id": par:sem_hitachi "@type": oteio:Parser diff --git a/tripper/datadoc/dataset.py b/tripper/datadoc/dataset.py index a81b1253..b91e2b77 100644 --- a/tripper/datadoc/dataset.py +++ b/tripper/datadoc/dataset.py @@ -1336,19 +1336,93 @@ def make_query( if not isinstance(typ, str): typ = typ[0] crit.append(f"?iri rdf:type <{ts.expand_iri(typ)}> .") # type: ignore - print(filters) def add_crit(k, v, regex=False, s="iri"): """Add criteria to SPARQL query.""" nonlocal n - print(k, v) - key = f"@{k[1:]}" if k.startswith("_") else k + def _to_value_token(x): + # Turn a Python value into a SPARQL term + if x in expanded: + return f"<{expanded[x]}>" + if isinstance(x, str): + return ( + f"<{x}>" + if re.match("^[a-z][a-z0-9.+-]*://", x) + else f'"{x}"' + ) + return x + + key = None if k is None else (f"@{k[1:]}" if k.startswith("_") else k) + + if key is None: + # any predicate on first hop; keep ?s (= ?iri) as the resource + + n += 1 + pvar = f"p{n}" + bn = f"bn{n}" + n += 1 + qvar = f"q{n}" + var = f"v{n}" + + # ?s ?p ?bn . ?bn ?q ?var . + crit.append(f"?{s} ?{pvar} ?{bn} .") + crit.append(f"?{bn} ?{qvar} ?{var} .") + # Only return non-blank subjects + if s == "iri": + filters.append("FILTER(!isBlank(?iri)) .") + + # Support list of values → VALUES (equality) or a single alternation for regex + if isinstance(v, list): + if regex: + pattern = "(" + "|".join(str(p) for p in v) + ")" + filters.append( + f'FILTER REGEX(STR(?{var}), "{pattern}"{flags_arg}) .' 
+ ) + else: + vals = [] + for ele in v: + if ele in expanded: + vals.append(f"<{expanded[ele]}>") + elif isinstance(ele, str): + vals.append( + f"<{ele}>" + if re.match("^[a-z][a-z0-9.+-]*://", ele) + else f'"{ele}"' + ) + elif ele not in ("", None): + vals.append(ele) + if vals: + crit.append(f"VALUES ?{var} {{ {' '.join(vals)} }}") + else: + # single value + if v in expanded: + value = f"<{expanded[v]}>" + elif isinstance(v, str): + value = ( + f"<{v}>" + if re.match("^[a-z][a-z0-9.+-]*://", v) + else f'"{v}"' + ) + else: + value = v + if value not in ["", None, '""']: + if regex: + filters.append( + f"FILTER REGEX(STR(?{var}), {value}{flags_arg}) ." + ) + else: + # If it's an IRI token, compare directly; otherwise compare STR() + if isinstance(value, str) and value.startswith("<"): + filters.append(f"FILTER(?{var} = {value}) .") + else: + filters.append(f"FILTER(STR(?{var}) = {value}) .") + return + if isinstance(v, list): for ele in v: add_crit(key, ele, regex=regex, s=s) return - print(key, k, v) if re.match(r"^[_a-zA-Z0.9]+\.", key): newkey, restkey = key.split(".", 1) if newkey in expanded: @@ -1373,8 +1447,8 @@ def add_crit(k, v, regex=False, s="iri"): n += 1 var = f"v{n}" crit.append(f"?{s} <{ts.expand_iri(key)}> ?{var} .") + if value not in ["", None, '""']: - print(f"value is: {value}") if regex: filters.append( @@ -1382,9 +1456,6 @@ def add_crit(k, v, regex=False, s="iri"): ) else: filters.append(f"FILTER(STR(?{var}) = {value}) .") - print("-----------------") - print(filters) - print("----------------------") for k, v in criteria.items(): add_crit(k, v) @@ -1395,6 +1466,8 @@ def add_crit(k, v, regex=False, s="iri"): for k, v in regex.items(): add_crit(k, v, regex=True) + # Make sure that iris are iris (not blank nodes) + filters.append("FILTER(!isBlank(?iri)) .") where_statements = "\n ".join(crit + filters) query = f""" PREFIX rdf: <{RDF}> @@ -1403,9 +1476,6 @@ def add_crit(k, v, regex=False, s="iri"): {where_statements} }} """ - 
print("====================") - print(query) - print("====================") return query @@ -1428,11 +1498,20 @@ def search( as a list of resource types or IRIs. criteria: Exact match criteria. A dict of IRI, value pairs, where the IRIs refer to data properties on the resource match. If more than - one value is desire for a given criteria, values can be provided + one value is desired for a given criterion, values can be provided in a list. The IRIs may use any prefix defined in `ts`. E.g. if the prefix `dcterms` is in `ts`, it is expanded and the match criteria `dcterms:title` is correctly parsed. + + If the object (value) is given as None or "", all matches + that have any value for the given criterion are returned. + + If predicate (key) is given as None search on all objects irrespective + of predicate is performed. + + Note that more than one value broadens the + search, i.e. it is an OR operation. regex: Like `criteria` but the values in the provided dict are regular expressions used for the matching. flags: Flags passed to regular expressions. @@ -1462,6 +1541,14 @@ def search( search(ts, criteria={"contactPoint.hasName": ["John Doe", "Jane Doe"]}) + List IRIs of all resources that have a `contactPoint`: + + search(ts, criteria={"contactPoint.hasName": None}) + + List IRIs of all resources that have Jane Doe or Blue as object (value): + + search(ts, criteria={None: ["Jane Doe", "Blue"]}) + List IRIs of all samples: search(ts, type=CHAMEO.Sample) From d5cb0737d3b43975801052cbc56cfa2d77846a3a Mon Sep 17 00:00:00 2001 From: "Francesca.L.Bleken@sintef.no" Date: Wed, 17 Sep 2025 16:11:43 +0200 Subject: [PATCH 03/19] Only None accepted as wildcard in search. 
--- tests/datadoc/test_dataset.py | 12 ------------ tripper/datadoc/dataset.py | 10 +++++----- 2 files changed, 5 insertions(+), 17 deletions(-) diff --git a/tests/datadoc/test_dataset.py b/tests/datadoc/test_dataset.py index 12720903..9f487fdc 100644 --- a/tests/datadoc/test_dataset.py +++ b/tests/datadoc/test_dataset.py @@ -538,18 +538,6 @@ def test_datadoc(): SEMDATA["SEM_cement_batch2/77600-23-001/77600-23-001_5kV_400x_m001"], } - # Filter on criterion, but without required value - assert set( - search( - ts, - criteria={"creator.name": ""}, - ) - ) == { - SEMDATA["SEM_cement_batch2/77600-23-001/77600-23-001_5kV_400x_m001"], - SEMDATA["SEM_cement_batch2/77600-23-001"], - SEMDATA["SEM_cement_batch2"], - } - # Filter on criterion, but without required value assert set( search( diff --git a/tripper/datadoc/dataset.py b/tripper/datadoc/dataset.py index b91e2b77..053147dc 100644 --- a/tripper/datadoc/dataset.py +++ b/tripper/datadoc/dataset.py @@ -1406,7 +1406,7 @@ def _to_value_token(x): ) else: value = v - if value not in ["", None, '""']: + if value: if regex: filters.append( f"FILTER REGEX(STR(?{var}), {value}{flags_arg}) ." @@ -1448,7 +1448,7 @@ def _to_value_token(x): var = f"v{n}" crit.append(f"?{s} <{ts.expand_iri(key)}> ?{var} .") - if value not in ["", None, '""']: + if value: if regex: filters.append( @@ -1504,10 +1504,10 @@ def search( is in `ts`, it is expanded and the match criteria `dcterms:title` is correctly parsed. - If the object (value) is given as None or "", all matches - that have any value for the given criterion are returned. + If the object (value) is given as None, all matches + that have any value for the given predicate are returned. - If predicate (key) is given as None search on all objects irrespective + If predicate (key) is given as None, search on all objects irrespective of predicate is performed. 
Note that more than one value broadens the From 609acb4f40e2b9f802e34a35992565e4d584dea6 Mon Sep 17 00:00:00 2001 From: "Francesca.L.Bleken@sintef.no" Date: Thu, 18 Sep 2025 09:13:10 +0200 Subject: [PATCH 04/19] Added option for adding critiera as tuples --- tests/datadoc/test_dataset.py | 23 ++++++++++++++++ tripper/datadoc/dataset.py | 52 ++++++++++++++++++++++++++--------- 2 files changed, 62 insertions(+), 13 deletions(-) diff --git a/tests/datadoc/test_dataset.py b/tests/datadoc/test_dataset.py index 9f487fdc..d49d1c92 100644 --- a/tests/datadoc/test_dataset.py +++ b/tests/datadoc/test_dataset.py @@ -585,6 +585,29 @@ def test_datadoc(): SEMDATA["SEM_cement_batch2/77600-23-001"], } + # Filter on two different criteria in a dict) + assert set( + search( + ts, + criteria={"creator.name": "Sigurd Wenner", "label": "testlabel"}, + ) + ) == { + SEMDATA["SEM_cement_batch2"], + } + + # Filter on two different criteria in a list of tuples + assert set( + search( + ts, + criteria=[ + ("creator.name", "Sigurd Wenner"), + ("label", "testlabel"), + ], + ) + ) == { + SEMDATA["SEM_cement_batch2"], + } + with pytest.raises(NoSuchTypeError): search(ts, type="invalid-type") diff --git a/tripper/datadoc/dataset.py b/tripper/datadoc/dataset.py index 053147dc..8dd4e09a 100644 --- a/tripper/datadoc/dataset.py +++ b/tripper/datadoc/dataset.py @@ -31,12 +31,14 @@ from __future__ import annotations -# pylint: disable=invalid-name,redefined-builtin,import-outside-toplevel -# pylint: disable=too-many-branches import json import logging import re import warnings + +# pylint: disable=invalid-name,redefined-builtin,import-outside-toplevel +# pylint: disable=too-many-branches +from itertools import groupby from pathlib import Path from typing import TYPE_CHECKING @@ -1272,7 +1274,7 @@ def make_query( ts: Triplestore, type=None, criterias: "Optional[dict]" = None, # deprecated - criteria: "Optional[dict]" = None, # new preferred name + criteria: "Optional[Union[dict, list[tuple]]]" = None, # 
new preferred name regex: "Optional[dict]" = None, flags: "Optional[str]" = None, keywords: "Optional[Keywords]" = None, @@ -1297,6 +1299,14 @@ def make_query( if criteria is None: criteria = criterias + if isinstance(criteria, list): + criteria.sort(key=lambda x: x[0]) + res = { + key: [value for key, value in group] + for key, group in groupby(criteria, key=lambda x: x[0]) + } + criteria = res + keywords = get_keywords(keywords=keywords) context = get_context(keywords=keywords) context._create_caches() # pylint: disable=protected-access @@ -1317,7 +1327,7 @@ def make_query( cid = criteria.pop("@id", criteria.pop("_id", None)) rid = regex.pop("@id", regex.pop("_id", None)) if cid: - filters.append(f'FILTER(STR(?iri) = "{ts.expand_iri(cid)}") .') + filters.append(f'FILTER(STR(?iri) = "{ts.expand_iri(cid)}") .') # type: ignore elif rid: filters.append( f'FILTER REGEX(STR(?iri), "{ts.expand_iri(rid)}"{flags_arg}) .' @@ -1483,7 +1493,7 @@ def search( ts: Triplestore, type=None, criterias: "Optional[dict]" = None, # deprecated - criteria: "Optional[dict]" = None, # new preferred name + criteria: "Optional[Union[list[tuple], dict]]" = None, # new preferred name regex: "Optional[dict]" = None, flags: "Optional[str]" = None, keywords: "Optional[Keywords]" = None, @@ -1499,8 +1509,10 @@ def search( criteria: Exact match criteria. A dict of IRI, value pairs, where the IRIs refer to data properties on the resource match. If more than one value is desired for a given criterion, values can be provided - in a list. The IRIs - may use any prefix defined in `ts`. E.g. if the prefix `dcterms` + in a list. It can also be given as a list of (key, value) tuples. + A combination of tuples and dict is not supported. + + The IRIs may use any prefix defined in `ts`. E.g. if the prefix `dcterms` is in `ts`, it is expanded and the match criteria `dcterms:title` is correctly parsed.
@@ -1510,13 +1522,16 @@ def search( If predicate (key) is given as None, search on all objects irrespective of predicate is performed. - Note that more than one value broadens the + Note that more than one value for a given key broadens the search, i.e. it is an OR operation. - regex: Like `criteria` but the values in the provided dict are regular - expressions used for the matching. - flags: Flags passed to regular expressions. - - `s`: Dot-all mode. The . matches any character. The default - doesn't match newline or carriage return. + + The different key-value pairs in the dict are combined with AND. + + regex: Like `criteria` but the values in the provided dict are regular + expressions used for the matching. + flags: Flags passed to regular expressions. + - `s`: Dot-all mode. The . matches any character. The default + doesn't match newline or carriage return. - `m`: Multi-line mode. The ^ and $ characters matches beginning or end of line instead of beginning or end of string. - `i`: Case-insensitive mode. 
@@ -1549,6 +1564,15 @@ def search( search(ts, criteria={None: ["Jane Doe", "Blue"]}) + Search with criteria given as list of tuples: + search( + ts, + criteria=[ + ("contactPoint.hasName", "John Doe"), + ("fromSample", SAMPLE["batch2/sample3"]), + ], + ) + List IRIs of all samples: search(ts, type=CHAMEO.Sample) @@ -1556,6 +1580,8 @@ def search( List IRIs of all samples that are liquids: search(ts, type=[CHAMEO.Sample, EMMO.Liquid] ) + + List IRIs of all datasets with John Doe as `contactPoint` AND are measured on a given sample: From 3526b231245bfd6a3522e00e307860309b0ed224 Mon Sep 17 00:00:00 2001 From: "Francesca.L.Bleken@sintef.no" Date: Thu, 18 Sep 2025 15:55:38 +0200 Subject: [PATCH 05/19] Removed helper function which is not used --- tripper/datadoc/dataset.py | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/tripper/datadoc/dataset.py b/tripper/datadoc/dataset.py index 8dd4e09a..72cb3f5d 100644 --- a/tripper/datadoc/dataset.py +++ b/tripper/datadoc/dataset.py @@ -1351,18 +1351,6 @@ def add_crit(k, v, regex=False, s="iri"): """Add criteria to SPARQL query.""" nonlocal n - def _to_value_token(x): - # Turn a Python value into a SPARQL term - if x in expanded: - return f"<{expanded[x]}>" - if isinstance(x, str): - return ( - f"<{x}>" - if re.match("^[a-z][a-z0-9.+-]*://", x) - else f'"{x}"' - ) - return x - key = None if k is None else (f"@{k[1:]}" if k.startswith("_") else k) if key is None: From 3a1813caa47d4f4a7a12c0fdf418d38834de6d46 Mon Sep 17 00:00:00 2001 From: "Francesca.L.Bleken@sintef.no" Date: Fri, 26 Sep 2025 16:05:52 +0200 Subject: [PATCH 06/19] Corrected sorting of tuples with None --- tests/datadoc/test_dataset.py | 14 ++++++++++++++ tripper/datadoc/dataset.py | 3 ++- 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/tests/datadoc/test_dataset.py b/tests/datadoc/test_dataset.py index d49d1c92..0daf73f7 100644 --- a/tests/datadoc/test_dataset.py +++ b/tests/datadoc/test_dataset.py @@ -608,6 +608,20 @@ def test_datadoc():
SEMDATA["SEM_cement_batch2"], } + assert set( + search( + ts, + criteria=[ + (None, "Sigurd Wenner"), + (None, "testlabel"), + ], + ) + ) == { + SEMDATA["SEM_cement_batch2"], + SEMDATA["SEM_cement_batch2/77600-23-001"], + SEMDATA["SEM_cement_batch2/77600-23-001/77600-23-001_5kV_400x_m001"], + } + with pytest.raises(NoSuchTypeError): search(ts, type="invalid-type") diff --git a/tripper/datadoc/dataset.py b/tripper/datadoc/dataset.py index 72cb3f5d..b86cc848 100644 --- a/tripper/datadoc/dataset.py +++ b/tripper/datadoc/dataset.py @@ -1300,7 +1300,8 @@ def make_query( criteria = criterias if isinstance(criteria, list): - criteria.sort(key=lambda x: x[0]) + criteria = sorted(criteria, key=lambda x: (x[0] is None, x[0])) + res = { key: [value for key, value in group] for key, group in groupby(criteria, key=lambda x: x[0]) From a985c815d7404d25d7354ad4b55d8a632f70c8ee Mon Sep 17 00:00:00 2001 From: "Francesca.L.Bleken@sintef.no" Date: Fri, 26 Sep 2025 19:49:16 +0200 Subject: [PATCH 07/19] Added test that shows errors in dataset --- tests/datadoc/test_dataset.py | 120 +++++++++++++++++++++++++++------- 1 file changed, 96 insertions(+), 24 deletions(-) diff --git a/tests/datadoc/test_dataset.py b/tests/datadoc/test_dataset.py index 0daf73f7..dd02e8b5 100644 --- a/tests/datadoc/test_dataset.py +++ b/tests/datadoc/test_dataset.py @@ -8,6 +8,9 @@ pytest.importorskip("requests") pytest.importorskip("pyld") +GRAPHDB_CHECK_URL = "http://localhost:7200/repositories" +FUSEKI_CHECK_URL = "http://localhost:3030" + def test__get_range(): """Test _get_default_keywords().""" @@ -402,20 +405,20 @@ def test_update_classes(): } in r3["subClassOf"] -def test_datadoc(): +def datasettest(name): """Test save_datadoc() and acquire()/store().""" # pylint: disable=too-many-statements from dataset_paths import indir # pylint: disable=import-error - from tripper import CHAMEO, DCAT, DCTERMS, EMMO, OTEIO, Triplestore + from tripper import CHAMEO, DCAT, DCTERMS, EMMO, OTEIO from tripper.datadoc 
import acquire, save_datadoc, search, store from tripper.datadoc.errors import NoSuchTypeError pytest.importorskip("dlite") pytest.importorskip("rdflib") - ts = Triplestore("rdflib") + ts = get_triplestore(name) # Load data documentation into triplestore datadoc = save_datadoc(ts, indir / "semdata.yaml") @@ -427,6 +430,8 @@ def test_datadoc(): SEMDATA = ts.namespaces["semdata"] iri = SEMDATA["SEM_cement_batch2/77600-23-001/77600-23-001_5kV_400x_m001"] d = acquire(ts, iri, use_sparql=False) + print("----") + print(d) assert d["@id"] == iri assert set(d["@type"]) == { DCAT.Dataset, @@ -622,6 +627,31 @@ def test_datadoc(): SEMDATA["SEM_cement_batch2/77600-23-001/77600-23-001_5kV_400x_m001"], } + assert set( + search( + ts, + criteria=[ + (None, "http://onto-ns.com/meta/matchmaker/0.2/SEMImage"), + ], + ) + ) == { + SEMDATA["SEM_cement_batch2/77600-23-001/77600-23-001_5kV_400x_m001"], + } + + assert set( + search( + ts, + criteria=[ + ( + "https://w3id.org/emmo/domain/oteio#hasDatamodel", + "http://onto-ns.com/meta/matchmaker/0.2/SEMImage", + ), + ], + ) + ) == { + SEMDATA["SEM_cement_batch2/77600-23-001/77600-23-001_5kV_400x_m001"], + } + with pytest.raises(NoSuchTypeError): search(ts, type="invalid-type") @@ -746,27 +776,6 @@ def test_pipeline(): pipeline.get() -def test_fuseki(): - """Test save and load dataset with Fuseki.""" - import os - - from tripper import Triplestore - - host = os.getenv("TRIPLESTORE_HOST", "localhost") - port = os.getenv("TRIPLESTORE_PORT", "3030") - fuseki_args = { - "backend": "fusekix", - "base_iri": "http://example.com/ontology#", - "triplestore_url": f"http://{host}:{port}", - "database": "openmodel", - } - try: - ts = Triplestore(**fuseki_args) - except ModuleNotFoundError: - pytest.skip("Cannot connect to Fuseki server") - ts.remove_database(**fuseki_args) - - def test_deprecated(): """Test deprecated save_dict(), load_dict() and search_iris().""" from tripper import Triplestore @@ -799,3 +808,66 @@ def test_deprecated(): with 
pytest.warns(DeprecationWarning): iris = search_iris(ts, criterias={"creator.name": "John Doe"}) assert iris == [EX.exdata] + + +def get_triplestore(tsname: str) -> "Triplestore": + """Help function that returns a new triplestore object.""" + from tripper import Triplestore + + if tsname == "GraphDB": + ts = Triplestore( + backend="sparqlwrapper", + base_iri="http://localhost:7200/repositories/test_repo", + update_iri=( + "http://localhost:7200/repositories/test_repo/statements" + ), + ) + elif tsname == "Fuseki": + ts = Triplestore( + backend="sparqlwrapper", + base_iri=f"{FUSEKI_CHECK_URL}/test_repo", + update_iri=f"{FUSEKI_CHECK_URL}/test_repo/update", + username="admin", + password="admin0", + ) + elif tsname == "rdflib": + ts = Triplestore("rdflib") + else: + raise ValueError(f"Unsupported triplestore name: {tsname}") + + return ts + + +def test_graphdb_datadoc(): + """ + Test the dataset module using GraphDB. + """ + # Check if GraphDB is available and write a warning if it is not. + from tripper.utils import check_service_availability + + if not check_service_availability(GRAPHDB_CHECK_URL, timeout=1): + pytest.skip("GraphDB instance not available locally; skipping tests.") + + print("Testing graphdb") + datasettest("GraphDB") + + +def test_fuseki_datadoc(): + """ + Test the dataset module using Fuseki. + """ + # Check if Fuseki is available and write a warning if it is not. + from tripper.utils import check_service_availability + + if not check_service_availability(FUSEKI_CHECK_URL, timeout=1): + pytest.skip("Fuseki instance not available locally; skipping tests.") + + print("Testing fuseki") + datasettest("Fuseki") + + +def test_rdflib_datadoc(): + """ + Test the dataset module using rdflib. 
+ """ + datasettest("rdflib") From e8c6e7602e2fd0c018ed42052a6fa09f02db4a5b Mon Sep 17 00:00:00 2001 From: Jesper Friis Date: Sun, 28 Sep 2025 17:04:14 +0200 Subject: [PATCH 08/19] Added substitute_query() function --- tests/test_utils.py | 33 +++++++++++++++++++ tripper/triplestore.py | 73 ++++++++++++++++++++++++++++++++++++++---- tripper/utils.py | 50 +++++++++++++++++++++++++++++ 3 files changed, 150 insertions(+), 6 deletions(-) diff --git a/tests/test_utils.py b/tests/test_utils.py index 3ff0772e..c14b452a 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -525,6 +525,39 @@ def test_prefix_iri(): prefix_iri("xxx", prefixes, require_prefixed=True) +def test_substitute_query(): + """Test substitute_query().""" + from tripper import FOAF + from tripper.utils import substitute_query + + assert ( + substitute_query( + query="SELECT ?s WHERE { ?s $name $obj }", + iris={"name": "foaf:name"}, + literals={"obj": "John Dow"}, + prefixes={"foaf": str(FOAF)}, + ) + == f'SELECT ?s WHERE {{ ?s <{FOAF.name}> "John Dow" }}' + ) + + assert ( + substitute_query( + query="SELECT ?s WHERE { ?s $name $obj }", + iris={ + "name": ( + 'http://xmlns.com/foaf/0.1/name> "x" . ' + " <" + ) + }, + literals={"obj": 'John Dow" . "'}, + ) + ) == ( + "SELECT ?s WHERE { ?s " + r' "John Dow\" . 
\"" }' + ) + + def test_get_entry_points(): """Test get_entry_points()""" from tripper.utils import get_entry_points diff --git a/tripper/triplestore.py b/tripper/triplestore.py index 9d13968f..6e9317ab 100644 --- a/tripper/triplestore.py +++ b/tripper/triplestore.py @@ -54,6 +54,7 @@ infer_iri, prefix_iri, split_iri, + substitute_query, ) if TYPE_CHECKING: # pragma: no cover @@ -412,11 +413,32 @@ def serialize( ts.bind(prefix, iri) return ts.serialize(destination=destination, format=format, **kwargs) - def query(self, query_object, **kwargs) -> "Any": + def query( + self, + query: str, + iris: "Optional[dict]" = None, + literals: "Optional[dict]" = None, + **kwargs, + ) -> "Any": """SPARQL query. + The `query` argument may contain variables for IRIs and literals, + to be substituted using the `iris` and `literals` arguments. These + variables are prefixed `$`. This makes them easy to distinguish from + query variables, that are typically prefixed with `?`. + + The query substitutions may be useful when the query is constructed + from user input, since they are properly escaped and will be inserted + in the query as a single token. This may prevent sparql injection + attacks. + Arguments: - query_object: String with the SPARQL query. + query: String with the SPARQL query. + iris: Dict used for query substitutions that maps IRI variables + to IRIs. The IRIs may be provided as fully expanded or + prefixed with a prefix registered in the triplestore namespace. + literals: Dict used for query substitutions that maps literal + variables to literals. kwargs: Keyword arguments passed to the backend query() method. Returns: @@ -432,24 +454,63 @@ def query(self, query_object, **kwargs) -> "Any": Not all backends may support all types of queries. 
+ Examples: + Query for everyone with the name "John Dow": + + >>> from tripper import FOAF, Literal, Triplestore + >>> ts = Triplestore(backend="rdflib") + >>> ts.bind("foaf", FOAF) + Namespace('http://xmlns.com/foaf/0.1/') + + >>> ts.add_triples([ + ... (":john", FOAF.name, Literal("John Dow")), + ... (":jack", FOAF.name, Literal("Jack Hudson")), + ... ]) + >>> ts.query( + ... "SELECT ?s WHERE { ?s $name $obj .}", + ... iris={"name": "foaf:name"}, + ... literals={"obj": "John Dow"}, + ... ) + [(':john',)] + """ self._check_method("query") - return self.backend.query(query_object=query_object, **kwargs) + new_query = substitute_query( + query, iris=iris, literals=literals, prefixes=self.namespaces + ) + return self.backend.query(new_query, **kwargs) - def update(self, update_object, **kwargs) -> None: + def update( + self, + query: str, + iris: "Optional[dict]" = None, + literals: "Optional[dict]" = None, + **kwargs, + ) -> None: """Update triplestore with SPARQL. Arguments: - update_object: String with the SPARQL query. + query: String with the SPARQL query. + iris: Dict used for query substitutions that maps IRI variables + to IRIs. The IRIs may be provided as fully expanded or + prefixed with a prefix registered in the triplestore namespace. + literals: Dict used for query substitutions that maps literal + variables to literals. kwargs: Keyword arguments passed to the backend update() method. Note: + See `query()` for how to use the query substitution arguments `iris` + and `literals`. + This method is intended for INSERT and DELETE queries. Use the query() method for SELECT, ASK, CONSTRUCT and DESCRIBE queries.
""" self._check_method("update") - return self.backend.update(update_object=update_object, **kwargs) + new_query = substitute_query( + query, iris=iris, literals=literals, prefixes=self.namespaces + ) + return self.backend.update(update_object=new_query, **kwargs) @overload def bind( diff --git a/tripper/utils.py b/tripper/utils.py index b1ac83f3..5decfc69 100644 --- a/tripper/utils.py +++ b/tripper/utils.py @@ -64,7 +64,9 @@ "extend_namespace", "expand_iri", "prefix_iri", + "substitute_query", "get_entry_points", + "check_service_availability", ) MATCH_PREFIXED_IRI = re.compile( @@ -732,6 +734,54 @@ def prefix_iri( return iri +def substitute_query( + query: str, + iris: "Optional[dict]" = None, + literals: "Optional[dict]" = None, + prefixes: "Optional[dict]" = None, +) -> "Any": + """Substitute IRI and literal variables in a SPARQL query. + + Arguments: + query: String with the SPARQL query. + iris: Dict used for query substitutions that maps IRI variables + to IRIs. The IRIs may be provided as fully expanded or + prefixed with the prefix defined in `prefixes`. + literals: Dict used for query substitutions that maps literal + variables to literals. For common datatypes, like strings + and numbers, the values can just be normal Python objects. + For special cases or more control, provide the values as + instances of `tripper.Literal`. + prefixes: Dict mapping prefixes to namespace URLs. + + Notes: + The `query` argument may contain variables for IRIs and literals, + to be substituted using the `iris` and `literals` arguments. These + variables are prefixed `$`. This makes them easy to distinguish from + query variables, that are typically prefixed with `?`. + + The query substitutions may be useful when the query is constructed + from user input, since they are properly escaped and will be inserted + in the query as a single token. This may prevent sparql injection + attacks. 
+ """ + mapping = {} + + if iris: + if prefixes is None: + prefixes = {} + for k, v in iris.items(): + expanded = expand_iri(v, prefixes=prefixes) + quoted = urllib.parse.quote(expanded, safe=":~/?&;=#") + mapping[k] = f"<{quoted}>" + + if literals: + for k, v in literals.items(): + mapping[k] = Literal(v).n3() + + return string.Template(query).safe_substitute(mapping) + + def get_entry_points(group: str): """Consistent interface to entry points for the given group. From b5347682012513c2113aea777d7544018ec3f037 Mon Sep 17 00:00:00 2001 From: Jesper Friis Date: Sun, 28 Sep 2025 17:19:18 +0200 Subject: [PATCH 09/19] Update tripper/triplestore.py --- tripper/triplestore.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tripper/triplestore.py b/tripper/triplestore.py index 6e9317ab..675bfc9f 100644 --- a/tripper/triplestore.py +++ b/tripper/triplestore.py @@ -510,7 +510,7 @@ def update( new_query = substitute_query( query, iris=iris, literals=literals, prefixes=self.namespaces ) - return self.backend.update(update_object=new_query, **kwargs) + return self.backend.update(new_query, **kwargs) @overload def bind( From 3957b8f5822b62193c54c10f2434bb21475ef6e6 Mon Sep 17 00:00:00 2001 From: Jesper Friis Date: Sun, 28 Sep 2025 19:11:42 +0200 Subject: [PATCH 10/19] Improved warning message --- tripper/datadoc/dataset.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tripper/datadoc/dataset.py b/tripper/datadoc/dataset.py index ddec8ef6..0de6e3bc 100644 --- a/tripper/datadoc/dataset.py +++ b/tripper/datadoc/dataset.py @@ -1203,7 +1203,10 @@ def make_query( if criterias is not None: warnings.warn( - "`criterias` is deprecated, use `criteria` instead", + ( + "`criterias` argument to make_query() is deprecated, use " + "the `criteria` instead" + ), category=DeprecationWarning, stacklevel=2, ) From 77f5b649375fa9015d8a89a91f569122c95ddb19 Mon Sep 17 00:00:00 2001 From: Jesper Friis Date: Sun, 28 Sep 2025 21:14:23 +0200 Subject: [PATCH 
11/19] Added available() method and `check_url` to argument to Triplestore --- docs/session.md | 23 ++++-- .../test_sparqlwrapper_graphdb_fuseki.py | 77 +++++-------------- tests/input/session.yaml | 2 + tripper/triplestore.py | 29 +++++++ 4 files changed, 69 insertions(+), 62 deletions(-) diff --git a/docs/session.md b/docs/session.md index d1dc168b..00332fe3 100644 --- a/docs/session.md +++ b/docs/session.md @@ -11,18 +11,28 @@ The default location of this configuration file depends on the system: - Windows: `$HOME/AppData/Local/tripper/Config/session.yaml` - Darwin: `$HOME/Library/Config/tripper/session.yaml` -Add some default +The schema of the YAML file is simple. +A session should have a name that identifies it and should be followed by keyword arguments accepted by the `Triplestore` constructor. + +Here is an example of a possible session file: ``` +--- + +RdflibTest: + backend: rdflib + GraphDBTest: backend: sparqlwrapper base_iri: http://localhost:7200/repositories/test_repo update_iri: http://localhost:7200/repositories/test_repo/statements + check_url: http://localhost:7200/repositories FusekiTest: backend: sparqlwrapper base_iri: http://localhost:3030/test_repo update_iri: http://localhost:3030/test_repo/update + check_url: http://localhost:3030 username: admin password: admin0 @@ -30,15 +40,18 @@ MyKB: backend: sparqlwrapper base_iri: https://graphdb.myproject.eu/repositories/test_repo update_iri: https://graphdb.myproject.eu/repositories/test_repo/statements + check_url: https://graphdb.myproject.eu/repositories username: myname password: KEYRING ``` -The two first entries correspond to the GraphDB and Fuseki services -that can be started with docker as described in the [developers] -section. +The first entry is an in-memory rdflib backend. + +The second and third entries correspond to GraphDB and Fuseki services, +respectively. +These can be started with docker as described in the [developers] section. 
-The third entry is just a dummy example, showing how to use [keyring]. +The fourth entry is just a dummy example, showing how to use [keyring]. Each entry starts with the name identifying the configured triplestore. The keywords following it, correspond to the keyword arguments passed to the diff --git a/tests/backends/test_sparqlwrapper_graphdb_fuseki.py b/tests/backends/test_sparqlwrapper_graphdb_fuseki.py index 9d2b3dcc..46391250 100644 --- a/tests/backends/test_sparqlwrapper_graphdb_fuseki.py +++ b/tests/backends/test_sparqlwrapper_graphdb_fuseki.py @@ -5,57 +5,36 @@ https://emmc-asbl.github.io/tripper/latest/developers/. """ +from pathlib import Path + import pytest +from tripper import Session + pytest.importorskip("pyld") -# URL to check if GraphDB is running. -GRAPHDB_CHECK_URL = "http://localhost:7200/repositories" -FUSEKI_CHECK_URL = "http://localhost:3030" - - -def get_triplestore(tsname: str) -> "Triplestore": - """Help function that returns a new triplestore object.""" - from tripper import Triplestore - - if tsname == "GraphDB": - ts = Triplestore( - backend="sparqlwrapper", - base_iri="http://localhost:7200/repositories/test_repo", - update_iri=( - "http://localhost:7200/repositories/test_repo/statements" - ), - ) - elif tsname == "Fuseki": - ts = Triplestore( - backend="sparqlwrapper", - base_iri=f"{FUSEKI_CHECK_URL}/test_repo", - update_iri=f"{FUSEKI_CHECK_URL}/test_repo/update", - username="admin", - password="admin0", - ) - else: - raise ValueError(f"Unsupported triplestore name: {tsname}") - - return ts +thisdir = Path(__file__).resolve().parent +indir = thisdir.parent / "input" + +session = Session(config=indir / "session.yaml") # if True: -# tsname = "Fuseki" -def populate_and_search(tsname): # pylint: disable=too-many-statements +# sessionName = "FusekiTest" +def populate_and_search(sessionName): # pylint: disable=too-many-statements """Do the test on the desried backend.""" # pylint: disable=too-many-locals - from pathlib import Path - from 
tripper import Literal from tripper.datadoc import acquire, save_datadoc, search - thisdir = Path(__file__).resolve().parent + ts = session.get_triplestore(sessionName) + if not ts.available(timeout=1): + pytest.skip(f"{sessionName} service not available; skipping test.") + datasetinput = thisdir / "datadocumentation_sample.yaml" datasetinput2 = thisdir / "datadocumentation_sample2.yaml" - ts = get_triplestore(tsname) EX = ts.bind("ex", "http://www.example.org/") # Test DELETE query - clear the triplestore @@ -194,28 +173,12 @@ def populate_and_search(tsname): # pylint: disable=too-many-statements def test_graphdb(): - """ - Test the sparqlwrapper backend using GraphDB. - """ - # Check if GraphDB is available and write a warning if it is not. - from tripper.utils import check_service_availability - - if not check_service_availability(GRAPHDB_CHECK_URL, timeout=1): - pytest.skip("GraphDB instance not available locally; skipping tests.") - - print("Testing graphdb") - populate_and_search("GraphDB") + """Test the sparqlwrapper backend using GraphDB.""" + # Use service configured in tests/input/session.yaml + populate_and_search("GraphDBTest") def test_fuseki(): - """ - Test the sparqlwrapper backend using Fuseki. - """ - # Check if Fuseki is available and write a warning if it is not. 
- from tripper.utils import check_service_availability - - if not check_service_availability(FUSEKI_CHECK_URL, timeout=1): - pytest.skip("Fuseki instance not available locally; skipping tests.") - - print("Testing fuseki") - populate_and_search("Fuseki") + """Test the sparqlwrapper backend using Fuseki.""" + # Use service configured in tests/input/session.yaml + populate_and_search("FusekiTest") diff --git a/tests/input/session.yaml b/tests/input/session.yaml index d2a86d6e..782a97d8 100644 --- a/tests/input/session.yaml +++ b/tests/input/session.yaml @@ -5,6 +5,7 @@ FusekiTest: backend: sparqlwrapper base_iri: http://localhost:3030/test_repo update_iri: http://localhost:3030/test_repo/update + check_url: http://localhost:3030 username: admin password: admin0 @@ -12,3 +13,4 @@ GraphDBTest: backend: sparqlwrapper base_iri: http://localhost:7200/repositories/test_repo update_iri: http://localhost:7200/repositories/test_repo/statements + check_url: http://localhost:7200/repositories diff --git a/tripper/triplestore.py b/tripper/triplestore.py index 9d13968f..25c83c2a 100644 --- a/tripper/triplestore.py +++ b/tripper/triplestore.py @@ -47,6 +47,7 @@ ) from tripper.utils import ( bnode_iri, + check_service_availability, en, expand_iri, function_id, @@ -134,6 +135,7 @@ def __init__( base_iri: "Optional[str]" = None, database: "Optional[str]" = None, package: "Optional[str]" = None, + check_url: "Optional[str]" = None, **kwargs, ) -> None: """Initialise triplestore using the backend with the given name. @@ -159,6 +161,7 @@ def __init__( supports it). package: Required when `backend` is a relative module. In that case, it is relative to `package`. + check_url: A URL to use for checking that the backend is available. kwargs: Keyword arguments passed to the backend's __init__() method. @@ -170,6 +173,7 @@ def __init__( namespaces: Dict mapping namespace prefixes to IRIs. package: Name of Python package if the backend is implemented as a relative module. 
Assigned to the `package` argument. + check_url: The value of the `check_url` argument. Notes: If the backend establishes a connection that should be closed @@ -192,6 +196,7 @@ def __init__( self.backend_name = backend_name self.database = database self.package = package + self.check_url = check_url self.kwargs = kwargs.copy() self.backend = cls(base_iri=base_iri, database=database, **kwargs) @@ -1002,6 +1007,30 @@ def _get_restriction_dict(self, iri): "value": dct[p], } + def available(self, timeout=5, interval=1) -> bool: + """Checks if the backend is available. + + This is done by sending a request is send to the URL specified + in the `check_url` attribute and checking for the response. + + Arguments: + url: URL of the service to check. + timeout: Total time in seconds to wait for a respond. + interval: Interval for checking response. + + Returns: + Returns true if the service responds with code 200, + otherwise false is returned. + + """ + if self.check_url is None: + raise ValueError( + "`check_url` must be assigned before calling available()" + ) + return check_service_availability( + self.check_url, timeout=timeout, interval=interval + ) + def map( self, source: str, From bbcc3779e4721766a611c8582aa29783f8a31a72 Mon Sep 17 00:00:00 2001 From: Jesper Friis Date: Sun, 28 Sep 2025 21:23:14 +0200 Subject: [PATCH 12/19] Added documentation to session.yaml --- tests/input/session.yaml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tests/input/session.yaml b/tests/input/session.yaml index 782a97d8..21b52d5d 100644 --- a/tests/input/session.yaml +++ b/tests/input/session.yaml @@ -1,3 +1,9 @@ +# Default sessions used for testing +# +# See https://emmc-asbl.github.io/tripper/latest/developers/ for how to set +# up local instances of GraphDB and Fuseki corresponding to the settings below. 
+ + RdflibTest: backend: rdflib From 6e7d7677e749a6cdade2f83dcd8029b7418d1cbe Mon Sep 17 00:00:00 2001 From: Jesper Friis Date: Sun, 28 Sep 2025 21:36:49 +0200 Subject: [PATCH 13/19] Updated documentation --- docs/developers.md | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/docs/developers.md b/docs/developers.md index 39367104..23e879c8 100644 --- a/docs/developers.md +++ b/docs/developers.md @@ -4,12 +4,19 @@ See [interface.py], which defines the interface of a backend and may serve as a template for creating new backends. -### Developing the sparqlwrapper +## Setting up local GraphDB and Fuseki services for testing Tripper comes with an inbuilt backend to the SPARQLWrapper. In order to test this properly a real triplestore is needed. This is not done in the -automatic workflows on github. However, a local graphDB can be setup as described below and tested with test_sparqlwrapper_graphdb.py. +automatic workflows on github. However, local graphDB and Fuseki services +can be set up as described below and tested with +`tests/backends/test_sparqlwrapper_graphdb_fuseki.py`. +The backend configurations corresponding to the local GraphDB and Fuseki services +can be found in [tests/input/session.yaml]. + + +### Setting up GraphDB service To create the local instance of graphdb: ```bash docker pull ontotext/graphdb:10.8.3 # latest tag 17.02.2025 @@ -30,6 +37,7 @@ You can now run the test test_sparqlwrapper_graphdb_fuseki.py with graphdb. Note that if the graphdb instance is not found the test will just be skipped. +### Setting up Fuseki service Similarly a jena-fuseki instance can be tested locally as follows: ```bash @@ -37,7 +45,7 @@ docker pull stain/jena-fuseki docker run -d --name fuseki -p 3030:3030 -e ADMIN_PASSWORD=admin0 -e=FUSEKI_DATASET_1=test_repo stain/jena-fuseki ``` -You can now run the test test_sparqlwrapper_graphdb_fuseki.py with fuseki. +You can now run the test `test_sparqlwrapper_graphdb_fuseki.py` with fuseki. 
Note that if the fuseki instance is not found the test will just be skipped. @@ -75,3 +83,4 @@ Then open http://127.0.0.1:8000/tripper/ in your browser. [interface.py]: https://github.com/EMMC-ASBL/tripper/blob/master/tripper/interface.py [mkdocs]: https://www.mkdocs.org/ +[tests/input/session.yaml]: https://github.com/EMMC-ASBL/tripper/blob/master/tests/input/session.yaml From cf3c77d917b7eb700a51a6cd43e8697c4965c972 Mon Sep 17 00:00:00 2001 From: Jesper Friis Date: Sun, 28 Sep 2025 22:02:02 +0200 Subject: [PATCH 14/19] Added argument annotations --- tests/backends/test_sparqlwrapper_graphdb_fuseki.py | 1 + tripper/triplestore.py | 2 +- tripper/utils.py | 4 +++- 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/tests/backends/test_sparqlwrapper_graphdb_fuseki.py b/tests/backends/test_sparqlwrapper_graphdb_fuseki.py index 46391250..4d565b8a 100644 --- a/tests/backends/test_sparqlwrapper_graphdb_fuseki.py +++ b/tests/backends/test_sparqlwrapper_graphdb_fuseki.py @@ -20,6 +20,7 @@ # if True: +# sessionName = "GraphDBTest" # sessionName = "FusekiTest" def populate_and_search(sessionName): # pylint: disable=too-many-statements """Do the test on the desried backend.""" diff --git a/tripper/triplestore.py b/tripper/triplestore.py index 25c83c2a..82ae937f 100644 --- a/tripper/triplestore.py +++ b/tripper/triplestore.py @@ -1007,7 +1007,7 @@ def _get_restriction_dict(self, iri): "value": dct[p], } - def available(self, timeout=5, interval=1) -> bool: + def available(self, timeout: float = 5, interval: float = 1) -> bool: """Checks if the backend is available. 
This is done by sending a request is send to the URL specified diff --git a/tripper/utils.py b/tripper/utils.py index b1ac83f3..60d0833b 100644 --- a/tripper/utils.py +++ b/tripper/utils.py @@ -766,7 +766,9 @@ def get_entry_points(group: str): return eps -def check_service_availability(url: str, timeout=5, interval=1) -> bool: +def check_service_availability( + url: str, timeout: float = 5, interval: float = 1 +) -> bool: """Check whether the service with given URL is available. Arguments: From f318c65e17b2b5ef7b6b36a4d226be619e18791a Mon Sep 17 00:00:00 2001 From: Jesper Friis Date: Sun, 28 Sep 2025 22:26:12 +0200 Subject: [PATCH 15/19] Use session to configure test backends --- .../test_sparqlwrapper_graphdb_fuseki.py | 2 +- tests/datadoc/dataset_paths.py | 4 ++ tests/datadoc/test_dataset.py | 54 +++++++------------ 3 files changed, 24 insertions(+), 36 deletions(-) diff --git a/tests/backends/test_sparqlwrapper_graphdb_fuseki.py b/tests/backends/test_sparqlwrapper_graphdb_fuseki.py index 4d565b8a..eac9f54e 100644 --- a/tests/backends/test_sparqlwrapper_graphdb_fuseki.py +++ b/tests/backends/test_sparqlwrapper_graphdb_fuseki.py @@ -30,7 +30,7 @@ def populate_and_search(sessionName): # pylint: disable=too-many-statements from tripper.datadoc import acquire, save_datadoc, search ts = session.get_triplestore(sessionName) - if not ts.available(timeout=1): + if ts.check_url and not ts.available(timeout=1): pytest.skip(f"{sessionName} service not available; skipping test.") datasetinput = thisdir / "datadocumentation_sample.yaml" diff --git a/tests/datadoc/dataset_paths.py b/tests/datadoc/dataset_paths.py index 32b5b092..f813098b 100644 --- a/tests/datadoc/dataset_paths.py +++ b/tests/datadoc/dataset_paths.py @@ -6,8 +6,12 @@ from pathlib import Path +from tripper import Session + testdir = Path(__file__).absolute().parent.parent.resolve() rootdir = testdir.parent.resolve() ontodir = testdir / "ontologies" indir = testdir / "input" outdir = testdir / "output" + 
+session = Session(config=indir / "session.yaml") diff --git a/tests/datadoc/test_dataset.py b/tests/datadoc/test_dataset.py index 56d5c730..997ee192 100644 --- a/tests/datadoc/test_dataset.py +++ b/tests/datadoc/test_dataset.py @@ -1,16 +1,12 @@ """Test the dataset module.""" # pylint: disable=invalid-name,too-many-locals,duplicate-code - import pytest pytest.importorskip("yaml") pytest.importorskip("requests") pytest.importorskip("pyld") -GRAPHDB_CHECK_URL = "http://localhost:7200/repositories" -FUSEKI_CHECK_URL = "http://localhost:3030" - def test__get_range(): """Test _get_default_keywords().""" @@ -359,11 +355,15 @@ def test_update_classes(): } in r3["subClassOf"] -def datasettest(name): + +#sessionName = "FusekiTest" +sessionName = "RdflibTest" +if True: +#def datasettest(sessionName): """Test save_datadoc() and acquire()/store().""" # pylint: disable=too-many-statements - from dataset_paths import indir # pylint: disable=import-error + from dataset_paths import indir, session # pylint: disable=import-error from tripper import CHAMEO, DCAT, DCTERMS, EMMO, OTEIO from tripper.datadoc import acquire, save_datadoc, search, store @@ -372,7 +372,9 @@ def datasettest(name): pytest.importorskip("dlite") pytest.importorskip("rdflib") - ts = get_triplestore(name) + ts = session.get_triplestore(sessionName) + if ts.check_url and not ts.available(timeout=1): + pytest.skip(f"{sessionName} service not available; skipping test.") # Load data documentation into triplestore datadoc = save_datadoc(ts, indir / "semdata.yaml") @@ -792,36 +794,18 @@ def get_triplestore(tsname: str) -> "Triplestore": return ts -def test_graphdb_datadoc(): - """ - Test the dataset module using GraphDB. - """ - # Check if GraphDB is available and write a warning if it is not. 
- from tripper.utils import check_service_availability - - if not check_service_availability(GRAPHDB_CHECK_URL, timeout=1): - pytest.skip("GraphDB instance not available locally; skipping tests.") - - print("Testing graphdb") - datasettest("GraphDB") +# Use service configured in tests/input/session.yaml +def test_rdflib_datadoc(): + """Test the dataset module using rdflib.""" + datasettest("RdflibTest") -def test_fuseki_datadoc(): - """ - Test the dataset module using Fuseki. - """ - # Check if Fuseki is available and write a warning if it is not. - from tripper.utils import check_service_availability - - if not check_service_availability(FUSEKI_CHECK_URL, timeout=1): - pytest.skip("Fuseki instance not available locally; skipping tests.") - print("Testing fuseki") - datasettest("Fuseki") +def test_graphdb_datadoc(): + """Test the dataset module using GraphDB.""" + datasettest("GraphDBTest") -def test_rdflib_datadoc(): - """ - Test the dataset module using rdflib. - """ - datasettest("rdflib") +def test_fuseki_datadoc(): + """Test the dataset module using Fuseki.""" + datasettest("FusekiTest") From 9dddfb3afdcd7f374d9e66f6c3411dad3431aae8 Mon Sep 17 00:00:00 2001 From: Jesper Friis Date: Sun, 28 Sep 2025 22:27:37 +0200 Subject: [PATCH 16/19] Remove documentation of non-existing argument --- tripper/triplestore.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tripper/triplestore.py b/tripper/triplestore.py index 82ae937f..b1de0154 100644 --- a/tripper/triplestore.py +++ b/tripper/triplestore.py @@ -1014,7 +1014,6 @@ def available(self, timeout: float = 5, interval: float = 1) -> bool: in the `check_url` attribute and checking for the response. Arguments: - url: URL of the service to check. timeout: Total time in seconds to wait for a respond. interval: Interval for checking response. 
From 36ca3f5662150f01dec90469c2fd08b4c9bbf61a Mon Sep 17 00:00:00 2001 From: Jesper Friis Date: Sun, 28 Sep 2025 22:38:51 +0200 Subject: [PATCH 17/19] Added test --- tests/test_triplestore.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/tests/test_triplestore.py b/tests/test_triplestore.py index a4ba5879..29108142 100644 --- a/tests/test_triplestore.py +++ b/tests/test_triplestore.py @@ -274,6 +274,18 @@ def test_restriction() -> None: # pylint: disable=too-many-statements ] +def test_availability(): + """Test availability().""" + # Already tested in backends/test_sparqlwrapper_graphdb_fuseki.py + # Just add test for missing `check_url` + pytest.importorskip("rdflib") + from tripper.triplestore import Triplestore + + ts = Triplestore("rdflib") + with pytest.raises(ValueError): + ts.available() + + def test_backend_rdflib(expected_function_triplestore: str) -> None: """Specifically test the rdflib backend Triplestore. From 865b374190f559011fe5a17eec2a6d4e52ad0a97 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 29 Sep 2025 05:48:16 +0000 Subject: [PATCH 18/19] [pre-commit.ci] auto fixes from pre-commit hooks For more information, see https://pre-commit.ci --- tests/datadoc/test_dataset.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/datadoc/test_dataset.py b/tests/datadoc/test_dataset.py index 997ee192..673ea72f 100644 --- a/tests/datadoc/test_dataset.py +++ b/tests/datadoc/test_dataset.py @@ -355,11 +355,10 @@ def test_update_classes(): } in r3["subClassOf"] - -#sessionName = "FusekiTest" +# sessionName = "FusekiTest" sessionName = "RdflibTest" if True: -#def datasettest(sessionName): + # def datasettest(sessionName): """Test save_datadoc() and acquire()/store().""" # pylint: disable=too-many-statements @@ -796,6 +795,7 @@ def get_triplestore(tsname: str) -> "Triplestore": # Use service configured in tests/input/session.yaml + def 
test_rdflib_datadoc(): """Test the dataset module using rdflib.""" datasettest("RdflibTest") From 5e95c1876f12e58d5ee960b202de61159b70cd05 Mon Sep 17 00:00:00 2001 From: Jesper Friis Date: Mon, 29 Sep 2025 08:34:55 +0200 Subject: [PATCH 19/19] Added iriquote argument --- tests/test_utils.py | 5 +++++ tripper/utils.py | 22 ++++++++++++++++++++-- 2 files changed, 25 insertions(+), 2 deletions(-) diff --git a/tests/test_utils.py b/tests/test_utils.py index c14b452a..984b212c 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -557,6 +557,11 @@ def test_substitute_query(): r' "John Dow\" . \"" }' ) + assert substitute_query("$x $y", iris={"x": "X"}) == " $y" + assert substitute_query("$x", iris={"x": "X"}, iriquote="[]") == "[X]" + assert substitute_query("$x", iris={"x": "X"}, iriquote=" ") == " X " + assert substitute_query("$x", iris={"x": "X"}, iriquote=None) == "X" + def test_get_entry_points(): """Test get_entry_points()""" diff --git a/tripper/utils.py b/tripper/utils.py index 5decfc69..c77b3e6c 100644 --- a/tripper/utils.py +++ b/tripper/utils.py @@ -10,6 +10,7 @@ import sys import tempfile import urllib +import warnings from contextlib import contextmanager from pathlib import Path from typing import TYPE_CHECKING @@ -739,6 +740,7 @@ def substitute_query( iris: "Optional[dict]" = None, literals: "Optional[dict]" = None, prefixes: "Optional[dict]" = None, + iriquote: str = "<>", ) -> "Any": """Substitute IRI and literal variables in a SPARQL query. @@ -753,6 +755,8 @@ def substitute_query( For special cases or more control, provide the values as instances of `tripper.Literal`. prefixes: Dict mapping prefixes to namespace URLs. + iriquote: Quote characters to use for IRIs. Should be a string of + length 2, with the start and end quote. Notes: The `query` argument may contain variables for IRIs and literals, @@ -765,15 +769,29 @@ def substitute_query( in the query as a single token. This may prevent sparql injection attacks. 
""" + safe = "-._~:/?#@+&;=" # special IRI characters that are not escaped mapping = {} + if iriquote: + if len(iriquote) == 1: + iriquote = iriquote[0] * 2 + elif len(iriquote) > 2: + raise ValueError( + f"`iriquote` cannot be more than 2 characters: '{iriquote}'" + ) + if iriquote[1].isalnum() or iriquote[1] in safe: + warnings.warn( + f"End quote '{iriquote[1]}' is alphanumeric or in '{safe}'" + ) + if iris: if prefixes is None: prefixes = {} for k, v in iris.items(): expanded = expand_iri(v, prefixes=prefixes) - quoted = urllib.parse.quote(expanded, safe=":~/?&;=#") - mapping[k] = f"<{quoted}>" + quoted = urllib.parse.quote(expanded, safe=safe) + q1, q2 = iriquote if iriquote else ("", "") # type: ignore[misc] + mapping[k] = f"{q1}{quoted}{q2}" if literals: for k, v in literals.items():