From 12f8d27de04a3e72751df9fe2541f8520b31649b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eray=20=C3=96zcan?= Date: Thu, 4 Sep 2025 15:13:50 +0200 Subject: [PATCH 01/16] fix: replace encodestring encodebytes for recent python versions compatibility --- src/oaipmh/client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/oaipmh/client.py b/src/oaipmh/client.py index fc8dba5..2f0664e 100644 --- a/src/oaipmh/client.py +++ b/src/oaipmh/client.py @@ -335,7 +335,7 @@ def __init__(self, base_url, metadata_registry=None, credentials=None, self._local_file = local_file self._force_http_get = force_http_get if credentials is not None: - self._credentials = base64.encodestring('%s:%s' % credentials) + self._credentials = base64.encodebytes(credentials.encode()).decode() else: self._credentials = None From 847da20d0ba80e37c955a2c29090af166b23ef0a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eray=20=C3=96zcan?= Date: Thu, 4 Sep 2025 16:37:58 +0200 Subject: [PATCH 02/16] fix: disable toolkit_description to prevent deprecated pkg_resources package error --- src/oaipmh/common.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/oaipmh/common.py b/src/oaipmh/common.py index c602ada..cec1437 100644 --- a/src/oaipmh/common.py +++ b/src/oaipmh/common.py @@ -1,5 +1,3 @@ -import pkg_resources - from oaipmh import error class Header(object): @@ -49,7 +47,7 @@ def getField(self, name): class Identify(object): def __init__(self, repositoryName, baseURL, protocolVersion, adminEmails, earliestDatestamp, deletedRecord, granularity, compression, - toolkit_description=True): + toolkit_description=False): self._repositoryName = repositoryName self._baseURL = baseURL self._protocolVersion = protocolVersion @@ -59,8 +57,10 @@ def __init__(self, repositoryName, baseURL, protocolVersion, adminEmails, self._granularity = granularity self._compression = compression self._descriptions = [] - + if toolkit_description: + import pkg_resources + req = pkg_resources.Requirement.parse('pyoai') egg = pkg_resources.working_set.find(req) if egg: @@ -77,7 +77,7 @@ def __init__(self, repositoryName, baseURL, protocolVersion, adminEmails, '%s' 'http://infrae.com/products/oaipack' '' % version) - + def repositoryName(self): return self._repositoryName From a485326c2c43e4509d6049f6721765b2c7272a29 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eray=20=C3=96zcan?= Date: Thu, 4 Sep 2025 18:07:15 +0200 Subject: [PATCH 03/16] fix: replace deprecated evaluator with xpath --- src/oaipmh/client.py | 79 +++++++++++++++++++----------------------- src/oaipmh/metadata.py | 19 +++------- 2 files changed, 40 insertions(+), 58 deletions(-) diff --git a/src/oaipmh/client.py b/src/oaipmh/client.py index 2f0664e..eaeb762 100644 --- a/src/oaipmh/client.py +++ b/src/oaipmh/client.py @@ -12,7 +12,6 @@ import urllib2 from urllib import urlencode -import sys import base64 from lxml import etree import time @@ -142,12 +141,11 @@ def GetMetadata_impl(self, args, tree): def Identify_impl(self, args, tree): namespaces = self.getNamespaces() - evaluator = etree.XPathEvaluator(tree, namespaces=namespaces) - identify_node = evaluator.evaluate( - '/oai:OAI-PMH/oai:Identify')[0] - identify_evaluator = etree.XPathEvaluator(identify_node, - namespaces=namespaces) - e = identify_evaluator.evaluate + identify_node = tree.xpath( + '/oai:OAI-PMH/oai:Identify', + namespaces=namespaces + )[0] + e = identify_node.xpath repositoryName = e('string(oai:repositoryName/text())') baseURL = e('string(oai:baseURL/text())') @@ -177,15 +175,13 @@ def nextBatch(token): def ListMetadataFormats_impl(self, args, tree): namespaces = self.getNamespaces() - evaluator = etree.XPathEvaluator(tree, - namespaces=namespaces) - - metadataFormat_nodes = evaluator.evaluate( - '/oai:OAI-PMH/oai:ListMetadataFormats/oai:metadataFormat') + metadataFormat_nodes = tree.xpath( + '/oai:OAI-PMH/oai:ListMetadataFormats/oai:metadataFormat', + namespaces=namespaces + ) metadataFormats = [] for metadataFormat_node in metadataFormat_nodes: - e = etree.XPathEvaluator(metadataFormat_node, - namespaces=namespaces).evaluate + e = metadataFormat_node.xpath metadataPrefix = e('string(oai:metadataPrefix/text())') schema = e('string(oai:schema/text())') metadataNamespace = e('string(oai:metadataNamespace/text())') @@ -224,22 +220,17 @@ def nextBatch(token): # various helper methods - def buildRecords(self, - metadata_prefix, namespaces, metadata_registry, tree): - # first find resumption token if available - evaluator = etree.XPathEvaluator(tree, - namespaces=namespaces) - token = evaluator.evaluate( - 'string(/oai:OAI-PMH/*/oai:resumptionToken/text())') + def buildRecords(self, metadata_prefix, namespaces, metadata_registry, tree): + token = tree.xpath( + 'string(/oai:OAI-PMH/*/oai:resumptionToken/text())', + namespaces=namespaces + ) if token.strip() == '': token = None - record_nodes = evaluator.evaluate( - '/oai:OAI-PMH/*/oai:record') + record_nodes = tree.xpath('/oai:OAI-PMH/*/oai:record', namespaces=namespaces) result = [] for record_node in record_nodes: - record_evaluator = etree.XPathEvaluator(record_node, - namespaces=namespaces) - e = record_evaluator.evaluate + e = record_node.xpath # find header node header_node = e('oai:header')[0] # create header @@ -258,16 +249,17 @@ def buildRecords(self, return result, token def buildIdentifiers(self, namespaces, tree): - evaluator = etree.XPathEvaluator(tree, - namespaces=namespaces) - # first find resumption token is available - token = evaluator.evaluate( - 'string(/oai:OAI-PMH/*/oai:resumptionToken/text())') #'string(/oai:OAI-PMH/oai:ListIdentifiers/oai:resumptionToken/text())') + token = tree.xpath( + 'string(/oai:OAI-PMH/*/oai:resumptionToken/text())', + namespaces=namespaces + ) if token.strip() == '': token = None - header_nodes = evaluator.evaluate( - '/oai:OAI-PMH/oai:ListIdentifiers/oai:header') + header_nodes = tree.xpath( + '/oai:OAI-PMH/oai:ListIdentifiers/oai:header', + namespaces=namespaces + ) result = [] for header_node in header_nodes: header = buildHeader(header_node, namespaces) @@ -275,19 +267,19 @@ def buildIdentifiers(self, namespaces, tree): return result, token def buildSets(self, namespaces, tree): - evaluator = etree.XPathEvaluator(tree, - namespaces=namespaces) - # first find resumption token if available - token = evaluator.evaluate( - 'string(/oai:OAI-PMH/oai:ListSets/oai:resumptionToken/text())') + token = tree.xpath( + 'string(/oai:OAI-PMH/oai:ListSets/oai:resumptionToken/text())', + namespaces=namespaces + ) if token.strip() == '': token = None - set_nodes = evaluator.evaluate( - '/oai:OAI-PMH/oai:ListSets/oai:set') + set_nodes = tree.xpath( + '/oai:OAI-PMH/oai:ListSets/oai:set', + namespaces=namespaces + ) sets = [] for set_node in set_nodes: - e = etree.XPathEvaluator(set_node, - namespaces=namespaces).evaluate + e = set_node.xpath # make sure we get back unicode strings instead # of lxml.etree._ElementUnicodeResult objects. setSpec = six.text_type(e('string(oai:setSpec/text())')) @@ -367,8 +359,7 @@ def makeRequest(self, **kw): ) def buildHeader(header_node, namespaces): - e = etree.XPathEvaluator(header_node, - namespaces=namespaces).evaluate + e = header_node.xpath identifier = e('string(oai:identifier/text())') datestamp = datestamp_to_datetime( str(e('string(oai:datestamp/text())'))) diff --git a/src/oaipmh/metadata.py b/src/oaipmh/metadata.py index 14d9ad0..e837b2a 100644 --- a/src/oaipmh/metadata.py +++ b/src/oaipmh/metadata.py @@ -1,7 +1,5 @@ import sys -from lxml import etree -from lxml.etree import SubElement from oaipmh import common if sys.version_info[0] == 3: @@ -21,7 +19,7 @@ class MetadataRegistry(object): def __init__(self): self._readers = {} self._writers = {} - + def registerReader(self, metadata_prefix, reader): self._readers[metadata_prefix] = reader @@ -30,10 +28,10 @@ def registerWriter(self, metadata_prefix, writer): def hasReader(self, metadata_prefix): return metadata_prefix in self._readers - + def hasWriter(self, metadata_prefix): return metadata_prefix in self._writers - + def readMetadata(self, metadata_prefix, element): """Turn XML into metadata object. @@ -45,7 +43,7 @@ def readMetadata(self, metadata_prefix, element): def writeMetadata(self, metadata_prefix, element, metadata): """Write metadata as XML. - + element - ElementTree element to write under metadata - metadata object to write """ @@ -65,11 +63,7 @@ def __init__(self, fields, namespaces=None): def __call__(self, element): map = {} - # create XPathEvaluator for this element - xpath_evaluator = etree.XPathEvaluator(element, - namespaces=self._namespaces) - - e = xpath_evaluator.evaluate + e = element.xpath # now extra field info according to xpath expr for field_name, (field_type, expr) in list(self._fields.items()): if field_type == 'bytes': @@ -111,6 +105,3 @@ def __call__(self, element): 'oai_dc': 'http://www.openarchives.org/OAI/2.0/oai_dc/', 'dc' : 'http://purl.org/dc/elements/1.1/'} ) - - - From c632816a8fc15a48e916d41f9f2098d5cb1e3afb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eray=20=C3=96zcan?= Date: Thu, 4 Sep 2025 18:22:13 +0200 Subject: [PATCH 04/16] fix: add missing namespaces --- src/oaipmh/client.py | 38 +++++++++++++++++++------------------- src/oaipmh/metadata.py | 8 ++++---- 2 files changed, 23 insertions(+), 23 deletions(-) diff --git a/src/oaipmh/client.py b/src/oaipmh/client.py index eaeb762..22e9a30 100644 --- a/src/oaipmh/client.py +++ b/src/oaipmh/client.py @@ -147,15 +147,15 @@ def Identify_impl(self, args, tree): )[0] e = identify_node.xpath - repositoryName = e('string(oai:repositoryName/text())') - baseURL = e('string(oai:baseURL/text())') - protocolVersion = e('string(oai:protocolVersion/text())') - adminEmails = e('oai:adminEmail/text()') + repositoryName = e('string(oai:repositoryName/text())', namespaces=namespaces) + baseURL = e('string(oai:baseURL/text())', namespaces=namespaces) + protocolVersion = e('string(oai:protocolVersion/text())', namespaces=namespaces) + adminEmails = e('oai:adminEmail/text()', namespaces=namespaces) earliestDatestamp = datestamp_to_datetime( - e('string(oai:earliestDatestamp/text())')) - deletedRecord = e('string(oai:deletedRecord/text())') - granularity = e('string(oai:granularity/text())') - compression = e('oai:compression/text()') + e('string(oai:earliestDatestamp/text())', namespaces=namespaces)) + deletedRecord = e('string(oai:deletedRecord/text())', namespaces=namespaces) + granularity = e('string(oai:granularity/text())', namespaces=namespaces) + compression = e('oai:compression/text()', namespaces=namespaces) # XXX description identify = common.Identify( repositoryName, baseURL, protocolVersion, @@ -182,9 +182,9 @@ def ListMetadataFormats_impl(self, args, tree): metadataFormats = [] for metadataFormat_node in metadataFormat_nodes: e = metadataFormat_node.xpath - metadataPrefix = e('string(oai:metadataPrefix/text())') - schema = e('string(oai:schema/text())') - metadataNamespace = e('string(oai:metadataNamespace/text())') + metadataPrefix = e('string(oai:metadataPrefix/text())', namespaces=namespaces) + schema = e('string(oai:schema/text())', namespaces=namespaces) + metadataNamespace = e('string(oai:metadataNamespace/text())', namespaces=namespaces) metadataFormat = (metadataPrefix, schema, metadataNamespace) metadataFormats.append(metadataFormat) @@ -232,11 +232,11 @@ def buildRecords(self, metadata_prefix, namespaces, metadata_registry, tree): for record_node in record_nodes: e = record_node.xpath # find header node - header_node = e('oai:header')[0] + header_node = e('oai:header', namespaces=namespaces)[0] # create header header = buildHeader(header_node, namespaces) # find metadata node - metadata_list = e('oai:metadata') + metadata_list = e('oai:metadata', namespaces=namespaces) if metadata_list: metadata_node = metadata_list[0] # create metadata @@ -282,8 +282,8 @@ def buildSets(self, namespaces, tree): e = set_node.xpath # make sure we get back unicode strings instead # of lxml.etree._ElementUnicodeResult objects. - setSpec = six.text_type(e('string(oai:setSpec/text())')) - setName = six.text_type(e('string(oai:setName/text())')) + setSpec = six.text_type(e('string(oai:setSpec/text())', namespaces=namespaces)) + setName = six.text_type(e('string(oai:setName/text())', namespaces=namespaces)) # XXX setDescription nodes sets.append((setSpec, setName, None)) return sets, token @@ -360,11 +360,11 @@ def makeRequest(self, **kw): def buildHeader(header_node, namespaces): e = header_node.xpath - identifier = e('string(oai:identifier/text())') + identifier = e('string(oai:identifier/text())', namespaces=namespaces) datestamp = datestamp_to_datetime( - str(e('string(oai:datestamp/text())'))) - setspec = [str(s) for s in e('oai:setSpec/text()')] - deleted = e("@status = 'deleted'") + str(e('string(oai:datestamp/text())', namespaces=namespaces))) + setspec = [str(s) for s in e('oai:setSpec/text()', namespaces=namespaces)] + deleted = e("@status = 'deleted'", namespaces=namespaces) return common.Header(header_node, identifier, datestamp, setspec, deleted) def ResumptionListGenerator(firstBatch, nextBatch): diff --git a/src/oaipmh/metadata.py b/src/oaipmh/metadata.py index e837b2a..2cee78f 100644 --- a/src/oaipmh/metadata.py +++ b/src/oaipmh/metadata.py @@ -67,17 +67,17 @@ def __call__(self, element): # now extra field info according to xpath expr for field_name, (field_type, expr) in list(self._fields.items()): if field_type == 'bytes': - value = str(e(expr)) + value = str(e(expr, namespace=self._namespaces)) elif field_type == 'bytesList': - value = [str(item) for item in e(expr)] + value = [str(item) for item in e(expr, namespace=self._namespaces)] elif field_type == 'text': # make sure we get back unicode strings instead # of lxml.etree._ElementUnicodeResult objects. - value = text_type(e(expr)) + value = text_type(e(expr, namespace=self._namespaces)) elif field_type == 'textList': # make sure we get back unicode strings instead # of lxml.etree._ElementUnicodeResult objects. - value = [text_type(v) for v in e(expr)] + value = [text_type(v) for v in e(expr, namespace=self._namespaces)] else: raise Error("Unknown field type: %s" % field_type) map[field_name] = value From 068e0b341eefa50b558adec4df1091ae0bbe76a0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eray=20=C3=96zcan?= Date: Thu, 4 Sep 2025 19:07:42 +0200 Subject: [PATCH 05/16] debug: convert dict result to list of dict --- src/oaipmh/metadata.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/src/oaipmh/metadata.py b/src/oaipmh/metadata.py index 2cee78f..21c6f42 100644 --- a/src/oaipmh/metadata.py +++ b/src/oaipmh/metadata.py @@ -78,6 +78,23 @@ def __call__(self, element): # make sure we get back unicode strings instead # of lxml.etree._ElementUnicodeResult objects. value = [text_type(v) for v in e(expr, namespace=self._namespaces)] + elif field_type == 'textList': + # Make sure we get back unicode strings instead + # of lxml.etree._ElementUnicodeResult objects. + + # Run the XPath query and get the result + result = e(expr, namespace=self._namespaces) + + # Check if the result is a list. If not, treat it as a single item. + if isinstance(result, list): + # The result is a list, so iterate and convert each element + value = [text_type(v) for v in result] + elif result is not None: + # The result is a single value, so wrap it in a list + value = [text_type(result)] + else: + # The result is None (e.g., no match), so return an empty list + value = [] else: raise Error("Unknown field type: %s" % field_type) map[field_name] = value From 87652833c99cf1bda83632a16e0eb82ad33132a7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eray=20=C3=96zcan?= Date: Thu, 4 Sep 2025 19:10:23 +0200 Subject: [PATCH 06/16] debug: remove wrong condition --- src/oaipmh/metadata.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/oaipmh/metadata.py b/src/oaipmh/metadata.py index 21c6f42..d6073b0 100644 --- a/src/oaipmh/metadata.py +++ b/src/oaipmh/metadata.py @@ -74,10 +74,6 @@ def __call__(self, element): # make sure we get back unicode strings instead # of lxml.etree._ElementUnicodeResult objects. value = text_type(e(expr, namespace=self._namespaces)) - elif field_type == 'textList': - # make sure we get back unicode strings instead - # of lxml.etree._ElementUnicodeResult objects. - value = [text_type(v) for v in e(expr, namespace=self._namespaces)] elif field_type == 'textList': # Make sure we get back unicode strings instead # of lxml.etree._ElementUnicodeResult objects. From 3711a0a24b82eaee47e1bddfae1103253ec7c806 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eray=20=C3=96zcan?= Date: Thu, 4 Sep 2025 19:12:14 +0200 Subject: [PATCH 07/16] debug: remove namespace --- src/oaipmh/metadata.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/oaipmh/metadata.py b/src/oaipmh/metadata.py index d6073b0..0827992 100644 --- a/src/oaipmh/metadata.py +++ b/src/oaipmh/metadata.py @@ -79,7 +79,7 @@ def __call__(self, element): # of lxml.etree._ElementUnicodeResult objects. # Run the XPath query and get the result - result = e(expr, namespace=self._namespaces) + result = e(expr) # Check if the result is a list. If not, treat it as a single item. if isinstance(result, list): From 4709154aeadeac3cbfe90dbdfc03ba4e6d1de9a3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eray=20=C3=96zcan?= Date: Thu, 4 Sep 2025 19:16:45 +0200 Subject: [PATCH 08/16] Revert "debug: remove namespace" This reverts commit 3711a0a24b82eaee47e1bddfae1103253ec7c806. --- src/oaipmh/metadata.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/oaipmh/metadata.py b/src/oaipmh/metadata.py index 0827992..d6073b0 100644 --- a/src/oaipmh/metadata.py +++ b/src/oaipmh/metadata.py @@ -79,7 +79,7 @@ def __call__(self, element): # of lxml.etree._ElementUnicodeResult objects. # Run the XPath query and get the result - result = e(expr) + result = e(expr, namespace=self._namespaces) # Check if the result is a list. If not, treat it as a single item. if isinstance(result, list): From 2c6ef742ed4ab4abaea5dc4c1c0eef768426d240 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eray=20=C3=96zcan?= Date: Thu, 4 Sep 2025 19:16:54 +0200 Subject: [PATCH 09/16] Revert "debug: remove wrong condition" This reverts commit 87652833c99cf1bda83632a16e0eb82ad33132a7. --- src/oaipmh/metadata.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/oaipmh/metadata.py b/src/oaipmh/metadata.py index d6073b0..21c6f42 100644 --- a/src/oaipmh/metadata.py +++ b/src/oaipmh/metadata.py @@ -74,6 +74,10 @@ def __call__(self, element): # make sure we get back unicode strings instead # of lxml.etree._ElementUnicodeResult objects. value = text_type(e(expr, namespace=self._namespaces)) + elif field_type == 'textList': + # make sure we get back unicode strings instead + # of lxml.etree._ElementUnicodeResult objects. + value = [text_type(v) for v in e(expr, namespace=self._namespaces)] elif field_type == 'textList': # Make sure we get back unicode strings instead # of lxml.etree._ElementUnicodeResult objects. From f45ac23a414748c621cbfdac404482b86a60c185 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eray=20=C3=96zcan?= Date: Thu, 4 Sep 2025 19:17:02 +0200 Subject: [PATCH 10/16] Revert "debug: convert dict result to list of dict" This reverts commit 068e0b341eefa50b558adec4df1091ae0bbe76a0. --- src/oaipmh/metadata.py | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/src/oaipmh/metadata.py b/src/oaipmh/metadata.py index 21c6f42..2cee78f 100644 --- a/src/oaipmh/metadata.py +++ b/src/oaipmh/metadata.py @@ -78,23 +78,6 @@ def __call__(self, element): # make sure we get back unicode strings instead # of lxml.etree._ElementUnicodeResult objects. value = [text_type(v) for v in e(expr, namespace=self._namespaces)] - elif field_type == 'textList': - # Make sure we get back unicode strings instead - # of lxml.etree._ElementUnicodeResult objects. - - # Run the XPath query and get the result - result = e(expr, namespace=self._namespaces) - - # Check if the result is a list. If not, treat it as a single item. - if isinstance(result, list): - # The result is a list, so iterate and convert each element - value = [text_type(v) for v in result] - elif result is not None: - # The result is a single value, so wrap it in a list - value = [text_type(result)] - else: - # The result is None (e.g., no match), so return an empty list - value = [] else: raise Error("Unknown field type: %s" % field_type) map[field_name] = value From adb667a2f59167c5e156920c4862d44ed1d956e3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eray=20=C3=96zcan?= Date: Thu, 4 Sep 2025 19:18:09 +0200 Subject: [PATCH 11/16] debug: raise exception --- src/oaipmh/metadata.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/oaipmh/metadata.py b/src/oaipmh/metadata.py index 2cee78f..da0ff79 100644 --- a/src/oaipmh/metadata.py +++ b/src/oaipmh/metadata.py @@ -77,6 +77,7 @@ def __call__(self, element): elif field_type == 'textList': # make sure we get back unicode strings instead # of lxml.etree._ElementUnicodeResult objects. + raise Exception(f"{element} - {expr} - {self._namespaces}") value = [text_type(v) for v in e(expr, namespace=self._namespaces)] else: raise Error("Unknown field type: %s" % field_type) From dea8c92f56fb183e72d35553293319f472579738 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eray=20=C3=96zcan?= Date: Thu, 4 Sep 2025 19:25:27 +0200 Subject: [PATCH 12/16] debug: test new __call__ method --- src/oaipmh/metadata.py | 54 +++++++++++++++++++++++++++++------------- 1 file changed, 37 insertions(+), 17 deletions(-) diff --git a/src/oaipmh/metadata.py b/src/oaipmh/metadata.py index da0ff79..2015f11 100644 --- a/src/oaipmh/metadata.py +++ b/src/oaipmh/metadata.py @@ -63,25 +63,45 @@ def __init__(self, fields, namespaces=None): def __call__(self, element): map = {} + # Alias for element.xpath e = element.xpath - # now extra field info according to xpath expr for field_name, (field_type, expr) in list(self._fields.items()): - if field_type == 'bytes': - value = str(e(expr, namespace=self._namespaces)) - elif field_type == 'bytesList': - value = [str(item) for item in e(expr, namespace=self._namespaces)] - elif field_type == 'text': - # make sure we get back unicode strings instead - # of lxml.etree._ElementUnicodeResult objects. - value = text_type(e(expr, namespace=self._namespaces)) - elif field_type == 'textList': - # make sure we get back unicode strings instead - # of lxml.etree._ElementUnicodeResult objects. - raise Exception(f"{element} - {expr} - {self._namespaces}") - value = [text_type(v) for v in e(expr, namespace=self._namespaces)] - else: - raise Error("Unknown field type: %s" % field_type) - map[field_name] = value + try: + # The core logic is to safely handle the result from xpath() + raw_result = e(expr, namespaces=self._namespaces) + + value = None + if field_type == 'bytes': + value = str(raw_result) + elif field_type == 'bytesList': + # Ensure the result is iterable before the list comprehension + value = [str(item) for item in (raw_result if isinstance(raw_result, list) else [raw_result])] + elif field_type == 'text': + value = text_type(raw_result) + elif field_type == 'textList': + # This is the critical part to fix the error + if isinstance(raw_result, list): + # This handles the expected case: a list of elements/strings + value = [text_type(v) for v in raw_result] + elif raw_result is not None: + # This handles a single value being returned + value = [text_type(raw_result)] + else: + # Handles cases with no result (None) + value = [] + else: + raise Error("Unknown field type: %s" % field_type) + + map[field_name] = value + + except Exception as ex: + # A robust way to prevent crashes + print(f"Warning: Error processing field '{field_name}' with expression '{expr}': {ex}", file=sys.stderr) + if field_type.endswith('List'): + map[field_name] = [] + else: + map[field_name] = "" + return common.Metadata(element, map) oai_dc_reader = MetadataReader( From 775909d6fcb9a41c0d250d4d0907b59475fe3f24 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eray=20=C3=96zcan?= Date: Thu, 4 Sep 2025 19:28:00 +0200 Subject: [PATCH 13/16] Revert "debug: test new __call__ method" This reverts commit dea8c92f56fb183e72d35553293319f472579738. --- src/oaipmh/metadata.py | 54 +++++++++++++----------------------------- 1 file changed, 17 insertions(+), 37 deletions(-) diff --git a/src/oaipmh/metadata.py b/src/oaipmh/metadata.py index 2015f11..da0ff79 100644 --- a/src/oaipmh/metadata.py +++ b/src/oaipmh/metadata.py @@ -63,45 +63,25 @@ def __init__(self, fields, namespaces=None): def __call__(self, element): map = {} - # Alias for element.xpath e = element.xpath + # now extra field info according to xpath expr for field_name, (field_type, expr) in list(self._fields.items()): - try: - # The core logic is to safely handle the result from xpath() - raw_result = e(expr, namespaces=self._namespaces) - - value = None - if field_type == 'bytes': - value = str(raw_result) - elif field_type == 'bytesList': - # Ensure the result is iterable before the list comprehension - value = [str(item) for item in (raw_result if isinstance(raw_result, list) else [raw_result])] - elif field_type == 'text': - value = text_type(raw_result) - elif field_type == 'textList': - # This is the critical part to fix the error - if isinstance(raw_result, list): - # This handles the expected case: a list of elements/strings - value = [text_type(v) for v in raw_result] - elif raw_result is not None: - # This handles a single value being returned - value = [text_type(raw_result)] - else: - # Handles cases with no result (None) - value = [] - else: - raise Error("Unknown field type: %s" % field_type) - - map[field_name] = value - - except Exception as ex: - # A robust way to prevent crashes - print(f"Warning: Error processing field '{field_name}' with expression '{expr}': {ex}", file=sys.stderr) - if field_type.endswith('List'): - map[field_name] = [] - else: - map[field_name] = "" - + if field_type == 'bytes': + value = str(e(expr, namespace=self._namespaces)) + elif field_type == 'bytesList': + value = [str(item) for item in e(expr, namespace=self._namespaces)] + elif field_type == 'text': + # make sure we get back unicode strings instead + # of lxml.etree._ElementUnicodeResult objects. + value = text_type(e(expr, namespace=self._namespaces)) + elif field_type == 'textList': + # make sure we get back unicode strings instead + # of lxml.etree._ElementUnicodeResult objects. + raise Exception(f"{element} - {expr} - {self._namespaces}") + value = [text_type(v) for v in e(expr, namespace=self._namespaces)] + else: + raise Error("Unknown field type: %s" % field_type) + map[field_name] = value return common.Metadata(element, map) oai_dc_reader = MetadataReader( From 1b71260d795354fa3b8f696ef66c438e0e366f3f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eray=20=C3=96zcan?= Date: Thu, 4 Sep 2025 19:28:51 +0200 Subject: [PATCH 14/16] fix: metadata reader --- src/oaipmh/metadata.py | 53 ++++++++++++++++++++++++++++-------------- 1 file changed, 36 insertions(+), 17 deletions(-) diff --git a/src/oaipmh/metadata.py b/src/oaipmh/metadata.py index da0ff79..0b90f68 100644 --- a/src/oaipmh/metadata.py +++ b/src/oaipmh/metadata.py @@ -63,25 +63,44 @@ def __init__(self, fields, namespaces=None): def __call__(self, element): map = {} + # Alias for element.xpath e = element.xpath - # now extra field info according to xpath expr for field_name, (field_type, expr) in list(self._fields.items()): - if field_type == 'bytes': - value = str(e(expr, namespace=self._namespaces)) - elif field_type == 'bytesList': - value = [str(item) for item in e(expr, namespace=self._namespaces)] - elif field_type == 'text': - # make sure we get back unicode strings instead - # of lxml.etree._ElementUnicodeResult objects. - value = text_type(e(expr, namespace=self._namespaces)) - elif field_type == 'textList': - # make sure we get back unicode strings instead - # of lxml.etree._ElementUnicodeResult objects. - raise Exception(f"{element} - {expr} - {self._namespaces}") - value = [text_type(v) for v in e(expr, namespace=self._namespaces)] - else: - raise Error("Unknown field type: %s" % field_type) - map[field_name] = value + try: + # The core logic is to safely handle the result from xpath() + raw_result = e(expr, namespaces=self._namespaces) + + value = None + if field_type == 'bytes': + value = str(raw_result) + elif field_type == 'bytesList': + # Ensure the result is iterable before the list comprehension + value = [str(item) for item in (raw_result if isinstance(raw_result, list) else [raw_result])] + elif field_type == 'text': + value = text_type(raw_result) + elif field_type == 'textList': + # This is the critical part to fix the error + if isinstance(raw_result, list): + # This handles the expected case: a list of elements/strings + value = [text_type(v) for v in raw_result] + elif raw_result is not None: + # This handles a single value being returned + value = [text_type(raw_result)] + else: + # Handles cases with no result (None) + value = [] + else: + raise Error("Unknown field type: %s" % field_type) + + map[field_name] = value + except Exception as ex: + # A robust way to prevent crashes + print(f"Warning: Error processing field '{field_name}' with expression '{expr}': {ex}", file=sys.stderr) + if field_type.endswith('List'): + map[field_name] = [] + else: + map[field_name] = "" + return common.Metadata(element, map) oai_dc_reader = MetadataReader( From 5fa8b041410c591bdb51b37a98a9b1edc9b77ce5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eray=20=C3=96zcan?= Date: Fri, 5 Sep 2025 10:56:44 +0200 Subject: [PATCH 15/16] feat: support processing raw data --- src/oaipmh/client.py | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/src/oaipmh/client.py b/src/oaipmh/client.py index 22e9a30..e7ed7de 100644 --- a/src/oaipmh/client.py +++ b/src/oaipmh/client.py @@ -319,22 +319,32 @@ def makeRequest(self, **kw): class Client(BaseClient): - def __init__(self, base_url, metadata_registry=None, credentials=None, - local_file=False, force_http_get=False, custom_retry_policy=None): - BaseClient.__init__(self, metadata_registry, - custom_retry_policy=custom_retry_policy) + def __init__( + self, + base_url, + metadata_registry=None, + credentials=None, + local_file=False, + force_http_get=False, + custom_retry_policy=None, + raw_data=None, + ): + BaseClient.__init__( + self, metadata_registry, custom_retry_policy=custom_retry_policy + ) self._base_url = base_url self._local_file = local_file self._force_http_get = force_http_get + self._raw_data = raw_data if credentials is not None: self._credentials = base64.encodebytes(credentials.encode()).decode() else: self._credentials = None def makeRequest(self, **kw): - """Either load a local XML file or actually retrieve XML from a server. - """ - if self._local_file: + if isinstance(self._raw_data, str): + return self._raw_data.encode('ascii', 'replace') + elif self._local_file: with codecs.open(self._base_url, 'r', 'utf-8') as xmlfile: text = xmlfile.read() return text.encode('ascii', 'replace') From 4c9852c05404191abee798bfe8e33c4039b0c955 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eray=20=C3=96zcan?= Date: Mon, 2 Mar 2026 17:33:08 +0100 Subject: [PATCH 16/16] fix: ignore resumptionToken in listRecords if raw_data --- src/oaipmh/client.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/oaipmh/client.py b/src/oaipmh/client.py index e7ed7de..baf8a7a 100644 --- a/src/oaipmh/client.py +++ b/src/oaipmh/client.py @@ -39,7 +39,7 @@ class BaseClient(common.OAIPMH): 'expected-errcodes': {503}, } - def __init__(self, metadata_registry=None, custom_retry_policy=None): + def __init__(self, metadata_registry=None, custom_retry_policy=None, raw_data=None): self._metadata_registry = ( metadata_registry or metadata.global_metadata_registry) self._ignore_bad_character_hack = 0 @@ -47,6 +47,7 @@ def __init__(self, metadata_registry=None, custom_retry_policy=None): self.retry_policy = self.default_retry_policy.copy() if custom_retry_policy is not None: self.retry_policy.update(custom_retry_policy) + self._raw_data = raw_data def updateGranularity(self): """Update the granularity setting dependent on that the server says. @@ -225,7 +226,7 @@ def buildRecords(self, metadata_prefix, namespaces, metadata_registry, tree): 'string(/oai:OAI-PMH/*/oai:resumptionToken/text())', namespaces=namespaces ) - if token.strip() == '': + if token.strip() == '' or self._raw_data: token = None record_nodes = tree.xpath('/oai:OAI-PMH/*/oai:record', namespaces=namespaces) result = [] @@ -330,12 +331,11 @@ def __init__( raw_data=None, ): BaseClient.__init__( - self, metadata_registry, custom_retry_policy=custom_retry_policy + self, metadata_registry, custom_retry_policy=custom_retry_policy, raw_data=raw_data ) self._base_url = base_url self._local_file = local_file self._force_http_get = force_http_get - self._raw_data = raw_data if credentials is not None: self._credentials = base64.encodebytes(credentials.encode()).decode() else: