Multiple doc types#732
Conversation
241a5e4 to
ad92c3a
Compare
ad92c3a to
afa86dc
Compare
| del new_version_entry["pids"] | ||
|
|
||
| # Preserve existing programmes in new versions | ||
| existing_programmes = ( |
There was a problem hiding this comment.
check if the custom fields are preserved between versions
3062e73 to
85fa1ee
Compare
| # error_message = f"Unexpected error while processing entry: {str(e)}." | ||
| import traceback | ||
| traceback.print_exc() | ||
| except Exception as e: |
There was a problem hiding this comment.
I think the generic except Exception case is a bit risky here. For WriterError and ValidationError we actually build an error_message and add it to stream_entry.errors, so the failed entry is tracked properly. But here we only print the traceback and then continue. So if _route() fails because of some unexpected bug, nothing gets added to stream_entry.errors, op_type can stay unset, and the function still returns the entry. That feels like it could leave it in a half-failed state without the failure being tracked clearly. Should we either add an explicit error there too?
| for doc_type in doc_types: | ||
| self.logger.debug(f"Mapping {doc_type} to version.") | ||
| resource_type = INSPIRE_DOCUMENT_TYPE_MAPPING[doc_type] | ||
| self.logger.info(f"Mapped {doc_type} to {resource_type}.") | ||
| if resource_type is not self.main_res_type: | ||
| version_ctx = MetadataSerializationContext( | ||
| resource_type=resource_type, | ||
| inspire_id=self.inspire_id, | ||
| cds_rdm_id=self.cds_id, | ||
| ) | ||
| mappers = self.policy.build_for(resource_type) | ||
| assert_unique_ids(mappers) | ||
| patches = [ | ||
| m.apply(self.inspire_record, version_ctx, self.logger) | ||
| for m in mappers | ||
| ] | ||
|
|
There was a problem hiding this comment.
I might be misunderstanding this, but is there a risk of generating duplicate versions for the same target resource type here? split() loops over every raw INSPIRE document_type, and if two different doc types map to the same CDS resource_type, it looks like we would append two versions with the same target type. Then later the writer would process that same resource type more than once. I’m also wondering if the unused _PREPRINT_DOC_TYPES constant was meant to group some of these doc types together instead.
| source = abstract.get("source", "").lower() | ||
| if source and source not in ["arxiv", "cds"]: | ||
| return abstract["value"] | ||
| return abstracts[0]["value"] No newline at end of file |
There was a problem hiding this comment.
i think this may return too early. Because the fallback return abstracts[0]["value"] is inside the loop, it looks like we only really inspect the first abstract. So if a preferred non-arxiv/non-cds abstract appears later in the list, we would never reach it. Was the fallback meant to be outside the loop?
| source = abstract.get("source", "").lower() | ||
| if source and source in ["arxiv", "cds"]: | ||
| return abstract["value"] | ||
| return abstracts[0]["value"] No newline at end of file |
There was a problem hiding this comment.
I think this may have the same early return issue as article.py.
* change (harvester): skip update if metadata is equal # Conflicts: # site/cds_rdm/inspire_harvester/writer.py
refactor: exception handling # Conflicts: # site/cds_rdm/legacy/redirector.py # Conflicts: # site/cds_rdm/legacy/redirector.py
* refactor(harvester): add specialised classes to handle
drafts, files, matching records
* change(harvester): usage of resource types
* change(harvester): add license mapping
6e1a803 to
0d84f9d
Compare
| f"Imported files to {draft.id} from previous version: {record.id}" | ||
| ) | ||
|
|
||
| new_files = incoming_record["files"].get("entries", {}) |
There was a problem hiding this comment.
was it not .files? and it should always exist right? Otherwise we might wanna do get to not error out.
| source = abstract.get("source", "").lower() | ||
| if source and source not in ["arxiv", "cds"]: | ||
| return abstract["value"] | ||
| return abstracts[0]["value"] No newline at end of file |
| filename = file["filename"] | ||
| source = file.get("source") | ||
| url = file["url"] | ||
| if "pdf" not in filename: |
There was a problem hiding this comment.
I think it would be better to use .endswith(".pdf") so that we don't miss edge cases like "Fulltext_pdf_paper"
|
|
||
| def filter(self, doi): | ||
| """Filter doi based on given criteria.""" | ||
| return True |
There was a problem hiding this comment.
Is it added for a future use case?
| from cds_rdm.inspire_harvester.utils import assert_unique_ids, deep_merge_all | ||
|
|
||
| # Doc types that belong to the arXiv/preprint stream | ||
| _PREPRINT_DOC_TYPES = frozenset({"report", "note", "activity report"}) |
There was a problem hiding this comment.
Is this used somewhere? Is it needed?
| else: | ||
| self._create_record(stream_entry) | ||
| return "create" |
There was a problem hiding this comment.
Is it possible to have a new entry (first time harvest) with multiple resource types?
| del new_version_entry["pids"]["oai"] | ||
| if new_version_entry["pids"]["doi"]["provider"] != "external": | ||
| del new_version_entry["pids"]["doi"] |
There was a problem hiding this comment.
Is it possible to have an entry without oai or doi?
| def compute_diff(self, existing_files, new_files) -> FileDiff: | ||
| """Return the set difference between existing and new file checksums.""" | ||
| existing_checksums = [value["checksum"] for value in existing_files.values()] | ||
| new_checksums = [value["checksum"] for value in new_files.values()] |
There was a problem hiding this comment.
Since arxiv files dont have a checksum here and we generate the checksum when file uploaded, if we receive the same arxiv file it'll have None in the new_checksums if I understand correctly. Could this cause the same file to be treated as changed?
| inspire_url = file.get("source_url") | ||
| file_content = self.fetch(inspire_url, logger) | ||
| self._upload_file(draft, file, file_content, logger) | ||
| logger.info(f"{len(new_files)} files successfully created.") |
There was a problem hiding this comment.
| logger.info(f"{len(new_files)} files successfully created.") | |
| logger.info(f"{len(diff.to_add)} files successfully created.") |
| ror_ids = [] | ||
| if affiliations_identifiers: | ||
| ror_ids = [ | ||
| normalize_ror(ai["value"]) | ||
| for ai in affiliations_identifiers | ||
| if ai.get("schema") == "ROR" | ||
| ] | ||
|
|
||
| for i, affiliation in enumerate(affiliations): | ||
| if i < len(ror_ids): | ||
| mapped_affiliations.append({"id": ror_ids[i]}) | ||
| else: | ||
| value = affiliation.get("value") | ||
| if value: | ||
| mapped_affiliations.append({"name": value.rstrip(".").strip()}) |
There was a problem hiding this comment.
Is it possible for affiliations and ror_ids to have different lengths? If so, could we end up assigning the wrong ROR to an affiliation because the mapping relies on the index?
| return False | ||
| if (not material or material == "preprint") and source == "arxiv": | ||
| return True | ||
| if source == "CDS": |
There was a problem hiding this comment.
| if source == "CDS": | |
| if source.lower() == "cds": |
isn't it safer like this?
Uh oh!
There was an error while loading. Please reload this page.