feat: add google_scholar client#424
Conversation
There was a problem hiding this comment.
Code Review
This pull request introduces a new Google Scholar retrieval backend using SerpAPI, makes the --api-url parameter optional to support backend-specific defaults, and adds support for extracting and prefixing query instructions for specific MTEB tasks (such as IFIR tasks). The review feedback highlights several opportunities to improve robustness, including safer handling of null summary values, ensuring fallback author parsing when name keys are missing, using a reverse-token scan to more accurately extract publication years, and implementing a stricter check for empty or NaN values in query instructions.
Important
The consumer version of Gemini Code Assist on GitHub is being sunset. Starting June 18, 2026, new organization installations will be blocked, and all code review activity will officially cease on July 17, 2026.
For more details on the timeline and next steps, please review the Help Documentation.
| def _parse_result(item: dict[str, Any], rank: int) -> PaperResult: | ||
| publication_info = item.get("publication_info") or {} | ||
| authors = GoogleScholarClient._parse_authors(publication_info) | ||
| year = GoogleScholarClient._parse_year(publication_info.get("summary", "")) |
There was a problem hiding this comment.
If summary is explicitly set to None in the publication_info dictionary, publication_info.get("summary", "") will return None instead of "". This causes _parse_year to receive None and convert it to "None". Using publication_info.get("summary") or "" is safer and more idiomatic.
| year = GoogleScholarClient._parse_year(publication_info.get("summary", "")) | |
| year = GoogleScholarClient._parse_year(publication_info.get("summary") or "") |
| @staticmethod | ||
| def _parse_authors(publication_info: dict[str, Any]) -> list[str]: | ||
| authors = publication_info.get("authors") or [] | ||
| if isinstance(authors, list) and authors: | ||
| return [ | ||
| str(author.get("name") or "") | ||
| for author in authors | ||
| if isinstance(author, dict) and author.get("name") | ||
| ] | ||
|
|
||
| summary = str(publication_info.get("summary") or "") | ||
| if not summary: | ||
| return [] | ||
| author_part = summary.split(" - ", 1)[0] | ||
| return [name.strip() for name in author_part.split(",") if name.strip()] |
There was a problem hiding this comment.
In _parse_authors, if authors is a list of dicts but none of them contain a "name" key, the list comprehension returns an empty list []. However, because the if isinstance(authors, list) and authors: block is entered, the function immediately returns [] and never falls back to parsing the summary string. We should only return the parsed list if it actually contains elements, otherwise fall back to parsing the summary.
| @staticmethod | |
| def _parse_authors(publication_info: dict[str, Any]) -> list[str]: | |
| authors = publication_info.get("authors") or [] | |
| if isinstance(authors, list) and authors: | |
| return [ | |
| str(author.get("name") or "") | |
| for author in authors | |
| if isinstance(author, dict) and author.get("name") | |
| ] | |
| summary = str(publication_info.get("summary") or "") | |
| if not summary: | |
| return [] | |
| author_part = summary.split(" - ", 1)[0] | |
| return [name.strip() for name in author_part.split(",") if name.strip()] | |
| @staticmethod | |
| def _parse_authors(publication_info: dict[str, Any]) -> list[str]: | |
| authors = publication_info.get("authors") or [] | |
| if isinstance(authors, list) and authors: | |
| parsed = [ | |
| str(author.get("name") or "") | |
| for author in authors | |
| if isinstance(author, dict) and author.get("name") | |
| ] | |
| if parsed: | |
| return parsed | |
| summary = str(publication_info.get("summary") or "") | |
| if not summary: | |
| return [] | |
| author_part = summary.split(" - ", 1)[0] | |
| return [name.strip() for name in author_part.split(",") if name.strip()] |
| @staticmethod | ||
| def _parse_year(text: str) -> int | None: | ||
| for token in str(text).replace(",", " ").split(): | ||
| if token.isdigit() and len(token) == 4: | ||
| year = int(token) | ||
| if 1000 <= year <= 3000: | ||
| return year | ||
| return None |
There was a problem hiding this comment.
In _parse_year, iterating through the tokens from left to right can incorrectly extract a 4-digit number that is not a year (e.g., "A Author - Journal of 1200 Cases, 2024" would return 1200 instead of 2024). Since publication years are almost always at the end of the summary/citation string, iterating through the tokens in reverse order (from right to left) is a much more robust heuristic.
| @staticmethod | |
| def _parse_year(text: str) -> int | None: | |
| for token in str(text).replace(",", " ").split(): | |
| if token.isdigit() and len(token) == 4: | |
| year = int(token) | |
| if 1000 <= year <= 3000: | |
| return year | |
| return None | |
| @staticmethod | |
| def _parse_year(text: str) -> int | None: | |
| for token in reversed(str(text).replace(",", " ").split()): | |
| if token.isdigit() and len(token) == 4: | |
| year = int(token) | |
| if 1000 <= year <= 3000: | |
| return year | |
| return None |
| def _extract_query_instructions(queries: Any, total: int) -> list[str | None]: | ||
| for column_name in INSTRUCTION_COLUMNS: | ||
| values = _get_query_column_values(queries, column_name, total) | ||
| if any(value not in (None, "") for value in values): | ||
| return [ | ||
| str(value).strip() if value not in (None, "") else None | ||
| for value in values | ||
| ] | ||
| return [None] * total |
There was a problem hiding this comment.
In _extract_query_instructions, checking value not in (None, "") is insufficient because:
- If a value is a float
NaN(which is common in datasets loaded via pandas/mteb),value not in (None, "")evaluates toTrue, resulting in"nan"being sent as an instruction. - If a value is a whitespace-only string (e.g.,
" "), it will be stripped to""but still treated as a valid instruction.
We should implement a robust helper function to check for empty/null/NaN values and use it here.
| def _extract_query_instructions(queries: Any, total: int) -> list[str | None]: | |
| for column_name in INSTRUCTION_COLUMNS: | |
| values = _get_query_column_values(queries, column_name, total) | |
| if any(value not in (None, "") for value in values): | |
| return [ | |
| str(value).strip() if value not in (None, "") else None | |
| for value in values | |
| ] | |
| return [None] * total | |
| def _extract_query_instructions(queries: Any, total: int) -> list[str | None]: | |
| def is_empty(v: Any) -> bool: | |
| if v in (None, ""): | |
| return True | |
| if isinstance(v, str) and not v.strip(): | |
| return True | |
| if isinstance(v, float) and v != v: | |
| return True | |
| return False | |
| for column_name in INSTRUCTION_COLUMNS: | |
| values = _get_query_column_values(queries, column_name, total) | |
| if any(not is_empty(value) for value in values): | |
| return [ | |
| str(value).strip() if not is_empty(value) else None | |
| for value in values | |
| ] | |
| return [None] * total |
No description provided.