diff --git a/docs/changelog.rst b/docs/changelog.rst index 0d8ce671..5a4fffd3 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -13,7 +13,8 @@ NEXT RELEASE TBD ------------------- -* Minor nitpick to remove unused line (`#559`_ by `@KyleKing`_) +* Minor nitpick to remove unused line (`#562`_ by `@KyleKing`_) +* Utilize openpyxl for xlsx after xlrd dropped support (`#559`_ by `@KyleKing`_) 2.0.0 ------------------- diff --git a/pyproject.toml b/pyproject.toml index 3509715a..182089f3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -22,7 +22,8 @@ dependencies = [ "extract-msg>=0.40.0", "pdfminer.six>=20221105", "python-pptx>=0.6.18", - "xlrd>=1.2.0,<2.0.0", # v2 drops support for xlsx. Requires openpyxl or alternative to support xlsx if upgrading xlrd. See: https://github.com/deanmalmgren/textract/pull/543#issuecomment-2684619988 + "openpyxl>=3.0.0", # Replaces xlrd for Excel file support (xlrd 2.0+ dropped xlsx support) + "xlrd>=2.0.0", # Supports legacy .xls files (openpyxl does not support .xls) ] [project.optional-dependencies] diff --git a/textract/parsers/xls_parser.py b/textract/parsers/xls_parser.py index e73df756..484559d8 100644 --- a/textract/parsers/xls_parser.py +++ b/textract/parsers/xls_parser.py @@ -1,3 +1,28 @@ -from .xlsx_parser import Parser +import xlrd -__all__ = ["Parser"] +from .utils import BaseParser + + +class Parser(BaseParser): + """Extract text from legacy Excel files (.xls).""" + + def extract(self, filename, **kwargs): + workbook = xlrd.open_workbook(filename) + sheets_name = workbook.sheet_names() + output = "\n" + for names in sheets_name: + worksheet = workbook.sheet_by_name(names) + num_rows = worksheet.nrows + num_cells = worksheet.ncols + + for curr_row in range(num_rows): + new_output = [] + for index_col in range(num_cells): + value = worksheet.cell_value(curr_row, index_col) + if value: + if isinstance(value, (int, float)): + value = str(value) + new_output.append(value) + if new_output: + output += " ".join(new_output) + "\n" + return output diff --git a/textract/parsers/xlsx_parser.py b/textract/parsers/xlsx_parser.py index ccbf3115..e60066ac 100644 --- a/textract/parsers/xlsx_parser.py +++ b/textract/parsers/xlsx_parser.py @@ -1,28 +1,31 @@ -import xlrd +import openpyxl from .utils import BaseParser class Parser(BaseParser): - """Extract text from Excel files (.xls/xlsx).""" + """Extract text from Excel files (.xlsx).""" def extract(self, filename, **kwargs): - workbook = xlrd.open_workbook(filename) - sheets_name = workbook.sheet_names() + workbook = openpyxl.load_workbook(filename, data_only=True) output = "\n" - for names in sheets_name: - worksheet = workbook.sheet_by_name(names) - num_rows = worksheet.nrows - num_cells = worksheet.ncols + for sheet_name in workbook.sheetnames: + worksheet = workbook[sheet_name] + + for row in worksheet.iter_rows(values_only=True): + non_empty_values = [] + for value in row: + if value is None or value is False: + continue + if isinstance(value, bool): + value = "1" + elif isinstance(value, (int, float)): + # Convert to float to preserve decimal format (e.g., 83 -> 83.0) + value = str(float(value)) + else: + value = str(value) + non_empty_values.append(value) + if non_empty_values: + output += " ".join(non_empty_values) + "\n" - for curr_row in range(num_rows): - new_output = [] - for index_col in range(num_cells): - value = worksheet.cell_value(curr_row, index_col) - if value: - if isinstance(value, (int, float)): - value = str(value) - new_output.append(value) - if new_output: - output += " ".join(new_output) + "\n" return output diff --git a/uv.lock b/uv.lock index 8336f51c..6530e5fe 100644 --- a/uv.lock +++ b/uv.lock @@ -545,6 +545,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/0d/2f/633031205333bee5f9f93761af8268746aa75f38754823aabb8570eb245b/ebcdic-1.1.1-py2.py3-none-any.whl", hash = "sha256:33b4cb729bc2d0bf46cc1847b0e5946897cb8d3f53520c5b9aa5fa98d7e735f1", size = 128537, upload-time = "2019-08-09T00:54:35.544Z" }, ] +[[package]] +name = "et-xmlfile" +version = "2.0.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/d3/38/af70d7ab1ae9d4da450eeec1fa3918940a5fafb9055e934af8d6eb0c2313/et_xmlfile-2.0.0.tar.gz", hash = "sha256:dab3f4764309081ce75662649be815c4c9081e88f0837825f90fd28317d4da54", size = 17234, upload-time = "2024-10-25T17:25:40.039Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c1/8b/5fe2cc11fee489817272089c4203e679c63b570a5aaeb18d852ae3cbba6a/et_xmlfile-2.0.0-py3-none-any.whl", hash = "sha256:7a91720bc756843502c3b7504c77b8fe44217c85c537d85037f0f536151b2caa", size = 18059, upload-time = "2024-10-25T17:25:39.051Z" }, +] + [[package]] name = "exceptiongroup" version = "1.3.1" @@ -972,6 +981,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/ac/ff/05257b7183279b80ecec6333744de23f48f0faeeba46c93e6d13ce835515/oletools-0.60.2-py2.py3-none-any.whl", hash = "sha256:72ad8bd748fd0c4e7b5b4733af770d11543ebb2bf2697455f99f975fcd50cc96", size = 989449, upload-time = "2024-07-02T14:50:29.122Z" }, ] +[[package]] +name = "openpyxl" +version = "3.1.5" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "et-xmlfile" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/3d/f9/88d94a75de065ea32619465d2f77b29a0469500e99012523b91cc4141cd1/openpyxl-3.1.5.tar.gz", hash = "sha256:cf0e3cf56142039133628b5acffe8ef0c12bc902d2aadd3e0fe5878dc08d1050", size = 186464, upload-time = "2024-06-28T14:03:44.161Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c0/da/977ded879c29cbd04de313843e76868e6e13408a94ed6b987245dc7c8506/openpyxl-3.1.5-py2.py3-none-any.whl", hash = "sha256:5282c12b107bffeef825f4617dc029afaf41d0ea60823bbb665ef3079dc79de2", size = 250910, upload-time = "2024-06-28T14:03:41.161Z" }, +] + [[package]] name = "packaging" version = "26.0" @@ -1832,6 +1853,7 @@ dependencies = [ { name = "docx2txt" }, { name = "extract-msg" }, { name = "lxml" }, + { name = "openpyxl" }, { name = "pdfminer-six", version = "20251107", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, { name = "pdfminer-six", version = "20260107", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, { name = "pillow", version = "11.3.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, @@ -1876,6 +1898,7 @@ requires-dist = [ { name = "docx2txt", specifier = ">=0.8" }, { name = "extract-msg", specifier = ">=0.40.0" }, { name = "lxml", specifier = ">=4.9.0" }, + { name = "openpyxl", specifier = ">=3.0.0" }, { name = "pdfminer-six", specifier = ">=20221105" }, { name = "pillow", specifier = ">=9.0.0" }, { name = "pocketsphinx", marker = "extra == 'pocketsphinx'", specifier = ">=0.1.15" }, @@ -1884,7 +1907,7 @@ requires-dist = [ { name = "sphinx", marker = "extra == 'docs'", specifier = ">=7.0" }, { name = "sphinx-argparse", marker = "extra == 'docs'", specifier = ">=0.4" }, { name = "sphinx-rtd-theme", marker = "extra == 'docs'", specifier = ">=2.0" }, - { name = "xlrd", specifier = ">=1.2.0,<2.0.0" }, + { name = "xlrd", specifier = ">=2.0.0" }, ] provides-extras = ["docs", "pocketsphinx"] @@ -2001,11 +2024,11 @@ sdist = { url = "https://files.pythonhosted.org/packages/89/8d/7aad74930380c8972 [[package]] name = "xlrd" -version = "1.2.0" +version = "2.0.2" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/aa/05/ec9d4fcbbb74bbf4da9f622b3b61aec541e4eccf31d3c60c5422ec027ce2/xlrd-1.2.0.tar.gz", hash = "sha256:546eb36cee8db40c3eaa46c351e67ffee6eeb5fa2650b71bc4c758a29a1b29b2", size = 554079, upload-time = "2018-12-15T17:47:48.133Z" } +sdist = { url = "https://files.pythonhosted.org/packages/07/5a/377161c2d3538d1990d7af382c79f3b2372e880b65de21b01b1a2b78691e/xlrd-2.0.2.tar.gz", hash = "sha256:08b5e25de58f21ce71dc7db3b3b8106c1fa776f3024c54e45b45b374e89234c9", size = 100167, upload-time = "2025-06-14T08:46:39.039Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/b0/16/63576a1a001752e34bf8ea62e367997530dc553b689356b9879339cf45a4/xlrd-1.2.0-py2.py3-none-any.whl", hash = "sha256:e551fb498759fa3a5384a94ccd4c3c02eb7c00ea424426e212ac0c57be9dfbde", size = 103251, upload-time = "2018-12-15T17:47:45.792Z" }, + { url = "https://files.pythonhosted.org/packages/1a/62/c8d562e7766786ba6587d09c5a8ba9f718ed3fa8af7f4553e8f91c36f302/xlrd-2.0.2-py2.py3-none-any.whl", hash = "sha256:ea762c3d29f4cca48d82df517b6d89fbce4db3107f9d78713e48cd321d5c9aa9", size = 96555, upload-time = "2025-06-14T08:46:37.766Z" }, ] [[package]]