diff --git a/cove/cove_360/fixtures/badfile_too_many_rows.xlsx b/cove/cove_360/fixtures/badfile_too_many_rows.xlsx new file mode 100644 index 00000000..00806a60 Binary files /dev/null and b/cove/cove_360/fixtures/badfile_too_many_rows.xlsx differ diff --git a/cove/cove_360/fixtures/badfile_too_many_rows_multiple_sheets.xlsx b/cove/cove_360/fixtures/badfile_too_many_rows_multiple_sheets.xlsx new file mode 100644 index 00000000..b69d898d Binary files /dev/null and b/cove/cove_360/fixtures/badfile_too_many_rows_multiple_sheets.xlsx differ diff --git a/cove/cove_360/tests/test_browser.py b/cove/cove_360/tests/test_browser.py index cdf37448..9b6506ea 100644 --- a/cove/cove_360/tests/test_browser.py +++ b/cove/cove_360/tests/test_browser.py @@ -150,31 +150,17 @@ def server_url(request, live_server): 'bad currency', ], True), ('badfile_all_validation_errors_4_times.xlsx', [ - 'Description is missing but required (more info about this error)', - 'id is missing but required within recipientOrganization (more info about this error)', - 'Date is not in the correct format (more info about this error)', - 'Amount Awarded is not a number. Check that the value is not null, and doesn’t contain any characters other than 0-9 and dot (.). Number values should not be in quotes.', - 'Invalid \'uri\' found (more info about this error)', - 'Invalid code found in Currency (more info about this error)', - '[] is too short. You must supply at least one value, or remove the item entirely (unless it’s required).', - # Context dates should be ISO formatted - '2019-06-01T00:00:00+00:00', - 'bad date 1', - 'bad date 2', - 'bad date 3', - 'bad date 4', - 'This should be a number', - 'This should be a uri 1', - 'This should be a uri 2', - # 'This should be a uri 3', - 'This should be a uri 5', - 'This should be a uri 6', - # 'This should be a uri 7', - 'bad currency 1', - 'bad currency 2', - 'bad currency 3', - 'bad currency 4', + '', ], True), + ('badfile_too_many_rows.xlsx', [ + 'This XLSX workbook has a worksheet (grants) with 50001 rows ' + 'but the maximum number of rows supported by this tool is 50000' + ], False), + ('badfile_too_many_rows_multiple_sheets.xlsx', [ + 'This XLSX workbook has worksheets with a larger number of rows ' + 'than is supported by this tool (50000). Worksheets with too many rows: ' + '\'grants\' (50001 rows), \'grants_2\' (50002 rows)' + ], False), ("dei_extension.xlsx", [ "do not use the 360Giving Data Standard codelists correctly.", ], True), diff --git a/cove/cove_360/views.py b/cove/cove_360/views.py index 11ada8aa..c65a0b89 100644 --- a/cove/cove_360/views.py +++ b/cove/cove_360/views.py @@ -5,6 +5,7 @@ import logging import re import os +import zipfile from decimal import Decimal from cove.views import explore_data_context, cove_web_input_error @@ -17,6 +18,8 @@ from django.utils.translation import gettext_lazy as _ from django.core.cache import cache +import openpyxl + from libcove.config import LibCoveConfig from libcove.lib.converters import convert_spreadsheet, convert_json from libcove.lib.exceptions import CoveInputDataError @@ -145,6 +148,40 @@ def explore_360(request, pk, template='cove_360/explore.html'): lib_cove_config=lib_cove_config)) else: + if file_type == "xlsx": + + # Check for an excessive number of rows before passing to flattentool. + excessive_sheets = {} + try: + workbook = openpyxl.reader.excel.load_workbook(file_name, read_only=True) + excessive_sheets = { + sheetname: workbook[sheetname].max_row + for sheetname in workbook.sheetnames + if workbook[sheetname].max_row > settings.MAX_XLSX_ROWS + } + + except (zipfile.BadZipFile, openpyxl.utils.exceptions.InvalidFileException): + # Exceptions associated with invalid spreadsheets are passed through for cove to handle. + pass + + if len(excessive_sheets) == 1: + sheetname = next(iter(excessive_sheets)) + wrapped_err = f"This XLSX workbook has a worksheet ({sheetname}) with " \ + f"{excessive_sheets[sheetname]} rows " \ + f"but the maximum number of rows supported by this tool is {settings.MAX_XLSX_ROWS}" + raise CoveInputDataError(wrapped_err=wrapped_err) + elif len(excessive_sheets) > 1: + wrapped_err = "This XLSX workbook has worksheets with a larger number of rows " \ + f"than is supported by this tool ({settings.MAX_XLSX_ROWS}). " \ + "Worksheets with too many rows: " + wrapped_err += ", ".join( + [ + f"'{sheetname}' ({num_rows} rows)" + for sheetname, num_rows in excessive_sheets.items() + ] + ) + raise CoveInputDataError(wrapped_err=wrapped_err) + # Convert spreadsheet to json context.update(convert_spreadsheet(upload_dir, upload_url, file_name, file_type, lib_cove_config, schema_360.schema_file, schema_360.pkg_schema_file)) diff --git a/cove/cove_project/settings.py b/cove/cove_project/settings.py index 0057c34f..60c079b0 100644 --- a/cove/cove_project/settings.py +++ b/cove/cove_project/settings.py @@ -12,6 +12,7 @@ MEDIA_URL=(str, "/media/"), STATIC_ROOT=(str, os.path.join(BASE_DIR, "static")), STATIC_URL=(str, "/static/"), + MAX_XLSX_ROWS=(int, 50000), ) # We use the setting to choose whether to show the section about Sentry in the @@ -35,6 +36,7 @@ SECRET_KEY = settings.SECRET_KEY DEBUG = settings.DEBUG ALLOWED_HOSTS = settings.ALLOWED_HOSTS +MAX_XLSX_ROWS = env("MAX_XLSX_ROWS") MIDDLEWARE = ( 'django.contrib.sessions.middleware.SessionMiddleware',