diff --git a/core/utils/sanitize.py b/core/utils/sanitize.py index c23913b47..ebaa4f774 100644 --- a/core/utils/sanitize.py +++ b/core/utils/sanitize.py @@ -1,9 +1,16 @@ +import json + + def sanitize_for_json(obj): - """Recursively sanitize data to remove Unicode surrogate characters. + """Recursively sanitize data to make it JSON-serializable. - Surrogate characters (U+D800-U+DFFF) are invalid in JSON and rejected by - PostgreSQL. They can appear in file paths read from filesystems using - Python's 'surrogateescape' error handler. + Handles: + - Unicode surrogate characters (U+D800-U+DFFF), which are invalid in JSON + and rejected by PostgreSQL. They can appear in file paths read from + filesystems using Python's 'surrogateescape' error handler. + - Django lazy translation objects (``__proxy__``) and any other + non-JSON-serializable types, which are converted to their string + representation. """ if isinstance(obj, str): # Encode using surrogateescape to recover original bytes from surrogates, @@ -17,4 +24,12 @@ def sanitize_for_json(obj): return {sanitize_for_json(k): sanitize_for_json(v) for k, v in obj.items()} if isinstance(obj, (list, tuple)): return [sanitize_for_json(item) for item in obj] - return obj + # For any other type, test JSON serializability directly. If the object + # serializes fine (int, float, bool, None, …) return it unchanged. + # Otherwise convert to string and re-sanitize to also catch surrogate code + # points that __str__() might produce (e.g. Django lazy __proxy__). + try: + json.dumps(obj) + return obj + except (TypeError, ValueError): + return sanitize_for_json(str(obj)) diff --git a/proc/tests.py b/proc/tests.py index d0af74de6..49ab0d4d4 100644 --- a/proc/tests.py +++ b/proc/tests.py @@ -97,6 +97,49 @@ def test_high_surrogate_handled(self): json.dumps(result) # Must not raise self.assertNotIn("\ud800", result) + def test_django_lazy_proxy_converted_to_string(self): + """Django lazy translation objects (__proxy__) must be converted to str.""" + from django.utils.translation import gettext_lazy as _ + + lazy_text = _("Select journals by collection") + result = sanitize_for_json(lazy_text) + self.assertIsInstance(result, str) + self.assertEqual(result, str(lazy_text)) + json.dumps(result) # Must not raise + + def test_list_with_lazy_proxy_converted(self): + """A list containing a lazy proxy object must be fully serializable.""" + from django.utils.translation import gettext_lazy as _ + + data = [_("Select journals by collection"), "normal string", 42] + result = sanitize_for_json(data) + json_str = json.dumps(result) # Must not raise + parsed = json.loads(json_str) + self.assertIsInstance(parsed[0], str) + self.assertEqual(parsed[1], "normal string") + self.assertEqual(parsed[2], 42) + + def test_dict_with_lazy_proxy_value_converted(self): + """A dict containing a lazy proxy value must be fully serializable.""" + from django.utils.translation import gettext_lazy as _ + + data = {"events": [_("Select journals by collection")], "count": 1} + result = sanitize_for_json(data) + json_str = json.dumps(result) # Must not raise + parsed = json.loads(json_str) + self.assertIsInstance(parsed["events"][0], str) + self.assertEqual(parsed["count"], 1) + + def test_unknown_object_converted_to_string(self): + """Any unknown non-JSON-serializable object is converted to its str repr.""" + + class CustomObj: + def __str__(self): + return "custom" + + result = sanitize_for_json(CustomObj()) + self.assertEqual(result, "custom") + json.dumps(result) # Must not raise + + -if __name__ == "__main__": - unittest.main()