Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 20 additions & 5 deletions core/utils/sanitize.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,16 @@
import json


def sanitize_for_json(obj):
"""Recursively sanitize data to remove Unicode surrogate characters.
"""Recursively sanitize data to make it JSON-serializable.

Surrogate characters (U+D800-U+DFFF) are invalid in JSON and rejected by
PostgreSQL. They can appear in file paths read from filesystems using
Python's 'surrogateescape' error handler.
Handles:
- Unicode surrogate characters (U+D800-U+DFFF), which are invalid in JSON
and rejected by PostgreSQL. They can appear in file paths read from
filesystems using Python's 'surrogateescape' error handler.
- Django lazy translation objects (``__proxy__``) and any other
non-JSON-serializable types, which are converted to their string
representation.
"""
if isinstance(obj, str):
# Encode using surrogateescape to recover original bytes from surrogates,
Expand All @@ -17,4 +24,12 @@ def sanitize_for_json(obj):
return {sanitize_for_json(k): sanitize_for_json(v) for k, v in obj.items()}
if isinstance(obj, (list, tuple)):
return [sanitize_for_json(item) for item in obj]
return obj
# For any other type, test JSON serializability directly. If the object
# serializes fine (int, float, bool, None, …) return it unchanged.
# Otherwise convert to string and re-sanitize to also catch surrogate code
# points that __str__() might produce (e.g. Django lazy __proxy__).
try:
json.dumps(obj)
return obj
except (TypeError, ValueError):
return sanitize_for_json(str(obj))
47 changes: 45 additions & 2 deletions proc/tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,49 @@ def test_high_surrogate_handled(self):
json.dumps(result) # Must not raise
self.assertNotIn("\ud800", result)

def test_django_lazy_proxy_converted_to_string(self):
    """A lazy translation proxy is sanitized into a plain, JSON-safe ``str``."""
    from django.utils.translation import gettext_lazy

    proxy = gettext_lazy("Select journals by collection")
    sanitized = sanitize_for_json(proxy)
    # The proxy must come back as a real string carrying the same text
    # that str() produces for it.
    self.assertIsInstance(sanitized, str)
    self.assertEqual(sanitized, str(proxy))
    # Serialization is the actual contract: dumps() raises on failure.
    json.dumps(sanitized)

def test_list_with_lazy_proxy_converted(self):
"""A list containing a lazy proxy object must be fully serializable."""
from django.utils.translation import gettext_lazy as _

Comment on lines +101 to +113
Copy link

Copilot AI Apr 26, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These new tests import Django (gettext_lazy) but the file still exposes a __main__ runner (python proc/tests.py). Running it directly will now fail unless Django settings are configured. Either remove the __main__ block or configure DJANGO_SETTINGS_MODULE/django.setup() there to keep the standalone runner functional.

Copilot uses AI. Check for mistakes.
data = [_("Select journals by collection"), "normal string", 42]
result = sanitize_for_json(data)
json_str = json.dumps(result) # Must not raise
parsed = json.loads(json_str)
self.assertIsInstance(parsed[0], str)
self.assertEqual(parsed[1], "normal string")
self.assertEqual(parsed[2], 42)

def test_dict_with_lazy_proxy_value_converted(self):
    """A dict holding a lazy proxy inside a nested list serializes cleanly."""
    from django.utils.translation import gettext_lazy

    payload = {
        "events": [gettext_lazy("Select journals by collection")],
        "count": 1,
    }
    sanitized = sanitize_for_json(payload)
    # Round-trip through JSON; dumps() raises if anything is unserializable.
    encoded = json.dumps(sanitized)
    round_tripped = json.loads(encoded)
    self.assertIsInstance(round_tripped["events"][0], str)
    self.assertEqual(round_tripped["count"], 1)

def test_unknown_object_converted_to_string(self):
    """An arbitrary non-JSON-serializable object is replaced by ``str(obj)``."""

    class Opaque:
        # json cannot encode this type; only __str__ is defined.
        def __str__(self):
            return "custom"

    sanitized = sanitize_for_json(Opaque())
    self.assertEqual(sanitized, "custom")
    # The sanitized value must be accepted by the JSON encoder.
    json.dumps(sanitized)



# Allow running this test module directly as a script via unittest's runner.
# NOTE(review): several tests above import Django's gettext_lazy, so a direct
# run presumably needs Django settings configured first — confirm.
if __name__ == "__main__":
unittest.main()