From 7b0a63101ab08b86133231ae29c2ba095f6da894 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Fri, 6 Mar 2026 10:27:13 +0000
Subject: [PATCH 1/3] Initial plan


From f3614dbb4d948e4838203132f1e32536b97608e5 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Fri, 6 Mar 2026 10:36:35 +0000
Subject: [PATCH 2/3] Initial plan for XMLCrossmarkPipe implementation

Co-authored-by: robertatakenaka <505143+robertatakenaka@users.noreply.github.com>
---
 src/thriftpywrap | 1 +
 src/xylose       | 1 +
 2 files changed, 2 insertions(+)
 create mode 160000 src/thriftpywrap
 create mode 160000 src/xylose

diff --git a/src/thriftpywrap b/src/thriftpywrap
new file mode 160000
index 0000000..c3fe4f8
--- /dev/null
+++ b/src/thriftpywrap
@@ -0,0 +1 @@
+Subproject commit c3fe4f84acc7eee21f27f573df973cba0d05f58a
diff --git a/src/xylose b/src/xylose
new file mode 160000
index 0000000..2d63ae9
--- /dev/null
+++ b/src/xylose
@@ -0,0 +1 @@
+Subproject commit 2d63ae91840f28477a23e7d215e71b8b9b916c03

From f23a52971a8b9484de2b0a5606fad61bd2d6ec67 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Fri, 6 Mar 2026 10:45:40 +0000
Subject: [PATCH 3/3] Add XMLCrossmarkPipe for CrossRef XML generation with
 crossmark support

Co-authored-by: robertatakenaka <505143+robertatakenaka@users.noreply.github.com>
---
 articlemeta/export.py          |   1 +
 articlemeta/export_crossref.py | 146 ++++++++++++++++
 tests/test_export_crossref.py  | 291 +++++++++++++++++++++++++++++++-
 3 files changed, 437 insertions(+), 1 deletion(-)

diff --git a/articlemeta/export.py b/articlemeta/export.py
index 7d254ee..f3b2971 100644
--- a/articlemeta/export.py
+++ b/articlemeta/export.py
@@ -209,6 +209,7 @@ def pipeline_crossref(self):
             export_crossref.XMLPIDPipe(),
             export_crossref.XMLElocationPipe(),
             export_crossref.XMLPermissionsPipe(),
+            export_crossref.XMLCrossmarkPipe(),
             export_crossref.XMLProgramRelatedItemPipe(),
             export_crossref.XMLDOIDataPipe(),
             export_crossref.XMLDOIPipe(),
diff --git a/articlemeta/export_crossref.py b/articlemeta/export_crossref.py
index e63883f..06ed589 100644
--- a/articlemeta/export_crossref.py
+++ b/articlemeta/export_crossref.py
@@ -13,6 +13,12 @@ SUPPLBEG_REGEX = re.compile(r'^0 ')
 SUPPLEND_REGEX = re.compile(r' 0$')


+CROSSMARK_UPDATE_TYPES = frozenset([
+    'addendum', 'clarification', 'correction', 'corrigendum', 'erratum',
+    'expression_of_concern', 'new_edition', 'new_version', 'partial_retraction',
+    'removal', 'retraction', 'withdrawal',
+])
+

 class SetupDoiBatchPipe(plumber.Pipe):

@@ -1202,6 +1208,146 @@ def _transform_translations(self, data):
         return data


+class XMLCrossmarkPipe(plumber.Pipe):
+    """Adds the <crossmark> element to each journal_article element.
+
+    The crossmark_policy value is read from the CROSSMARK_POLICY environment
+    variable. When the variable is empty or absent the pipe is skipped.
+
+    Optional updates (corrections, retractions, etc.) are read from the
+    ``related_articles`` list stored at the top level of the raw article JSON.
+    Each item in that list is expected to be a dict with at least the keys
+    ``type`` (one of the 12 Crossref update types) and ``doi``. An optional
+    ``date`` key (YYYY, YYYY-MM or YYYY-MM-DD, with ``-`` or ``/`` as the
+    separator, or the compact forms YYYYMM / YYYYMMDD) is used to populate
+    the ``<date>`` child element.
+
+    NOTE(review): the Crossref deposit schema appears to model ``<update>``
+    as ``<update type="..." date="YYYY-MM-DD">DOI</update>`` rather than
+    with ``<doi>``/``<date>`` children -- confirm against the schema before
+    depositing.
+    """
+
+    # "-" or "/" separated dates; month and day may lack zero padding.
+    _DATE_SEPARATED = re.compile(
+        r'^(\d{4})(?:[-/](\d{1,2})(?:[-/](\d{1,2}))?)?$')
+    # Compact dates: YYYY, YYYYMM or YYYYMMDD.
+    _DATE_COMPACT = re.compile(r'^(\d{4})(?:(\d{2})(\d{2})?)?$')
+
+    def precond(data):
+        # Without a configured policy there is nothing valid to deposit.
+        if not os.environ.get('CROSSMARK_POLICY', ''):
+            raise plumber.UnmetPrecondition()
+
+    @staticmethod
+    def _build_date_element(date_str):
+        """Return a ``<date>`` element for *date_str* or *None*.
+
+        *None* is returned when the value is empty or is not a plausible
+        date, so malformed input never leaks into the deposit. Children are
+        emitted in the order month, day, year; month and day are zero-padded
+        and only present when the input provides them.
+        """
+        if not date_str:
+            return None
+        text = date_str.strip()
+        match = (XMLCrossmarkPipe._DATE_SEPARATED.match(text) or
+                 XMLCrossmarkPipe._DATE_COMPACT.match(text))
+        if match is None:
+            return None
+        year, month, day = match.groups()
+        if month is not None and not 1 <= int(month) <= 12:
+            return None
+        if day is not None and not 1 <= int(day) <= 31:
+            return None
+
+        date_el = ET.Element('date')
+        date_el.set('media_type', 'online')
+        for tag, value in (('month', month), ('day', day)):
+            if value is not None:
+                part_el = ET.Element(tag)
+                part_el.text = '%02d' % int(value)
+                date_el.append(part_el)
+        year_el = ET.Element('year')
+        year_el.text = year
+        date_el.append(year_el)
+        return date_el
+
+    @staticmethod
+    def _build_updates_element(related_articles):
+        """Return an ``<updates>`` element populated from *related_articles*.
+
+        Items that are not dicts, whose ``type`` is not one of the 12
+        recognised Crossref update types, or that are missing a ``doi``, are
+        silently ignored.
+        """
+        updates_el = ET.Element('updates')
+        for item in related_articles:
+            if not isinstance(item, dict):
+                continue
+            update_type = item.get('type', '')
+            update_doi = item.get('doi', '')
+            if not update_doi or update_type not in CROSSMARK_UPDATE_TYPES:
+                continue
+            update_el = ET.Element('update')
+            update_el.set('type', update_type)
+            doi_el = ET.Element('doi')
+            doi_el.text = update_doi
+            update_el.append(doi_el)
+            date_el = XMLCrossmarkPipe._build_date_element(item.get('date', ''))
+            if date_el is not None:
+                update_el.append(date_el)
+            updates_el.append(update_el)
+        return updates_el
+
+    @plumber.precondition(precond)
+    def transform(self, data):
+        raw, xml = data
+
+        policy = os.environ.get('CROSSMARK_POLICY', '')
+
+        crossmark = ET.Element('crossmark')
+
+        version_el = ET.Element('crossmark_version')
+        version_el.text = '1'
+        crossmark.append(version_el)
+
+        policy_el = ET.Element('crossmark_policy')
+        policy_el.text = policy
+        crossmark.append(policy_el)
+
+        scielo_domain = getattr(raw, 'scielo_domain', None)
+        if scielo_domain:
+            domains_el = ET.Element('crossmark_domains')
+            domain_el = ET.Element('crossmark_domain')
+            d_el = ET.Element('domain')
+            d_el.text = scielo_domain
+            domain_el.append(d_el)
+            domains_el.append(domain_el)
+            crossmark.append(domains_el)
+
+        exclusive_el = ET.Element('crossmark_domain_exclusive')
+        exclusive_el.text = 'true'
+        crossmark.append(exclusive_el)
+
+        # ``related_articles`` is optional; a missing or non-dict payload
+        # simply means there are no updates to report.
+        raw_data = getattr(raw, 'data', None)
+        related_articles = []
+        if isinstance(raw_data, dict):
+            related_articles = raw_data.get('related_articles') or []
+
+        if related_articles:
+            updates_el = self._build_updates_element(related_articles)
+            if len(updates_el):
+                crossmark.append(updates_el)
+
+        for journal_article in xml.findall('./body/journal//journal_article'):
+            journal_article.append(deepcopy(crossmark))
+
+        return data
+
+
 class XMLFundingDataPipe(plumber.Pipe):
     def precond(data):
         raw, _ = data
diff --git a/tests/test_export_crossref.py b/tests/test_export_crossref.py
index b44a1bf..878f154 100644
--- a/tests/test_export_crossref.py
+++ b/tests/test_export_crossref.py
@@ -2149,4 +2149,293 @@ def test_funding_data_element(self):

         publisher_item = xml.xpath(".//journal_article/publisher_item")[-1]
         self.assertEqual(publisher_item.getnext().find("*").tag, "{http://www.crossref.org/fundref.xsd}assertion")
-        self.assertIn("fr:program", ET.tostring(xml).decode("utf-8"))
\ No newline at end of file
+        self.assertIn("fr:program", ET.tostring(xml).decode("utf-8"))
+
+class ExportCrossRef_XMLCrossmarkPipe_Tests(unittest.TestCase):
+    """Tests for the XMLCrossmarkPipe."""
+
+    def _make_xml_with_publisher_item(self):
+        xml = ET.fromstring(
+            '<doi_batch>'
+            '<body>'
+            '<journal>'
+            '<journal_article>'
+            '<publisher_item>'
+            '<identifier id_type="pii">S0034-89102010000400007</identifier>'
+            '</publisher_item>'
+            '</journal_article>'
+            '</journal>'
+            '</body>'
+            '</doi_batch>'
+        )
+        return xml
+
+    def _make_article(self, extra_data=None):
+        raw_data = {
+            'article': {
+                'v880': [{'_': 'S0034-89102010000400007'}],
+                'v40': [{'_': 'pt'}],
+            },
+            'title': {},
+            'issue': {'issue': {}},
+        }
+        if extra_data:
+            raw_data.update(extra_data)
+        return Article(raw_data)
+
+    def test_crossmark_not_added_when_no_policy_env_var(self):
+        """When CROSSMARK_POLICY is not set, pipe is skipped."""
+        os.environ.pop('CROSSMARK_POLICY', None)
+        xml = self._make_xml_with_publisher_item()
+        article = self._make_article()
+
+        pipe = export_crossref.XMLCrossmarkPipe()
+        raw, result_xml = pipe.transform([article, xml])
+
+        self.assertEqual([], result_xml.findall('.//crossmark'))
+
+    def test_crossmark_added_when_policy_env_var_set(self):
+        """When CROSSMARK_POLICY is set, crossmark element is added."""
+        os.environ['CROSSMARK_POLICY'] = 'https://www.scielo.br/crossmark-policy'
+        try:
+            xml = self._make_xml_with_publisher_item()
+            article = self._make_article()
+
+            pipe = export_crossref.XMLCrossmarkPipe()
+            raw, result_xml = pipe.transform([article, xml])
+
+            crossmark_nodes = result_xml.findall('.//crossmark')
+            self.assertEqual(1, len(crossmark_nodes))
+        finally:
+            os.environ.pop('CROSSMARK_POLICY', None)
+
+    def test_crossmark_version_is_one(self):
+        """crossmark_version element has text '1'."""
+        os.environ['CROSSMARK_POLICY'] = 'https://www.scielo.br/crossmark-policy'
+        try:
+            xml = self._make_xml_with_publisher_item()
+            article = self._make_article()
+
+            pipe = export_crossref.XMLCrossmarkPipe()
+            raw, result_xml = pipe.transform([article, xml])
+
+            version = result_xml.find('.//crossmark/crossmark_version')
+            self.assertIsNotNone(version)
+            self.assertEqual('1', version.text)
+        finally:
+            os.environ.pop('CROSSMARK_POLICY', None)
+
+    def test_crossmark_policy_contains_env_value(self):
+        """crossmark_policy element reflects CROSSMARK_POLICY env var."""
+        policy_url = 'https://www.scielo.br/crossmark-policy'
+        os.environ['CROSSMARK_POLICY'] = policy_url
+        try:
+            xml = self._make_xml_with_publisher_item()
+            article = self._make_article()
+
+            pipe = export_crossref.XMLCrossmarkPipe()
+            raw, result_xml = pipe.transform([article, xml])
+
+            policy = result_xml.find('.//crossmark/crossmark_policy')
+            self.assertIsNotNone(policy)
+            self.assertEqual(policy_url, policy.text)
+        finally:
+            os.environ.pop('CROSSMARK_POLICY', None)
+
+    def test_crossmark_domains_from_scielo_domain(self):
+        """crossmark_domains element is populated from raw.scielo_domain."""
+        os.environ['CROSSMARK_POLICY'] = 'https://www.scielo.br/crossmark-policy'
+        try:
+            xml = self._make_xml_with_publisher_item()
+            raw_data = {
+                'article': {
+                    'v880': [{'_': 'S0034-89102010000400007'}],
+                    'v40': [{'_': 'pt'}],
+                },
+                'title': {'v690': [{'_': 'www.scielo.br'}]},
+                'issue': {'issue': {}},
+            }
+            article = Article(raw_data)
+
+            pipe = export_crossref.XMLCrossmarkPipe()
+            raw, result_xml = pipe.transform([article, xml])
+
+            domain = result_xml.find(
+                './/crossmark/crossmark_domains/crossmark_domain/domain')
+            self.assertIsNotNone(domain)
+            self.assertEqual('www.scielo.br', domain.text)
+        finally:
+            os.environ.pop('CROSSMARK_POLICY', None)
+
+    def test_crossmark_no_updates_when_no_related_articles(self):
+        """No <updates> element when related_articles is absent."""
+        os.environ['CROSSMARK_POLICY'] = 'https://www.scielo.br/crossmark-policy'
+        try:
+            xml = self._make_xml_with_publisher_item()
+            article = self._make_article()
+
+            pipe = export_crossref.XMLCrossmarkPipe()
+            raw, result_xml = pipe.transform([article, xml])
+
+            self.assertIsNone(result_xml.find('.//crossmark/updates'))
+        finally:
+            os.environ.pop('CROSSMARK_POLICY', None)
+
+    def test_crossmark_updates_with_related_articles(self):
+        """<updates> element is created from related_articles data."""
+        os.environ['CROSSMARK_POLICY'] = 'https://www.scielo.br/crossmark-policy'
+        try:
+            xml = self._make_xml_with_publisher_item()
+            article = self._make_article({
+                'related_articles': [
+                    {
+                        'type': 'erratum',
+                        'doi': '10.1590/erratum-example-001',
+                        'date': '2025-03',
+                    }
+                ]
+            })
+
+            pipe = export_crossref.XMLCrossmarkPipe()
+            raw, result_xml = pipe.transform([article, xml])
+
+            updates = result_xml.find('.//crossmark/updates')
+            self.assertIsNotNone(updates)
+            update = updates.find('update')
+            self.assertIsNotNone(update)
+            self.assertEqual('erratum', update.get('type'))
+            self.assertEqual(
+                '10.1590/erratum-example-001',
+                update.find('doi').text
+            )
+        finally:
+            os.environ.pop('CROSSMARK_POLICY', None)
+
+    def test_crossmark_update_date_elements(self):
+        """Date elements within <update> are correctly built."""
+        os.environ['CROSSMARK_POLICY'] = 'https://www.scielo.br/crossmark-policy'
+        try:
+            xml = self._make_xml_with_publisher_item()
+            article = self._make_article({
+                'related_articles': [
+                    {
+                        'type': 'retraction',
+                        'doi': '10.1590/retraction-001',
+                        'date': '2025-06',
+                    }
+                ]
+            })
+
+            pipe = export_crossref.XMLCrossmarkPipe()
+            raw, result_xml = pipe.transform([article, xml])
+
+            update = result_xml.find('.//crossmark/updates/update')
+            date_el = update.find('date')
+            self.assertIsNotNone(date_el)
+            self.assertEqual('online', date_el.get('media_type'))
+            children = list(date_el)
+            self.assertEqual('month', children[0].tag)
+            self.assertEqual('06', children[0].text)
+            self.assertEqual('year', children[1].tag)
+            self.assertEqual('2025', children[1].text)
+        finally:
+            os.environ.pop('CROSSMARK_POLICY', None)
+
+    def test_crossmark_update_unknown_type_is_skipped(self):
+        """Related articles with unrecognised type are silently ignored."""
+        os.environ['CROSSMARK_POLICY'] = 'https://www.scielo.br/crossmark-policy'
+        try:
+            xml = self._make_xml_with_publisher_item()
+            article = self._make_article({
+                'related_articles': [
+                    {
+                        'type': 'invalid_type',
+                        'doi': '10.1590/invalid-001',
+                        'date': '2025-01',
+                    }
+                ]
+            })
+
+            pipe = export_crossref.XMLCrossmarkPipe()
+            raw, result_xml = pipe.transform([article, xml])
+
+            self.assertIsNone(result_xml.find('.//crossmark/updates'))
+        finally:
+            os.environ.pop('CROSSMARK_POLICY', None)
+
+    def test_crossmark_multiple_updates(self):
+        """Multiple related articles produce multiple <update> elements."""
+        os.environ['CROSSMARK_POLICY'] = 'https://www.scielo.br/crossmark-policy'
+        try:
+            xml = self._make_xml_with_publisher_item()
+            article = self._make_article({
+                'related_articles': [
+                    {
+                        'type': 'erratum',
+                        'doi': '10.1590/erratum-001',
+                        'date': '2024-06',
+                    },
+                    {
+                        'type': 'retraction',
+                        'doi': '10.1590/retraction-001',
+                        'date': '2025-03',
+                    },
+                ]
+            })
+
+            pipe = export_crossref.XMLCrossmarkPipe()
+            raw, result_xml = pipe.transform([article, xml])
+
+            updates = result_xml.find('.//crossmark/updates')
+            self.assertIsNotNone(updates)
+            self.assertEqual(2, len(updates.findall('update')))
+        finally:
+            os.environ.pop('CROSSMARK_POLICY', None)
+
+    def test_crossmark_funding_pipe_appends_inside_crossmark(self):
+        """XMLFundingDataPipe appends funding data inside existing crossmark."""
+        os.environ['CROSSMARK_POLICY'] = 'https://www.scielo.br/crossmark-policy'
+        try:
+            xml = self._make_xml_with_publisher_item()
+            raw_data = {
+                'article': {
+                    'v880': [{'_': 'S0034-89102010000400007'}],
+                    'v40': [{'_': 'pt'}],
+                    'v58': [{'_': 'CNPQ'}],
+                    'v60': [{'_': '123456'}],
+                },
+                'title': {},
+                'issue': {'issue': {}},
+            }
+            article = Article(raw_data)
+
+            # First run XMLCrossmarkPipe to insert the crossmark element
+            crossmark_pipe = export_crossref.XMLCrossmarkPipe()
+            _, xml = crossmark_pipe.transform([article, xml])
+
+            # Then run XMLFundingDataPipe
+            funding_pipe = export_crossref.XMLFundingDataPipe()
+            _, xml = funding_pipe.transform([article, xml])
+
+            crossmark = xml.find('.//crossmark')
+            self.assertIsNotNone(crossmark)
+            fundref_program = crossmark.find(
+                '{http://www.crossref.org/fundref.xsd}program')
+            self.assertIsNotNone(fundref_program)
+        finally:
+            os.environ.pop('CROSSMARK_POLICY', None)
+
+    def test_build_date_element_orders_and_pads_parts(self):
+        """Full dates yield zero-padded month, day and year children."""
+        date_el = export_crossref.XMLCrossmarkPipe._build_date_element(
+            '2025/6/1')
+        self.assertEqual(
+            [('month', '06'), ('day', '01'), ('year', '2025')],
+            [(child.tag, child.text) for child in date_el]
+        )
+
+    def test_build_date_element_rejects_malformed_values(self):
+        """Values that are not plausible dates yield no element."""
+        for value in ('', 'abcd', '2025-13', '2025-02-32', '202561'):
+            self.assertIsNone(
+                export_crossref.XMLCrossmarkPipe._build_date_element(value))