Merged
1,016 changes: 667 additions & 349 deletions examples/tutorial-noaa.ipynb

Large diffs are not rendered by default.

1,033 changes: 916 additions & 117 deletions examples/tutorial-pangaea.ipynb

Large diffs are not rendered by default.

97 changes: 16 additions & 81 deletions pyleotups/core/NOAADataset.py
@@ -51,7 +51,7 @@ def __init__(self):
self.data_table_index = {} # dataTableID -> dict with study, site, paleo_data
self.file_url_to_datatable = {} # file_url -> dataTableID
# self.last_timing = {}
self.logger = logging.getLogger("pyleotups.Dataset")
self.logger = logging.getLogger("pyleotups.NOAADataset")


def _reindex(self):
@@ -92,9 +92,9 @@ def __add__(self, other):
except Exception:
check_same_study_content = False
if not check_same_study_content:
warnings.warn(
log.warning(
f"NOAADataset union: duplicate StudyID {sid} with differing content. "
"Keeping left-hand version. i.e. if C = A + B is perfomed, contents of A will be kept.", UserWarning
"Keeping left-hand version. i.e. if C = A + B is perfomed, contents of A will be kept."
)
# else identical content -> do nothing
else:
@@ -116,9 +116,9 @@ def __iadd__(self, other):
except Exception:
check_same_study_content = False
if not check_same_study_content:
warnings.warn(
log.warning(
f"Dataset in-place union: duplicate StudyID {sid} with differing content. "
"Keeping existing version. i.e. IF A = A + B is perfomed, contents of A will be kept", UserWarning
"Keeping existing version. i.e. IF A = A + B is perfomed, contents of A will be kept"
)
else:
self.studies[sid] = study
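Both operators now route the duplicate-StudyID notice through the module logger instead of `warnings.warn`, and both resolve conflicts the same way: the left-hand (or existing) study wins. A minimal sketch of that keep-left policy, detached from the class (the `{StudyID: study}` dict shape follows the diff; the direct `!=` comparison simplifies the `to_dict`-based check in the real code):

```python
import logging

log = logging.getLogger("pyleotups.NOAADataset")

def union_keep_left(left, right):
    """Keep-left union of two {StudyID: study} dicts, mirroring __add__/__iadd__."""
    merged = dict(left)  # left-hand entries win on conflict
    for sid, study in right.items():
        if sid not in merged:
            merged[sid] = study
        elif merged[sid] != study:  # duplicate ID, differing content
            log.warning(
                f"NOAADataset union: duplicate StudyID {sid} with differing content. "
                "Keeping left-hand version."
            )
        # identical content -> nothing to do
    return merged
```

Logging rather than emitting a `UserWarning` lets callers tune verbosity per logger instead of filtering the warnings machinery.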
@@ -453,14 +453,15 @@ def search_studies(self, **kwargs):
if status == 204:
inv = payload.get("investigators")
if inv:
warnings.warn(
log.warning(
"No studies found for investigator(s): "
f"{inv}. NOAA expects 'LastName, Initials'. Try variations like:\n"
" - 'LastName, Initials'\n - 'LastName'\n - 'Initials'"
)
# Nothing to parse; return display summary (empty) or None
return self.get_summary()
# if kwargs.get("display") else None
log.info(f"Retrieved {len(self.studies)} studies.")
return self.get_summary()
# if ("display" in kwargs and kwargs.get("display")) else log.info(f"Retrieved {len(self.studies)} studies.")
# Non-204: ensure success and parse JSON

try:
@@ -470,9 +471,10 @@

# Parse into internal structures (you already have this)
self._parse_response(response_json, kwargs.get("limit"))
log.info(f"Retrieved {len(self.studies)} studies.")

return self.get_summary()
# if kwargs.get("display") else log.info(f"Parsed {len(self.studies)} studies.")
# if ("display" in kwargs and kwargs.get("display")) else log.info(f"Retrieved {len(self.studies)} studies.")


def _parse_response(self, data, limit):
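Because the empty-result notice and the retrieval count now go through `logging` rather than `warnings`, they stay invisible until a handler is configured. A sketch of the minimal setup a caller might use (the `pyleotups` logger namespace is inferred from the constructor change above):

```python
import logging

# Give pyleotups records a handler via the root logger, then lower the
# package logger to INFO so "Retrieved N studies." shows up alongside
# warnings such as the 204 investigator hint.
logging.basicConfig(format="%(levelname)s %(name)s: %(message)s")
logging.getLogger("pyleotups").setLevel(logging.INFO)
```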
@@ -502,7 +504,7 @@ def _parse_response(self, data, limit):
self.file_url_to_datatable[file_url] = paleo.datatable_id

if isinstance(limit, int) and len(data.get('study', [])) >= limit:
warnings.warn(
log.warning(
f"Retrieved {limit} studies, which is the specified limit. "
"Consider increasing the limit parameter to fetch more studies."
)
@@ -600,7 +602,7 @@ def get_publications(self, save=False, path=None, verbose=False):
if not path:
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M")
path = f"bibtex_{timestamp}.bib"
warnings.warn(f"No path specified. Saving BibTeX to: {path}")
log.warning(f"No path specified. Saving BibTeX to: {path}")

try:
writer = Writer()
@@ -825,64 +827,6 @@ def get_variables(self, dataTableIDs):
return pd.DataFrame(columns=["StudyID", "SiteID", "FileURL", "VariableName"]) # fallback for no data

return df.set_index("DataTableID")

@DeprecationWarning
def get_data_deprecated(self, dataTableIDs=None, file_urls=None):
"""
Fetch external data for given dataTableIDs or file URLs and attach study/site metadata.

Parameters
----------
dataTableIDs : list or str, optional
One or more NOAA data table IDs.
file_urls : list or str, optional
One or more file URLs.

Returns
-------
list of pandas.DataFrame
A list of DataFrames, each corresponding to fetched data.
"""

if dataTableIDs:
dataTableIDs = assert_list(dataTableIDs)
dfs = []
for dt_id in dataTableIDs:
mapping = self.data_table_index.get(dt_id)
if not mapping:
print(f"Data Table ID {dt_id} not found or no associated file URL.")
continue
file_url = mapping['paleo_data'].file_url
if not file_url:
print(f"No file URL for Data Table ID {dt_id}.")
continue
fetched_data = DataFetcher.fetch_data(file_url)
if isinstance(fetched_data, list):
for df in fetched_data:
df.attrs['NOAAStudyId'] = mapping['study_id']
df.attrs['SiteID'] = mapping['site_id']
study_obj = self.studies.get(mapping['study_id'], {})
df.attrs['StudyName'] = study_obj.metadata.get("studyName") if hasattr(study_obj, 'metadata') else None
publications = study_obj.publications if hasattr(study_obj, 'publications') else None
print(len(publications))
for pub in publications:
if hasattr(pub, "doi"):
doi = pub.doi if pub.doi else None
df.attrs['PublicationDOI'].append(doi)
dfs.append(df)
else:
fetched_data.attrs['NOAAStudyId'] = mapping['study_id']
fetched_data.attrs['SiteID'] = mapping['site_id']
study_obj = self.studies.get(mapping['study_id'], {})
fetched_data.attrs['StudyName'] = study_obj.metadata.get("studyName") if hasattr(study_obj, 'metadata') else None
dfs.append(fetched_data)
return dfs
if file_urls:
file_urls = assert_list(file_urls)
dfs = [DataFetcher.fetch_data(url) for url in file_urls]
return dfs
print("No dataTableID or file URL provided.")
return pd.DataFrame()


def _process_file(self, file_url, mapping=None):
@@ -1050,10 +994,6 @@ def get_data(self, dataTableIDs=None, file_urls=None):
dataTableIDs = assert_list(dataTableIDs)
for dt_id in dataTableIDs:

# print(self.data_table_index, type(self.data_table_index.values()))
# for id, value in self.data_table_index.items():
# print(type(id))
# print(value, type(value))
mapping = self.data_table_index.get(dt_id)
if not mapping:
raise ValueError(f"No parent study mapping found for Data Table ID '{dt_id}'. "
@@ -1070,18 +1010,13 @@ def get_data(self, dataTableIDs=None, file_urls=None):
for url in file_urls:
mapping = self.file_url_to_datatable.get(url)
if not mapping:
warnings.warn(
f"Attached '{url}' is not linked to any parent study; can not add metadata.",
UserWarning
)
log.warning(f"Attached '{url}' is not linked to any parent study; can not add metadata.")
dfs.extend(self._process_file(url))
else:
mapping_details = self.data_table_index.get(mapping)
if not mapping_details:
warnings.warn(
f"Mapping details for file URL '{url}' (Data Table ID '{mapping}') not found; can not add metadata.",
UserWarning
)
log.warning(
f"Mapping details for file URL '{url}' (Data Table ID '{mapping}') not found; can not add metadata.")
dfs.extend(self._process_file(url))
else:
dfs.extend(self._process_file(url, mapping_details))
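The reworked fallbacks mean `get_data` degrades gracefully: a URL that was never registered by a search is still fetched and parsed, and only the study/site metadata attachment is skipped. A hedged usage sketch — the import path, search keyword, table ID, and URL are placeholders, not verified against the NOAA API:

```python
from pyleotups.core.NOAADataset import NOAADataset  # import path assumed from the diff

ds = NOAADataset()
ds.search_studies(investigators="Khider")  # keyword assumed; populates data_table_index

# Mapped route: parsed frames carry study/site metadata in df.attrs.
frames = ds.get_data(dataTableIDs=["12345"])  # hypothetical table ID

# Unmapped route: still fetched and parsed, but logs
# "Attached '...' is not linked to any parent study; cannot add metadata."
frames = ds.get_data(file_urls=["https://example.com/unlisted.txt"])
```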
21 changes: 12 additions & 9 deletions pyleotups/core/PangaeaDataset.py
Expand Up @@ -17,6 +17,8 @@

from ..utils.PangaeaStudy import PangaeaStudy

logging.getLogger("pangaeapy").setLevel(logging.ERROR)

logger = logging.getLogger(__name__)

# try to import pangaeapy; raise helpful error if missing
@@ -158,7 +160,7 @@ def search_studies(self,
display: if True, return get_summary() after populating registry

Returns:
None by default, or pandas.DataFrame (same shape as Dataset.get_summary()) if display=True.
pandas.DataFrame (same shape as Dataset.get_summary()).
"""
# Direct ID loading mode
if study_ids is not None:
@@ -168,10 +170,11 @@

self._resolve_and_register_ids(study_ids)

if display:
return self.get_summary()
logger.info(f"Retrived {len(self.studies)} studies")

return
return self.get_summary()
# if display else logger.info(f"Retrieved {len(self.studies)} studies")


# Query-based search
# build query string
@@ -183,7 +186,7 @@ def search_studies(self,
try:
pq = PanQuery(query=query_str, bbox=bbox, limit=limit, offset=offset)
except Exception as exc:
logger.exception("PanQuery failed")
logger.exception(f"PanQuery failed due to {exc}")
raise

# register results in self.studies but do not accumulate into a dataframe here
@@ -199,10 +202,10 @@
auth_token=self.auth_token,
)


# Only return if user explicitly asked for display
if display:
return self.get_summary()
logger.info(f"Retrived {len(self.studies)} studies")

return self.get_summary()
# if display else logger.info(f"Retrieved {len(self.studies)} studies")


# -------------------------
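With both branches returning `get_summary()` unconditionally, `display` no longer changes the return type; callers always get the summary DataFrame. A short sketch under that assumption — the import path and the `query=`/`study_ids=` keyword arguments are assumptions, not the verified public signature:

```python
from pyleotups.core.PangaeaDataset import PangaeaDataset  # import path assumed

pds = PangaeaDataset()

# Both modes now return the same summary DataFrame:
summary = pds.search_studies(study_ids=[896621])  # hypothetical PANGAEA ID
summary = pds.search_studies(query="sea surface temperature", limit=5)
print(summary.head())
```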
36 changes: 29 additions & 7 deletions pyleotups/tests/test_NOAADataset.py
@@ -358,18 +358,28 @@ def test_get_data_t03_from_file_url_with_mapping(self, mock_parser, mock_get):
# --- Test t04: file_url not in mapping, should still parse ---
@patch("pyleotups.core.NOAADataset.requests.get")
@patch("pyleotups.core.NOAADataset.StandardParser")
def test_get_data_t04_unmapped_file_url_warns_and_parses(self, mock_parser, mock_get):
def test_get_data_t04_unmapped_file_url_warns_and_parses(self, mock_parser, mock_get, caplog):
unmapped_url = "https://example.com/fake.txt"

mock_get.return_value.status_code = 200
mock_get.return_value.raise_for_status = lambda: None
mock_get.return_value.text = "# mock\n# mock\n# mock\n# mock\n# mock"

dummy_df = pd.DataFrame({"depth": [10, 20]})
mock_parser.return_value.parse.return_value = dummy_df

with pytest.warns(UserWarning, match="not linked to any parent study"):
# Capture logs at WARNING level
with caplog.at_level("WARNING"):
result = self.ds.get_data(file_urls=[unmapped_url])
assert isinstance(result[0], pd.DataFrame)

# Assert log message was emitted
assert any(
"not linked to any parent study" in record.message
for record in caplog.records
)

# Existing assertion
assert isinstance(result[0], pd.DataFrame)

# --- Test t05: file with unsupported extension ---
def test_get_data_t05_unsupported_file_type_raises(self):
@@ -486,7 +496,7 @@ def test_add_t02_same_id_identical_keeps_left_no_warning(self):
assert len(C.data_table_index) == len(A.data_table_index)
assert len(C.file_url_to_datatable) == len(A.file_url_to_datatable)

def test_add_t03_same_id_different_warns_and_keeps_left(self):
def test_add_t03_same_id_different_warns_and_keeps_left(self, caplog):
"""C = A + B where same NOAAStudyId but different content → warning; C looks like A."""
A = _build_NOAAdataset_for_noaa_id(18315)

@@ -497,7 +507,8 @@ def _mutate(study_dict):
B = _build_NOAAdataset_for_noaa_id(18315, mutate=_mutate)

# Expect a warning log mentioning duplicate/different study; keep the match loose and case-insensitive
with pytest.warns(UserWarning, match=r"(?i)duplicate.*study.*18315"):
# with pytest.warns(UserWarning, match=r"(?i)duplicate.*study.*18315"):
with caplog.at_level("WARNING"):
C = A + B

assert _ids(C) == {18315}
@@ -508,6 +519,11 @@ def _mutate(study_dict):
assert len(C.data_table_index) == len(A.data_table_index)
assert len(C.file_url_to_datatable) == len(A.file_url_to_datatable)

assert any(
"duplicate" in record.message.lower() and "study" in record.message.lower() and "18315" in record.message
for record in caplog.records
)


# ---------------------------------------------------------------------------
# Tests: A = A + B (rebinding variable name to result of binary add)
@@ -546,7 +562,7 @@ def test_add_rebind_t02_same_id_identical_no_warning(self):
assert len(A.data_table_index) == len(canonical_A.data_table_index)
assert len(A.file_url_to_datatable) == len(canonical_A.file_url_to_datatable)

def test_add_rebind_t03_same_id_different_warns_and_keeps_left(self):
def test_add_rebind_t03_same_id_different_warns_and_keeps_left(self, caplog):
"""A = A + B where same NOAAStudyId but different content → warning; A still looks like original A."""
A = _build_NOAAdataset_for_noaa_id(18315)

Expand All @@ -555,7 +571,8 @@ def _mutate(study_dict):

B = _build_NOAAdataset_for_noaa_id(18315, mutate=_mutate)

with pytest.warns(UserWarning, match=r"(?i)duplicate.*study.*18315"):
# with pytest.warns(UserWarning, match=r"(?i)duplicate.*study.*18315"):
with caplog.at_level("WARNING"):
A = A + B

assert _ids(A) == {18315}
@@ -566,3 +583,8 @@ def _mutate(study_dict):
assert _ids(A) == _ids(canonical_A)
assert len(A.data_table_index) == len(canonical_A.data_table_index)
assert len(A.file_url_to_datatable) == len(canonical_A.file_url_to_datatable)

assert any(
"duplicate" in record.message.lower() and "study" in record.message.lower() and "18315" in record.message
for record in caplog.records
)
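All of the migrated tests share one idiom: run the code under `caplog.at_level(...)`, then scan `caplog.records`. A self-contained sketch of the pattern, detached from pyleotups (the logger name and message echo the diff; the helper function is a stand-in):

```python
import logging

log = logging.getLogger("pyleotups.NOAADataset")

def union_with_conflict(sid=18315):
    # Stand-in for C = A + B hitting a duplicate StudyID.
    log.warning(f"NOAADataset union: duplicate StudyID {sid} with differing content.")

def test_duplicate_union_logs_warning(caplog):
    with caplog.at_level(logging.WARNING):
        union_with_conflict()
    assert any(
        "duplicate" in rec.message.lower() and "18315" in rec.message
        for rec in caplog.records
    )
```

Unlike `pytest.warns`, a missing record does not fail on exiting the `with` block, hence the explicit `any(...)` assertion afterwards.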
23 changes: 23 additions & 0 deletions pyleotups/utils/NOAAStudy.py
@@ -68,6 +68,7 @@ def __init__(self, study_data):
"Malformed site entry encountered. Original error: "
f"{str(e)}"
)
self.coverage = self._compute_coverage()

def _load_metadata(self, study_data):
"""
@@ -130,6 +131,27 @@ def _load_funding(self, study_data):
for f in funding_info if isinstance(f, dict)
]
return []

def _compute_coverage(self):
south_vals = []
north_vals = []
west_vals = []
east_vals = []

for site in self.sites:
if not np.isnan(site.south_lat):
south_vals.append(site.south_lat)
if not np.isnan(site.north_lat):
north_vals.append(site.north_lat)
if not np.isnan(site.west_lon):
west_vals.append(site.west_lon)
if not np.isnan(site.east_lon):
east_vals.append(site.east_lon)

if not south_vals or not north_vals or not west_vals or not east_vals:
return (np.nan, np.nan, np.nan, np.nan)

return (min(south_vals), max(north_vals), min(west_vals), max(east_vals))


def to_dict(self):
@@ -151,6 +173,7 @@ def to_dict(self):
"MostRecentYearBP": self.metadata.get("mostRecentYearBP"),
"EarliestYearCE": self.metadata.get("earliestYearCE"),
"MostRecentYearCE": self.metadata.get("mostRecentYearCE"),
"Coverage [S, N, W, E]": self.coverage,
"StudyNotes": self.metadata.get("studyNotes"),
"ScienceKeywords": self.metadata.get("scienceKeywords"),
"Investigators": self.investigators,