diff --git a/pyleotups/core/NOAADataset.py b/pyleotups/core/NOAADataset.py index ff3f20ca..636a1f88 100644 --- a/pyleotups/core/NOAADataset.py +++ b/pyleotups/core/NOAADataset.py @@ -11,7 +11,7 @@ from ..utils.Parser.StandardParser import DataFetcher, StandardParser from ..utils.Parser.NonStandardParser import NonStandardParser from ..utils.api.constants import BASE_URL -from ..utils.api.query_builder import build_payload +from ..utils.api.query_builder import build_noaa_payload from ..utils.api.http import get @@ -185,11 +185,11 @@ def search_studies(self, **kwargs): species_and_or : {"and","or"}, default "or" Logical combiner for multiple species. Only sent when 2+ items. - cv_whats : str or list[str], optional - PaST “What” terms (hierarchies with ``>``). Lists joined with ``|``. + variable_name : str or list[str], optional + Refers to PaST "cvWhats” terms (hierarchies with ``>``). Lists joined with ``|``. - cv_whats_and_or : {"and","or"}, default "or" - Logical combiner for multiple cv_whats. Only sent when 2+ items. + variable_name_and_or : {"and","or"}, default "or" + Logical combiner for multiple cvWhats/variable_name. Only sent when 2+ items. cv_materials : str or list[str], optional PaST “Material” terms (hierarchies with ``>``). Lists joined with ``|``. @@ -256,7 +256,7 @@ def search_studies(self, **kwargs): ----- User Guide: - **Multi-value fields.** For ``investigators``, ``locations``, ``keywords``, ``species``, ``cv_whats``, + **Multi-value fields.** For ``investigators``, ``locations``, ``keywords``, ``species``, ``variable_name`` (cvWhats), ``cv_materials``, ``cv_seasonalities``: - Accept a string (already ``|``-separated) **or** a Python list of strings. - Lists are joined with ``|``. The corresponding ``*_and_or`` flag is included only when 2+ items. @@ -281,7 +281,7 @@ def search_studies(self, **kwargs): .. 
jupyter-execute:: import pyleotups as pt - ds = pt.Dataset() + ds = pt.NOAADataset() df_noaa = ds.search_studies(noaa_id=13156) df_xml = ds.search_studies(xml_id=1840) df_noaa.head() @@ -291,26 +291,26 @@ def search_studies(self, **kwargs): ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. jupyter-execute:: - # Single phrase + ### Single phrase df_singlephrase = ds.search_studies(search_text="younger dryas", limit=20) df_singlephrase.head() .. jupyter-execute:: - # Logical operator (AND) + ### Logical operator (AND) df_logop = ds.search_studies(search_text="loess AND stratigraphy", limit=20) df_logop.head() .. jupyter-execute:: - # Wildcards: '_' (single char), '%' (multi-char) + ### Wildcards: '_' (single char), '%' (multi-char) df_wc_1 = ds.search_studies(search_text="f_re", limit=20) df_wc_2 = ds.search_studies(search_text="pol%", limit=20) df_wc_1.head(), df_wc_2.head() .. jupyter-execute:: - # Escaping special characters (use backslashes) + ### Escaping special characters (use backslashes) df_specchar = ds.search_studies(search_text=r"noaa\-tree\-19260", limit=20) df_specchar.head() @@ -319,25 +319,25 @@ def search_studies(self, **kwargs): .. jupyter-execute:: - # Multiple investigators (OR by default) + ### Multiple investigators (OR by default) df_multinv_default = ds.search_studies(investigators=["Wahl, E.R.", "Vose, R.S."]) df_multinv_default.head() .. jupyter-execute:: - # Multiple investigators (AND by default) + ### Multiple investigators (AND by default) df_multinv_and = ds.search_studies(investigators=["Wahl, E.R.", "Vose, R.S."], investigatorsAndOr = "and") df_multinv_and.head() .. jupyter-execute:: - # Keywords: hierarchy with '>' and multiple via '|' + ### Keywords: hierarchy with '>' and multiple via '|' df_keywords = ds.search_studies(keywords="earth science>paleoclimate>paleocean>biomarkers") df_keywords.head() .. 
jupyter-execute:: - # Location hierarchy + ### Location hierarchy df_loc = ds.search_studies(locations="Continent>Africa>Eastern Africa>Zambia") df_loc.head() @@ -345,13 +345,13 @@ def search_studies(self, **kwargs): ^^^^^^^^^^^^^^^^^ .. jupyter-execute:: - # Species: four-letter codes (uppercase enforced) + ### Species: four-letter codes (uppercase enforced) df_species = ds.search_studies(species=["ABAL", "PIPO"]) df_species.head() .. jupyter-execute:: - # Data types: one or more IDs separated by '|' + ### Data types: one or more IDs separated by '|' df_muldatatypes = ds.search_studies(data_type_id="4|18") df_muldatatypes.head() @@ -371,13 +371,13 @@ def search_studies(self, **kwargs): ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. jupyter-execute:: - # Explicit BP with method + ### Explicit BP with method df_timew = ds.search_studies(earliest_year=12000, time_format="BP", time_method="overAny") df_timew.head() .. jupyter-execute:: - # No time_format/time_method → defaults to CE + ### No time_format/time_method → defaults to CE df_time_defualt = ds.search_studies(earliest_year=1500, latest_year=0) df_time_defualt.head() @@ -397,11 +397,11 @@ def search_studies(self, **kwargs): ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. 
jupyter-execute:: - # Limit up to first 10 results + ### Limit up to first 10 results df_limit = ds.search_studies(earliest_year=12000, time_format="BP", time_method="overAny", limit=10) df_limit.head() - # Skip the first 10 results (i.e., get results 11-20) + ### Skip the first 10 results (i.e., get results 11-20) df_skip = ds.search_studies(earliest_year=12000, time_format="BP", time_method="overAny", limit=10, skip=10) df_skip.head() """ @@ -419,14 +419,14 @@ def search_studies(self, **kwargs): kwargs.get("location") or kwargs.get("locations"), kwargs.get("publication"), kwargs.get("search_text"), kwargs.get("earliest_year"), kwargs.get("latest_year"), - kwargs.get("cv_whats"), kwargs.get("min_elevation"), + kwargs.get("variable_name"), kwargs.get("min_elevation"), kwargs.get("max_elevation"), kwargs.get("time_format"), kwargs.get("time_method"), kwargs.get("reconstruction"), kwargs.get("species"), kwargs.get("recent"), kwargs.get("skip") ]): raise ValueError( "At least one search parameter must be specified to initiate a query. " - "To view available parameters and usage examples, run: help(Dataset.search_studies)" + "To view available parameters and usage examples, run: help(NOAADataset.search_studies)" ) if kwargs.get("data_publisher") and kwargs["data_publisher"] != "NOAA": @@ -435,8 +435,11 @@ def search_studies(self, **kwargs): "Please retry with data_publisher='NOAA'." 
) + if kwargs.get("cv_whats") and not kwargs.get("variable_name"): + kwargs["variable_name"] = kwargs.pop("cv_whats") + # Build payload using our utils (handles ids short-circuit, list→'|', Y/N coercion, time default) - payload, notes = build_payload(**kwargs) + payload, notes = build_noaa_payload(**kwargs) for n in notes: log.info("search_studies: %s", n) self.last_search_notes = notes diff --git a/pyleotups/core/PangaeaDataset.py b/pyleotups/core/PangaeaDataset.py index f7b14a8a..123b423c 100644 --- a/pyleotups/core/PangaeaDataset.py +++ b/pyleotups/core/PangaeaDataset.py @@ -17,6 +17,8 @@ from ..utils.PangaeaStudy import PangaeaStudy +from ..utils.api.query_builder import build_pangaea_query + logging.getLogger("pangaeapy").setLevel(logging.ERROR) logger = logging.getLogger(__name__) @@ -38,7 +40,7 @@ class PangaeaDataset(BaseDataset): PangaeaDataset: lightweight provider that mirrors pyleotups.core.Dataset responses. Notes: - - search_studies(q=..., bbox=..., keywords=..., limit=..., offset=...) registers studies + - search_studies(**kwargs) registers studies in self.studies (StudyID -> {'panobj': PanDataSet|None, 'summary': normalized_dict}) - get_summary() returns a pandas.DataFrame exactly matching NOAA Dataset.to_dict() column names. - get_publications(), get_geo(), get_funding() return DataFrames with the same column names @@ -124,8 +126,7 @@ def _resolve_and_register_ids(self, study_ids): else: # Not in registry, not in collection → direct load logger.info( - f"Study {sid} not previously registered. " - f"Loading ad hoc." + f"Registering Study {sid} via direct lookup." 
) self.studies[sid] = PangaeaStudy( study_id=sid, @@ -139,12 +140,13 @@ def _resolve_and_register_ids(self, study_ids): # search_studies: q, bbox, keywords -> registers studies and returns same style as Dataset.search_studies (DataFrame) # ------------------------- def search_studies(self, - q: Optional[str] = None, - study_ids: Optional[Union[int, str, List]] = None, - bbox: Optional[Tuple[float, float, float, float]] = None, - limit: int = 10, - offset: int = 0, - display: bool = False) -> Optional[pd.DataFrame]: + # q: Optional[str] = None, + # study_ids: Optional[Union[int, str, List]] = None, + # bbox: Optional[Tuple[float, float, float, float]] = None, + # limit: int = 10, + # offset: int = 0, + # display: bool = False + **kwargs) -> Optional[pd.DataFrame]: """ Search PANGAEA and register results in self.studies. @@ -153,38 +155,230 @@ def search_studies(self, - Does NOT return the DataFrame by default (returns None). - If display=True, returns the full normalized summary DataFrame from self.get_summary(). - Args: - q: free-text query - bbox: geographical bounding box (minlon,minlat,maxlon,maxlat) - limit, offset: paging - display: if True, return get_summary() after populating registry + Search for PANGAEA datasets using unified PyleoTUPS query parameters. + + This method translates user-friendly query parameters into a PANGAEA-compatible + search query and registers the resulting datasets internally. + + Parameters + ---------- + study_ids : int, str, or list, optional + One or more PANGAEA dataset identifiers (numeric ID or DOI string). + If provided, performs direct lookup and ignores other filters. + + topic : str, optional + Filter datasets by PANGAEA topic classification. 
+ Must be one of the predefined topics: + - "all" (default) + - "Agriculture" + - "Atmosphere" + - "Biological Classification" + - "Biosphere" + - "Chemistry" + - "Cryosphere" + - "Ecology" + - "Fisheries" + - "Geophysics" + - "Human Dimensions" + - "Lakes & Rivers" + - "Land Surface" + - "Lithosphere" + - "Oceans" + - "Paleontology" + If set to "all" or omitted, no topic filtering is applied. + + search_text : str, optional + Free-text search query applied across dataset metadata. Maps to PANGAEA full-text search parameter 'q'. + Example: 'stable carbon and oxygen isotopes'. + + investigators : str or list[str], optional + Author names. Mapped internally to PANGAEA query syntax: + ``author:`` + + variable_name : str or list[str], optional + Name of parameters/variables (columns) present in dataset tables. + Internally mapped to PANGAEA query term: + ``parameter:`` + + min_lat, max_lat : float, optional + Latitude bounds (–90..90). + + min_lon, max_lon : float, optional + Longitude bounds (–180..180) + + limit : int, default 100, maximum 500 + Maximum number of results returned. + + skip : int, default 0 + Number of results to skip (pagination). Maps to PANGAEA 'offset' + + + Returns + ------- + pandas.DataFrame + DataFrame summarizing matched datasets. Also populates internal registry. + + Raises + ------ + ValueError + If no valid search parameters are provided. + + Notes + ----- + + PANGAEA search is text-based and less structured than NOAA filters. + Results may vary depending on metadata completeness. 
+ + **Unified query interface.** + PyleoTUPS uses consistent parameter names across datasets: + + - ``variable_name`` → mapped to ``parameter:`` in PANGAEA + - ``investigators`` → mapped to ``author:`` + + **Query construction.** + If ``q`` is not provided, a query string is constructed by combining: + - search_text + - investigators + - variable_name + - keywords + + **Geospatial filtering.** + Bounding box requires all four parameters: + ``min_lat, max_lat, min_lon, max_lon``. + Partial inputs are ignored. + + **Identifier priority.** + If ``study_ids`` is provided, all other filters are ignored. + + **Multi-value parameters.** + Multiple values for parameters like `variable_name` or `investigators` + are combined into a space-separated query, interpreted as logical AND + by the PANGAEA search engine. + + Examples + -------- + + Quick Start - Identifier Based search + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + + .. jupyter-execute:: + + + import pyleotups as pt + ds = pt.PangaeaDataset() + + + ### Can use either DOI strings or numeric IDs (extracted from DOIs) + df = ds.search_studies( + study_ids=["10.1594/PANGAEA.830587", "10.1594/PANGAEA.830588"] + ) + df.head() + + df = ds.search_studies( + study_ids=[830587, 830588] + ) + df.head() + + + Basic search + ^^^^^^^^^^^^ + + .. jupyter-execute:: + + df = ds.search_studies(search_text="Stable oxygen and carbon isotopes", limit = 5) + df.head() + + Variable-based search + ^^^^^^^^^^^^^^^^^^^^^ + + .. jupyter-execute:: + + df = ds.search_studies(variable_name=["Pulleniatina obliquiloculata δ13C", "Pulleniatina obliquiloculata δ18O"], limit = 5) + df.head() + + Investigator/Author-based search + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + + .. jupyter-execute:: + + df = ds.search_studies(investigators=["Khider, D"], limit = 5) + df.head() + + Combined filters + ^^^^^^^^^^^^^^^^ + + .. 
jupyter-execute:: + + df = ds.search_studies( + search_text="Stable oxygen and carbon isotopes", + variable_name=["Pulleniatina obliquiloculata δ13C", "Pulleniatina obliquiloculata δ18O"], + investigators="Khider, D", + limit = 5 + ) + df.head() + + Geographic filtering + ^^^^^^^^^^^^^^^^^^^^ - Returns: - pandas.DataFrame (same shape as Dataset.get_summary()). + .. jupyter-execute:: + + df = ds.search_studies( + min_lat=-10, max_lat=10, + min_lon=120, max_lon=160 + ) + df.head() """ - # Direct ID loading mode + study_ids = kwargs.get("study_ids") + + # ------------------------------------------- + # MODE 1: STUDY IDS (HIGHEST PRIORITY) + # ------------------------------------------- if study_ids is not None: - if q is not None: - raise ValueError("Provide either 'q' or 'study_ids', not both.") + # Prevent mixing modes + if any([ + kwargs.get("search_text"), + kwargs.get("investigators"), + kwargs.get("variable_name"), + kwargs.get("min_lat"), + kwargs.get("max_lat"), + kwargs.get("min_lon"), + kwargs.get("max_lon"), + ]): + logger.warning( + "Using identifier-only fetch (Pangaea DOI). Other parameters will be ignored.." + ) - self._resolve_and_register_ids(study_ids) + self._resolve_and_register_ids(kwargs.get("study_ids")) logger.info(f"Retrived {len(self.studies)} studies") return self.get_summary() - # if display else logger.info(f"Retrived {len(self.studies)} studies") - + # if display else logger.info(f"Retrived {len(self.studies)} studies") + + if not any([ + kwargs.get("search_text"), + kwargs.get("investigators"), + kwargs.get("variable_name"), + kwargs.get("min_lat"), + kwargs.get("max_lat"), + kwargs.get("min_lon"), + kwargs.get("max_lon"), + ]): + raise ValueError( + "At least one search parameter must be specified to initiate a query. 
" + "To view available parameters and usage examples, run: help(PangaeaDataset.search_studies)" + ) - # Query-based search - # build query string - q_parts = [] - if q: - q_parts.append(q) - query_str = " ".join(q_parts).strip() or "" + params = build_pangaea_query(**kwargs) + print(params) try: - pq = PanQuery(query=query_str, bbox=bbox, limit=limit, offset=offset) + pq = PanQuery( + query = params["q"], + bbox = params["bbox"], + limit = params["limit"], + offset = params["offset"]) except Exception as exc: logger.exception(f"PanQuery failed due to {exc}") raise @@ -470,34 +664,3 @@ def get_variables(self, study_ids=None) -> pd.DataFrame: return pd.DataFrame() return pd.concat(frames, ignore_index=True) - - -if __name__ == "__main__": - - pg = PangaeaDataset() - - # Search (returns DataFrame) - pg.search_studies(q="coral aragonite", bbox=(-180,-90,180,90), keywords=["Sr/Ca"], limit=10) - - hits_df = pg.search_studies(q="coral aragonite", bbox=(-180,-90,180,90), keywords=["Sr/Ca"], limit=10) - - print(hits_df.shape) - print(hits_df.columns) - # register is automatically populated - print("Registry size:", len(pg.studies)) - - # Pick an id from hits and fetch summary (DataFrame single-row) - ident = hits_df.iloc[0]["id"] - summary_df = pg.get_summary() - print(summary_df.T) # pretty inspect - - # get_geo (DataFrame or empty DataFrame) - geo_df = pg.get_geo() - print(geo_df) - - # get_publications -> DataFrame - pubs_df = pg.get_publications() - print(pubs_df) - # get_data -> DataFrame (parsed table) - data_df = pg.get_data() # uses PanDataSet and returns pandas.DataFrame - print(data_df) # provenance diff --git a/pyleotups/tests/test_PangaeaDataset.py b/pyleotups/tests/test_PangaeaDataset.py index 4817bff3..d037413b 100644 --- a/pyleotups/tests/test_PangaeaDataset.py +++ b/pyleotups/tests/test_PangaeaDataset.py @@ -59,7 +59,7 @@ def test_search_and_summary_with_collection(self, mock_panquery, mock_requests, ] ds = PangaeaDataset() - ds.search_studies(q="test") + 
ds.search_studies(search_text="test") assert 830589 in ds.studies @@ -87,7 +87,7 @@ def test_get_data_collection_warns(self, mock_panquery, mock_requests, caplog): ] ds = PangaeaDataset() - ds.search_studies(q="test") + ds.search_studies(search_text="test") with caplog.at_level("WARNING"): df = ds.get_data(830589) @@ -109,7 +109,7 @@ def test_get_data_collection_member_auto_register(self, mock_panquery, mock_requ ] ds = PangaeaDataset() - ds.search_studies(q="test") + ds.search_studies(search_text="test") # Child of 830589 @@ -133,7 +133,7 @@ def test_temporal_ce_only(self, mock_panquery, mock_requests): ] ds = PangaeaDataset() - ds.search_studies(q="test") + ds.search_studies(search_text="test") summary = ds.get_summary() row = summary.iloc[0] @@ -152,7 +152,7 @@ def test_temporal_bp_only(self, mock_panquery, mock_requests): ] ds = PangaeaDataset() - ds.search_studies(q="test") + ds.search_studies(search_text="test") summary = ds.get_summary() row = summary.iloc[0] @@ -171,7 +171,7 @@ def test_temporal_ce_and_bp(self, mock_panquery, mock_requests): ] ds = PangaeaDataset() - ds.search_studies(q="test") + ds.search_studies(search_text="test") summary = ds.get_summary() row = summary.iloc[0] @@ -190,7 +190,7 @@ def test_temporal_no_age(self, mock_panquery, mock_requests): ] ds = PangaeaDataset() - ds.search_studies(q="test") + ds.search_studies(search_text="test") summary = ds.get_summary() row = summary.iloc[0] @@ -212,7 +212,7 @@ def test_get_variables(self, mock_panquery, mock_requests): ] ds = PangaeaDataset() - ds.search_studies(q="test") + ds.search_studies(search_text="test") df_vars = ds.get_variables(830586) @@ -235,7 +235,7 @@ def test_get_funding(self, mock_panquery, mock_requests): ] ds = PangaeaDataset() - ds.search_studies(q="test") + ds.search_studies(search_text="test") df_funding = ds.get_funding() @@ -252,7 +252,7 @@ def test_get_publications(self, mock_panquery, mock_requests): ] ds = PangaeaDataset() - ds.search_studies(q="test") + 
ds.search_studies(search_text="test") bib, df_pub = ds.get_publications() diff --git a/pyleotups/utils/api/query_builder.py b/pyleotups/utils/api/query_builder.py index c9c50dff..0308bf2a 100644 --- a/pyleotups/utils/api/query_builder.py +++ b/pyleotups/utils/api/query_builder.py @@ -17,13 +17,13 @@ ("locations", "locations", "locations_and_or", "locationsAndOr", normalize_passthrough), ("keywords", "keywords", "keywords_and_or", "keywordsAndOr", normalize_passthrough), ("species", "species", "species_and_or", "speciesAndOr", normalize_species_code), - ("cv_whats", "cvWhats", "cv_whats_and_or", "cvWhatsAndOr", normalize_passthrough), + ("variable_name", "cvWhats", "variable_name_and_or", "cvWhatsAndOr", normalize_passthrough), ("cv_materials", "cvMaterials", "cv_materials_and_or", "cvMaterialsAndOr", normalize_passthrough), ("cv_seasonalities", "cvSeasonalities", "cv_seasonalities_and_or", "cvSeasonalitiesAndOr", normalize_passthrough), ] -def build_payload(**kwargs) -> Tuple[dict, List[str]]: +def build_noaa_payload(**kwargs) -> Tuple[dict, List[str]]: """ Normalize user kwargs (Pythonic names) into NOAA study search payload (camelCase). Returns (payload, notes). 'notes' contains human-readable info about defaults/normalizations. 
# -------------------------------------------------------
# PANGAEA QUERY BUILDER
# -------------------------------------------------------

# NOTE(review): ``log`` is expected to be the module-level logger defined
# elsewhere in this file -- confirm it exists in query_builder.py.

# Lower-cased topic name -> canonical spelling. Lookups are case-insensitive
# but the query always carries the canonical spelling.
_PANGAEA_TOPICS = {
    name.lower(): name
    for name in (
        "Agriculture",
        "Atmosphere",
        "Biological Classification",
        "Biosphere",
        "Chemistry",
        "Cryosphere",
        "Ecology",
        "Fisheries",
        "Geophysics",
        "Human Dimensions",
        "Lakes & Rivers",
        "Land Surface",
        "Lithosphere",
        "Oceans",
        "Paleontology",
    )
}

_PANGAEA_DEFAULT_LIMIT = 100
_PANGAEA_MAX_LIMIT = 500


def _build_logical_block(field_name, values, operator, formatter):
    """
    Combine one or more values of a single field into a query fragment.

    Parameters
    ----------
    field_name : str
        Name of the user-facing field. Currently unused; kept so call sites
        stay self-describing.
    values : object or list/tuple/set of objects
        A single value or a collection of values. Falsy entries are dropped.
    operator : str or None
        ``"or"`` (case-insensitive) joins the terms as ``(a OR b)``. Any other
        value, including ``None``, joins them with spaces (implicit AND).
    formatter : callable
        Maps one value to its query term, e.g. ``lambda v: f"author:{v}"``.

    Returns
    -------
    str or None
        The query fragment, or ``None`` when there is nothing to search for.
    """
    if not values:
        return None

    if not isinstance(values, (list, tuple, set)):
        values = [values]

    parts = [formatter(v) for v in values if v]
    if not parts:
        return None

    if len(parts) == 1:
        return parts[0]

    # A missing/None operator falls back to AND instead of raising
    # AttributeError on ``None.lower()``.
    if str(operator or "and").strip().lower() == "or":
        return f"({' OR '.join(parts)})"
    return " ".join(parts)  # implicit AND


def build_pangaea_query(**kwargs):
    """
    Translate NOAA-style kwargs into PANGAEA query parameters.

    Parameters
    ----------
    topic : str or list[str], optional
        Topic name(s), matched case-insensitively against the known topic
        list. ``"all"`` is silently ignored; unknown names are skipped with a
        warning. Multiple topics are combined per ``topic_and_or``
        (default ``"or"``).
    search_text : str, optional
        Free text appended to the query as-is.
    investigators : str or list[str], optional
        Emitted as ``author:<value>`` terms, combined per
        ``investigators_and_or`` (default ``"and"``).
    variable_name : str or list[str], optional
        Emitted as ``parameter:<value>`` terms, combined per
        ``variable_name_and_or`` (default ``"and"``).
    min_lat, max_lat, min_lon, max_lon : float, optional
        Bounding box. All four must be given; a partial set is ignored with a
        warning.
    limit : int, optional
        Maximum number of results. Defaults to 100 (also when ``None``) and is
        capped at 500.
    skip : int, optional
        Number of results to skip. Defaults to 0 (also when ``None``).

    Returns
    -------
    dict
        {
            "q": str,
            "bbox": tuple or None,   # (min_lon, min_lat, max_lon, max_lat)
            "limit": int,
            "offset": int
        }

    Raises
    ------
    ValueError
        If neither a query string nor a complete bounding box results from
        the given parameters.
    """
    # NOTE(review): values are inserted verbatim (no quoting). Confirm whether
    # multi-word values such as "Khider, D" need ``field:"..."`` quoting in
    # PANGAEA's query syntax.
    parts = []

    # -----------------------------------------------
    # topic -> topic:
    # -----------------------------------------------
    topic = kwargs.get("topic")
    if topic:
        if not isinstance(topic, (list, tuple, set)):
            topic = [topic]

        canonical_topics = []
        invalid = []
        for t in topic:
            key = str(t).strip().lower()
            if key in _PANGAEA_TOPICS:
                # Use the canonical spelling, not the raw user input.
                canonical_topics.append(_PANGAEA_TOPICS[key])
            elif key != "all":
                invalid.append(t)

        if invalid:
            log.warning(
                "Invalid topic(s) found. Skipping: %s. "
                "Please select from available topics: %s",
                invalid,
                sorted(_PANGAEA_TOPICS.values()),
            )

        block = _build_logical_block(
            "topic",
            canonical_topics,
            kwargs.get("topic_and_or", "or"),
            lambda v: f"topic:{v}",
        )
        if block:
            parts.append(block)

    # ---------------------------------------------------
    # GEO HANDLING (independent of the text query)
    # ---------------------------------------------------
    min_lat = kwargs.get("min_lat")
    max_lat = kwargs.get("max_lat")
    min_lon = kwargs.get("min_lon")
    max_lon = kwargs.get("max_lon")
    geo_params = (min_lat, max_lat, min_lon, max_lon)

    bbox = None
    if all(v is not None for v in geo_params):
        bbox = (min_lon, min_lat, max_lon, max_lat)
    elif any(v is not None for v in geo_params):
        log.warning(
            "Incomplete geographic bounds provided. "
            "PANGAEA requires min_lat, max_lat, min_lon, max_lon together. "
            "Ignoring geographic filter."
        )

    # -----------------------------------------------
    # search_text -> raw query
    # -----------------------------------------------
    if kwargs.get("search_text"):
        parts.append(str(kwargs["search_text"]))

    # -----------------------------------------------
    # investigators -> author:
    # -----------------------------------------------
    block = _build_logical_block(
        "investigators",
        kwargs.get("investigators"),
        kwargs.get("investigators_and_or", "and"),
        lambda v: f"author:{v}",
    )
    if block:
        parts.append(block)

    # -----------------------------------------------
    # variables -> parameter:
    # -----------------------------------------------
    block = _build_logical_block(
        "variable_name",
        kwargs.get("variable_name"),
        kwargs.get("variable_name_and_or", "and"),
        lambda v: f"parameter:{v}",
    )
    if block:
        parts.append(block)

    # -----------------------------------------------
    # final query string
    # -----------------------------------------------
    q = " ".join(parts).strip()

    if not q and bbox is None:
        raise ValueError(
            "At least one valid (non-null) search parameter or geographic bound "
            "must be provided to build a query. "
            "To view available parameters and usage examples, run: "
            "help(PangaeaDataset.search_studies)"
        )

    # ---------------------------------------------------
    # LIMIT / OFFSET
    # ---------------------------------------------------
    # An explicit ``limit=None`` / ``skip=None`` means "use the default"; the
    # comparison below would otherwise raise TypeError.
    limit = kwargs.get("limit")
    if limit is None:
        limit = _PANGAEA_DEFAULT_LIMIT

    if limit > _PANGAEA_MAX_LIMIT:
        log.warning("Limit exceeds maximum allowed (500). Using 500.")
        limit = _PANGAEA_MAX_LIMIT
    else:
        log.info("Limit set to %s", limit)

    offset = kwargs.get("skip")
    if offset is None:
        offset = 0

    return {
        "q": q,
        "bbox": bbox,
        "limit": limit,
        "offset": offset,
    }