Skip to content

Commit 2fcbd2c

Browse files
committed
refinement of ollama queries and posterior decision process
1 parent 31bd165 commit 2fcbd2c

File tree

19 files changed

+1855
-906
lines changed

19 files changed

+1855
-906
lines changed

build/lib/biotoolsllmannotate/__main__.py

Lines changed: 39 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -21,8 +21,20 @@ def _fallback_main() -> None:
2121
p_run.add_argument("--from-date")
2222
p_run.add_argument("--to-date")
2323
p_run.add_argument("--min-score", type=float)
24-
p_run.add_argument("--min-bio-score", type=float)
25-
p_run.add_argument("--min-doc-score", type=float)
24+
p_run.add_argument(
25+
"--min-bio-score-add",
26+
"--min-bio-score",
27+
dest="min_bio_score_add",
28+
type=float,
29+
)
30+
p_run.add_argument("--min-bio-score-review", type=float)
31+
p_run.add_argument(
32+
"--min-doc-score-add",
33+
"--min-doc-score",
34+
dest="min_doc_score_add",
35+
type=float,
36+
)
37+
p_run.add_argument("--min-doc-score-review", type=float)
2638
p_run.add_argument("--limit", type=int)
2739
p_run.add_argument("--dry-run", action="store_true")
2840
p_run.add_argument("--model", default="llama3.2")
@@ -34,22 +46,35 @@ def _fallback_main() -> None:
3446
return
3547
if args.command == "run":
3648
from_date = args.from_date or "7d"
37-
min_bio_score = args.min_bio_score
38-
min_doc_score = args.min_doc_score
49+
bio_add = args.min_bio_score_add
50+
bio_review = args.min_bio_score_review
51+
doc_add = args.min_doc_score_add
52+
doc_review = args.min_doc_score_review
53+
3954
if args.min_score is not None:
40-
if min_bio_score is None:
41-
min_bio_score = args.min_score
42-
if min_doc_score is None:
43-
min_doc_score = args.min_score
44-
if min_bio_score is None:
45-
min_bio_score = 0.6
46-
if min_doc_score is None:
47-
min_doc_score = 0.6
55+
if bio_add is None:
56+
bio_add = args.min_score
57+
if bio_review is None:
58+
bio_review = args.min_score
59+
if doc_add is None:
60+
doc_add = args.min_score
61+
if doc_review is None:
62+
doc_review = args.min_score
63+
64+
bio_add = 0.6 if bio_add is None else max(0.0, min(bio_add, 1.0))
65+
doc_add = 0.6 if doc_add is None else max(0.0, min(doc_add, 1.0))
66+
bio_review = 0.5 if bio_review is None else max(0.0, min(bio_review, 1.0))
67+
doc_review = 0.5 if doc_review is None else max(0.0, min(doc_review, 1.0))
68+
69+
if bio_review > bio_add:
70+
bio_review = bio_add
71+
if doc_review > doc_add:
72+
doc_review = doc_add
4873
execute_run(
4974
from_date=from_date,
5075
to_date=args.to_date,
51-
min_bio_score=min_bio_score,
52-
min_doc_score=min_doc_score,
76+
bio_thresholds=(bio_review, bio_add),
77+
doc_thresholds=(doc_review, doc_add),
5378
limit=args.limit,
5479
dry_run=args.dry_run,
5580
model=args.model,

build/lib/biotoolsllmannotate/assess/scorer.py

Lines changed: 44 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
from typing import Any, Dict, List, Optional, Tuple, Union
44

55
from .ollama_client import OllamaClient, OllamaConnectionError
6-
from biotoolsllmannotate.config import get_config_yaml
6+
from biotoolsllmannotate.config import DEFAULT_CONFIG_YAML, get_config_yaml
77
from biotoolsllmannotate.enrich import is_probable_publication_url
88

99

@@ -283,6 +283,43 @@ def _score_from_response(
283283
return clamp_score(averaged), breakdown
284284

285285

286+
def _documentation_score_v2(breakdown, fallback: float | None) -> float:
287+
weights = {"B1": 2.0, "B2": 1.0, "B3": 1.0, "B4": 1.0, "B5": 2.0}
288+
denominator = sum(weights.values())
289+
290+
if isinstance(breakdown, Mapping):
291+
numerator = 0.0
292+
have_any = False
293+
for key, weight in weights.items():
294+
raw = breakdown.get(key)
295+
if raw is not None:
296+
have_any = True
297+
value = _coerce_float(raw)
298+
if value is None:
299+
value = 0.0
300+
numerator += clamp_score(value) * weight
301+
if have_any:
302+
return clamp_score(numerator / denominator)
303+
304+
if isinstance(breakdown, Sequence) and not isinstance(breakdown, (str, bytes)):
305+
items = list(breakdown)
306+
numerator = 0.0
307+
have_any = False
308+
for idx, key in enumerate(weights):
309+
raw = items[idx] if idx < len(items) else None
310+
if raw is not None:
311+
have_any = True
312+
value = _coerce_float(raw)
313+
if value is None:
314+
value = 0.0
315+
numerator += clamp_score(value) * list(weights.values())[idx]
316+
if have_any:
317+
return clamp_score(numerator / denominator)
318+
319+
fallback_value = fallback if fallback is not None else 0.0
320+
return clamp_score(fallback_value)
321+
322+
286323
def _candidate_homepage(candidate: dict) -> str:
287324
homepage = candidate.get("homepage")
288325
if isinstance(homepage, str):
@@ -445,74 +482,17 @@ def score_candidate(self, candidate: Dict[str, Any]) -> Dict[str, Any]:
445482
}
446483
result["bio_subscores"] = bio_breakdown or {}
447484
result["documentation_subscores"] = doc_breakdown or {}
485+
doc_score_v2 = _documentation_score_v2(doc_breakdown, doc_score)
486+
if doc_score_v2 != doc_score:
487+
result["documentation_score_raw"] = doc_score
488+
result["doc_score_v2"] = doc_score_v2
489+
result["documentation_score"] = doc_score_v2
448490
return result
449491

450492
def _build_prompt(self, candidate: dict) -> str:
451493
template = self.config.get("scoring_prompt_template")
452494
if not template:
453-
template = """You are evaluating whether a software resource is worth getting registered in bio.tools, the registry for software resources in the life sciences.
454-
455-
Available material:
456-
457-
Title: {title}
458-
Description: {description}
459-
Homepage: {homepage}
460-
Homepage status: {homepage_status}
461-
Homepage error: {homepage_error}
462-
Documentation links: {documentation}
463-
Documentation keywords found on homepage: {documentation_keywords}
464-
Repository: {repository}
465-
Found keywords: {tags}
466-
Published: {published_at}
467-
Publication abstract: {publication_abstract}
468-
Publication full text: {publication_full_text}
469-
Known publication identifiers: {publication_ids}
470-
471-
Task:
472-
Score the resource using the rubric below. For every subcriterion assign exactly one of {{0, 0.5, 1}}. Base every decision only on the provided material. Do not invent facts or URLs. If the resource is not life-science software, set ALL bio subcriteria A1–A5 = 0 and explain why in the rationale.
473-
474-
Bio score rubric
475-
A1 Biological intent stated (explicit life-science task/domain).
476-
A2 Operations on biological data described
477-
A3 Software with biological data I/O: 0 = none; 0.5 = only generic; 1 = concrete datatypes/formats named.
478-
A4 Modality explicitly classifiable as one or more of: database portal, desktop application, web application, web API, web service, SPARQL endpoint, command-line tool (CLI), workbench, suite, plug-in, workflow, library, ontology. Include minimal usage context.
479-
A5 Evidence of bio use (examples on real bio data OR peer-reviewed/benchmark citation).
480-
481-
Documentation score rubric (subcriteria only; no overall score here)
482-
B1 Documentation completeness (e.g. manual, guide, readthedocs).
483-
B2 Installation pathways (e.g. installation/setup, config, container, package).
484-
B3 Reproducibility aids (e.g. doi, release).
485-
B4 Maintenance signal (e.g. commits, issue tracker, news).
486-
B5 Onboarding & support (e.g. quickstart/tutorial, contact, faq).
487-
488-
Selection/normalization rules:
489-
490-
Base every decision on the supplied material only.
491-
Normalize publication identifiers to prefixes: DOI:..., PMID:..., PMCID:... and remove duplicates (case-insensitive).
492-
For any subcriterion scored 0 due to missing evidence, mention "insufficient evidence: <item>" in the rationale.
493-
Record each bio subcriterion as numbers {{0,0.5,1}} in `bio_subscores` and each documentation subcriterion as numbers {{0,0.5,1}} in `documentation_subscores`.
494-
Provide `confidence_score` as a number between 0 and 1 summarizing your certainty in the assessment (higher means more confident).
495-
Do NOT compute aggregate scores; only fill the provided fields.
496-
Do not output any value outside [0.0, 1.0].
497-
Always emit every field in the output JSON exactly once.
498-
Emit ONLY the fields in the schema below. Use "" for unknown strings and [] if no publication identifiers are found. Do not output booleans/strings instead of numbers.
499-
500-
JSON schema describing the required output:
501-
{json_schema}
502-
503-
Before replying, validate your draft against this schema. If the JSON does not pass validation, fix it and revalidate until it does. Output only the validated JSON; never include commentary or surrounding text.
504-
505-
Output: respond ONLY with a single JSON object shaped as:
506-
{{
507-
"tool_name": "<derived display name>",
508-
"homepage": "<best homepage URL>",
509-
"publication_ids": ["DOI:...", "PMID:...", "PMCID:..."],
510-
"bio_subscores": {{"A1": <0|0.5|1>, "A2": <0|0.5|1>, "A3": <0|0.5|1>, "A4": <0|0.5|1>, "A5": <0|0.5|1>}},
511-
"documentation_subscores": {{"B1": <0|0.5|1>, "B2": <0|0.5|1>, "B3": <0|0.5|1>, "B4": <0|0.5|1>, "B5": <0|0.5|1>}},
512-
"confidence_score": <0–1 numeric confidence>,
513-
"concise_description": "<1–2 sentence rewritten summary>",
514-
"rationale": "<2–5 sentences citing specific evidence for both score groups; for each claim indicate the source as one of: homepage, documentation, repository, abstract, full_text, tags; explicitly name missing items as 'insufficient evidence: ...'>"
515-
}}"""
495+
template = DEFAULT_CONFIG_YAML["scoring_prompt_template"]
516496

517497
publication_ids = candidate.get("publication_ids") or []
518498
documentation_value = candidate.get("documentation")

build/lib/biotoolsllmannotate/cli/main.py

Lines changed: 83 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -75,19 +75,35 @@ def _run_impl(
7575
max=1.0,
7676
help="Legacy combined threshold applied to both bio and documentation scores when separate thresholds are not provided.",
7777
),
78-
min_bio_score: float | None = typer.Option(
78+
min_bio_score_add: float | None = typer.Option(
7979
None,
80+
"--min-bio-score-add",
8081
"--min-bio-score",
8182
min=0.0,
8283
max=1.0,
83-
help="Minimum bio score required for inclusion (overrides pipeline.min_bio_score).",
84+
help="Minimum bio score required for an automatic add decision (overrides pipeline.bio_score_thresholds.add).",
8485
),
85-
min_doc_score: float | None = typer.Option(
86+
min_bio_score_review: float | None = typer.Option(
8687
None,
88+
"--min-bio-score-review",
89+
min=0.0,
90+
max=1.0,
91+
help="Minimum bio score required to trigger a manual review decision (overrides pipeline.bio_score_thresholds.review).",
92+
),
93+
min_doc_score_add: float | None = typer.Option(
94+
None,
95+
"--min-doc-score-add",
8796
"--min-doc-score",
8897
min=0.0,
8998
max=1.0,
90-
help="Minimum documentation score required for inclusion (overrides pipeline.min_documentation_score).",
99+
help="Minimum documentation score required for an automatic add decision (overrides pipeline.documentation_score_thresholds.add).",
100+
),
101+
min_doc_score_review: float | None = typer.Option(
102+
None,
103+
"--min-doc-score-review",
104+
min=0.0,
105+
max=1.0,
106+
help="Minimum documentation score required to trigger a manual review decision (overrides pipeline.documentation_score_thresholds.review).",
91107
),
92108
limit: int | None = typer.Option(
93109
None, "--limit", help="Max candidates to process."
@@ -234,11 +250,10 @@ def _run_impl(
234250
registry_path = config_registry
235251

236252
if resume_from_pub2tools and input_path:
237-
typer.echo(
238-
"--resume-from-pub2tools cannot be used together with --input or pipeline.input_path",
239-
err=True,
253+
raise typer.BadParameter(
254+
"cannot be used together with --input or pipeline.input_path",
255+
param_hint="--resume-from-pub2tools",
240256
)
241-
raise typer.Exit(code=2)
242257

243258
# Determine score thresholds (CLI > legacy min-score > config > default)
244259
def _coerce_threshold(value, default):
@@ -247,24 +262,66 @@ def _coerce_threshold(value, default):
247262
except (TypeError, ValueError):
248263
return default
249264

250-
config_min_bio = pipeline_cfg.get("min_bio_score")
251-
config_min_doc = pipeline_cfg.get("min_documentation_score")
265+
bio_cfg = pipeline_cfg.get("bio_score_thresholds") or {}
266+
doc_cfg = pipeline_cfg.get("documentation_score_thresholds") or {}
267+
268+
config_bio_add = bio_cfg.get("add")
269+
config_bio_review = bio_cfg.get("review")
270+
config_doc_add = doc_cfg.get("add")
271+
config_doc_review = doc_cfg.get("review")
272+
273+
# Legacy key fallbacks
274+
if config_bio_add is None:
275+
config_bio_add = pipeline_cfg.get("min_bio_score")
276+
if config_doc_add is None:
277+
config_doc_add = pipeline_cfg.get("min_documentation_score")
278+
if config_bio_review is None:
279+
config_bio_review = pipeline_cfg.get("min_bio_score_review")
280+
if config_doc_review is None:
281+
config_doc_review = pipeline_cfg.get("min_documentation_score_review")
282+
283+
DEFAULT_BIO_ADD = 0.6
284+
DEFAULT_BIO_REVIEW = 0.5
285+
DEFAULT_DOC_ADD = 0.6
286+
DEFAULT_DOC_REVIEW = 0.5
252287

253288
if min_score is not None:
254-
if min_bio_score is None:
255-
min_bio_score = min_score
256-
if min_doc_score is None:
257-
min_doc_score = min_score
258-
259-
if min_bio_score is None:
260-
min_bio_score = _coerce_threshold(config_min_bio, 0.6)
261-
else:
262-
min_bio_score = _coerce_threshold(min_bio_score, 0.6)
263-
264-
if min_doc_score is None:
265-
min_doc_score = _coerce_threshold(config_min_doc, 0.6)
266-
else:
267-
min_doc_score = _coerce_threshold(min_doc_score, 0.6)
289+
if min_bio_score_add is None:
290+
min_bio_score_add = min_score
291+
if min_doc_score_add is None:
292+
min_doc_score_add = min_score
293+
if min_bio_score_review is None:
294+
min_bio_score_review = min_score
295+
if min_doc_score_review is None:
296+
min_doc_score_review = min_score
297+
298+
bio_add_threshold = _coerce_threshold(
299+
min_bio_score_add if min_bio_score_add is not None else config_bio_add,
300+
DEFAULT_BIO_ADD,
301+
)
302+
doc_add_threshold = _coerce_threshold(
303+
min_doc_score_add if min_doc_score_add is not None else config_doc_add,
304+
DEFAULT_DOC_ADD,
305+
)
306+
307+
bio_review_threshold = _coerce_threshold(
308+
min_bio_score_review if min_bio_score_review is not None else config_bio_review,
309+
DEFAULT_BIO_REVIEW,
310+
)
311+
doc_review_threshold = _coerce_threshold(
312+
min_doc_score_review if min_doc_score_review is not None else config_doc_review,
313+
DEFAULT_DOC_REVIEW,
314+
)
315+
316+
bio_review_threshold = max(0.0, min(bio_review_threshold, 1.0))
317+
bio_add_threshold = max(0.0, min(bio_add_threshold, 1.0))
318+
doc_review_threshold = max(0.0, min(doc_review_threshold, 1.0))
319+
doc_add_threshold = max(0.0, min(doc_add_threshold, 1.0))
320+
321+
if bio_review_threshold > bio_add_threshold:
322+
bio_review_threshold = bio_add_threshold
323+
if doc_review_threshold > doc_add_threshold:
324+
doc_review_threshold = doc_add_threshold
268325

269326
# Set logging level
270327
import logging
@@ -292,8 +349,8 @@ def _coerce_threshold(value, default):
292349
execute_run(
293350
from_date=from_date,
294351
to_date=to_date,
295-
min_bio_score=min_bio_score,
296-
min_doc_score=min_doc_score,
352+
bio_thresholds=(bio_review_threshold, bio_add_threshold),
353+
doc_thresholds=(doc_review_threshold, doc_add_threshold),
297354
limit=limit,
298355
dry_run=dry_run,
299356
model=model,
@@ -315,7 +372,6 @@ def _coerce_threshold(value, default):
315372
)
316373
except Exception as e:
317374
import traceback
318-
import typer
319375

320376
typer.echo("\nERROR: Unhandled exception in pipeline:", err=True)
321377
typer.echo(str(e), err=True)

0 commit comments

Comments (0)