|
from typing import Any, Dict, List, Mapping, Optional, Sequence, Tuple, Union

from .ollama_client import OllamaClient, OllamaConnectionError
from biotoolsllmannotate.config import DEFAULT_CONFIG_YAML, get_config_yaml
from biotoolsllmannotate.enrich import is_probable_publication_url
8 | 8 |
|
9 | 9 |
|
@@ -283,6 +283,43 @@ def _score_from_response( |
283 | 283 | return clamp_score(averaged), breakdown |
284 | 284 |
|
285 | 285 |
|
def _documentation_score_v2(breakdown, fallback: Optional[float]) -> float:
    """Compute a weighted documentation score from the B1-B5 subscores.

    B1 (documentation completeness) and B5 (onboarding & support) carry
    double weight.  Subscores that are present but unparsable count as 0 in
    the numerator, while the denominator always spans all five criteria, so
    missing evidence drags the score down rather than being ignored.

    Args:
        breakdown: Either a mapping keyed by ``"B1"``..``"B5"`` or a
            sequence giving the subscores in that order.  Any other value
            (or one with no usable entries) triggers the fallback.
        fallback: Score to fall back on when no subscore is usable; treated
            as 0.0 when ``None``.

    Returns:
        A score clamped to ``[0.0, 1.0]``.
    """
    weights = {"B1": 2.0, "B2": 1.0, "B3": 1.0, "B4": 1.0, "B5": 2.0}
    denominator = sum(weights.values())

    def _weighted_average(pairs) -> Optional[float]:
        # pairs: iterable of (raw subscore, weight). Returns None when no
        # entry was present at all, so the caller can fall back.
        numerator = 0.0
        have_any = False
        for raw, weight in pairs:
            if raw is None:
                continue
            have_any = True
            value = _coerce_float(raw)
            # Unparsable values are penalized as 0 rather than skipped.
            numerator += clamp_score(value if value is not None else 0.0) * weight
        if not have_any:
            return None
        return clamp_score(numerator / denominator)

    if isinstance(breakdown, Mapping):
        score = _weighted_average(
            (breakdown.get(key), weight) for key, weight in weights.items()
        )
        if score is not None:
            return score
    elif isinstance(breakdown, Sequence) and not isinstance(breakdown, (str, bytes)):
        items = list(breakdown)
        # Positional form: entries map onto B1..B5 in declaration order;
        # missing trailing entries are treated as absent.
        score = _weighted_average(
            (items[idx] if idx < len(items) else None, weight)
            for idx, weight in enumerate(weights.values())
        )
        if score is not None:
            return score

    return clamp_score(fallback if fallback is not None else 0.0)
| 322 | + |
286 | 323 | def _candidate_homepage(candidate: dict) -> str: |
287 | 324 | homepage = candidate.get("homepage") |
288 | 325 | if isinstance(homepage, str): |
@@ -445,74 +482,17 @@ def score_candidate(self, candidate: Dict[str, Any]) -> Dict[str, Any]: |
445 | 482 | } |
446 | 483 | result["bio_subscores"] = bio_breakdown or {} |
447 | 484 | result["documentation_subscores"] = doc_breakdown or {} |
| 485 | + doc_score_v2 = _documentation_score_v2(doc_breakdown, doc_score) |
| 486 | + if doc_score_v2 != doc_score: |
| 487 | + result["documentation_score_raw"] = doc_score |
| 488 | + result["doc_score_v2"] = doc_score_v2 |
| 489 | + result["documentation_score"] = doc_score_v2 |
448 | 490 | return result |
449 | 491 |
|
450 | 492 | def _build_prompt(self, candidate: dict) -> str: |
451 | 493 | template = self.config.get("scoring_prompt_template") |
452 | 494 | if not template: |
453 | | - template = """You are evaluating whether a software resource is worth getting registered in bio.tools, the registry for software resources in the life sciences. |
454 | | -
|
455 | | -Available material: |
456 | | -
|
457 | | -Title: {title} |
458 | | -Description: {description} |
459 | | -Homepage: {homepage} |
460 | | -Homepage status: {homepage_status} |
461 | | -Homepage error: {homepage_error} |
462 | | -Documentation links: {documentation} |
463 | | -Documentation keywords found on homepage: {documentation_keywords} |
464 | | -Repository: {repository} |
465 | | -Found keywords: {tags} |
466 | | -Published: {published_at} |
467 | | -Publication abstract: {publication_abstract} |
468 | | -Publication full text: {publication_full_text} |
469 | | -Known publication identifiers: {publication_ids} |
470 | | -
|
471 | | -Task: |
472 | | -Score the resource using the rubric below. For every subcriterion assign exactly one of {{0, 0.5, 1}}. Base every decision only on the provided material. Do not invent facts or URLs. If the resource is not life-science software, set ALL bio subcriteria A1–A5 = 0 and explain why in the rationale. |
473 | | -
|
474 | | -Bio score rubric |
475 | | -A1 Biological intent stated (explicit life-science task/domain). |
476 | | -A2 Operations on biological data described |
477 | | -A3 Software with biological data I/O: 0 = none; 0.5 = only generic; 1 = concrete datatypes/formats named. |
478 | | -A4 Modality explicitly classifiable as one or more of: database portal, desktop application, web application, web API, web service, SPARQL endpoint, command-line tool (CLI), workbench, suite, plug-in, workflow, library, ontology. Include minimal usage context. |
479 | | -A5 Evidence of bio use (examples on real bio data OR peer-reviewed/benchmark citation). |
480 | | -
|
481 | | -Documentation score rubric (subcriteria only; no overall score here) |
482 | | -B1 Documentation completeness (e.g. manual, guide, readthedocs). |
483 | | -B2 Installation pathways (e.g. installation/setup, config, container, package). |
484 | | -B3 Reproducibility aids (e.g. doi, release). |
485 | | -B4 Maintenance signal (e.g. commits, issue tracker, news). |
486 | | -B5 Onboarding & support (e.g. quickstart/tutorial, contact, faq). |
487 | | -
|
488 | | -Selection/normalization rules: |
489 | | -
|
490 | | -Base every decision on the supplied material only. |
491 | | -Normalize publication identifiers to prefixes: DOI:..., PMID:..., PMCID:... and remove duplicates (case-insensitive). |
492 | | -For any subcriterion scored 0 due to missing evidence, mention "insufficient evidence: <item>" in the rationale. |
493 | | -Record each bio subcriterion as numbers {{0,0.5,1}} in `bio_subscores` and each documentation subcriterion as numbers {{0,0.5,1}} in `documentation_subscores`. |
494 | | -Provide `confidence_score` as a number between 0 and 1 summarizing your certainty in the assessment (higher means more confident). |
495 | | -Do NOT compute aggregate scores; only fill the provided fields. |
496 | | -Do not output any value outside [0.0, 1.0]. |
497 | | -Always emit every field in the output JSON exactly once. |
498 | | -Emit ONLY the fields in the schema below. Use "" for unknown strings and [] if no publication identifiers are found. Do not output booleans/strings instead of numbers. |
499 | | -
|
500 | | -JSON schema describing the required output: |
501 | | -{json_schema} |
502 | | -
|
503 | | -Before replying, validate your draft against this schema. If the JSON does not pass validation, fix it and revalidate until it does. Output only the validated JSON; never include commentary or surrounding text. |
504 | | -
|
505 | | -Output: respond ONLY with a single JSON object shaped as: |
506 | | -{{ |
507 | | -"tool_name": "<derived display name>", |
508 | | -"homepage": "<best homepage URL>", |
509 | | -"publication_ids": ["DOI:...", "PMID:...", "PMCID:..."], |
510 | | -"bio_subscores": {{"A1": <0|0.5|1>, "A2": <0|0.5|1>, "A3": <0|0.5|1>, "A4": <0|0.5|1>, "A5": <0|0.5|1>}}, |
511 | | -"documentation_subscores": {{"B1": <0|0.5|1>, "B2": <0|0.5|1>, "B3": <0|0.5|1>, "B4": <0|0.5|1>, "B5": <0|0.5|1>}}, |
512 | | -"confidence_score": <0–1 numeric confidence>, |
513 | | -"concise_description": "<1–2 sentence rewritten summary>", |
514 | | -"rationale": "<2–5 sentences citing specific evidence for both score groups; for each claim indicate the source as one of: homepage, documentation, repository, abstract, full_text, tags; explicitly name missing items as 'insufficient evidence: ...'>" |
515 | | -}}""" |
| 495 | + template = DEFAULT_CONFIG_YAML["scoring_prompt_template"] |
516 | 496 |
|
517 | 497 | publication_ids = candidate.get("publication_ids") or [] |
518 | 498 | documentation_value = candidate.get("documentation") |
|
0 commit comments