Skip to content

Commit 2fcbd2c

Browse files
committed
refinement of ollama queries and posterior decision process
1 parent 31bd165 commit 2fcbd2c

File tree

19 files changed

+1855
-906
lines changed

19 files changed

+1855
-906
lines changed

build/lib/biotoolsllmannotate/__main__.py

Lines changed: 39 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -21,8 +21,20 @@ def _fallback_main() -> None:
2121
p_run.add_argument("--from-date")
2222
p_run.add_argument("--to-date")
2323
p_run.add_argument("--min-score", type=float)
24-
p_run.add_argument("--min-bio-score", type=float)
25-
p_run.add_argument("--min-doc-score", type=float)
24+
p_run.add_argument(
25+
"--min-bio-score-add",
26+
"--min-bio-score",
27+
dest="min_bio_score_add",
28+
type=float,
29+
)
30+
p_run.add_argument("--min-bio-score-review", type=float)
31+
p_run.add_argument(
32+
"--min-doc-score-add",
33+
"--min-doc-score",
34+
dest="min_doc_score_add",
35+
type=float,
36+
)
37+
p_run.add_argument("--min-doc-score-review", type=float)
2638
p_run.add_argument("--limit", type=int)
2739
p_run.add_argument("--dry-run", action="store_true")
2840
p_run.add_argument("--model", default="llama3.2")
@@ -34,22 +46,35 @@ def _fallback_main() -> None:
3446
return
3547
if args.command == "run":
3648
from_date = args.from_date or "7d"
37-
min_bio_score = args.min_bio_score
38-
min_doc_score = args.min_doc_score
49+
bio_add = args.min_bio_score_add
50+
bio_review = args.min_bio_score_review
51+
doc_add = args.min_doc_score_add
52+
doc_review = args.min_doc_score_review
53+
3954
if args.min_score is not None:
40-
if min_bio_score is None:
41-
min_bio_score = args.min_score
42-
if min_doc_score is None:
43-
min_doc_score = args.min_score
44-
if min_bio_score is None:
45-
min_bio_score = 0.6
46-
if min_doc_score is None:
47-
min_doc_score = 0.6
55+
if bio_add is None:
56+
bio_add = args.min_score
57+
if bio_review is None:
58+
bio_review = args.min_score
59+
if doc_add is None:
60+
doc_add = args.min_score
61+
if doc_review is None:
62+
doc_review = args.min_score
63+
64+
bio_add = 0.6 if bio_add is None else max(0.0, min(bio_add, 1.0))
65+
doc_add = 0.6 if doc_add is None else max(0.0, min(doc_add, 1.0))
66+
bio_review = 0.5 if bio_review is None else max(0.0, min(bio_review, 1.0))
67+
doc_review = 0.5 if doc_review is None else max(0.0, min(doc_review, 1.0))
68+
69+
if bio_review > bio_add:
70+
bio_review = bio_add
71+
if doc_review > doc_add:
72+
doc_review = doc_add
4873
execute_run(
4974
from_date=from_date,
5075
to_date=args.to_date,
51-
min_bio_score=min_bio_score,
52-
min_doc_score=min_doc_score,
76+
bio_thresholds=(bio_review, bio_add),
77+
doc_thresholds=(doc_review, doc_add),
5378
limit=args.limit,
5479
dry_run=args.dry_run,
5580
model=args.model,

build/lib/biotoolsllmannotate/assess/scorer.py

Lines changed: 44 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
from typing import Any, Dict, List, Optional, Tuple, Union
44

55
from .ollama_client import OllamaClient, OllamaConnectionError
6-
from biotoolsllmannotate.config import get_config_yaml
6+
from biotoolsllmannotate.config import DEFAULT_CONFIG_YAML, get_config_yaml
77
from biotoolsllmannotate.enrich import is_probable_publication_url
88

99

@@ -283,6 +283,43 @@ def _score_from_response(
283283
return clamp_score(averaged), breakdown
284284

285285

286+
def _documentation_score_v2(breakdown, fallback: float | None) -> float:
287+
weights = {"B1": 2.0, "B2": 1.0, "B3": 1.0, "B4": 1.0, "B5": 2.0}
288+
denominator = sum(weights.values())
289+
290+
if isinstance(breakdown, Mapping):
291+
numerator = 0.0
292+
have_any = False
293+
for key, weight in weights.items():
294+
raw = breakdown.get(key)
295+
if raw is not None:
296+
have_any = True
297+
value = _coerce_float(raw)
298+
if value is None:
299+
value = 0.0
300+
numerator += clamp_score(value) * weight
301+
if have_any:
302+
return clamp_score(numerator / denominator)
303+
304+
if isinstance(breakdown, Sequence) and not isinstance(breakdown, (str, bytes)):
305+
items = list(breakdown)
306+
numerator = 0.0
307+
have_any = False
308+
for idx, key in enumerate(weights):
309+
raw = items[idx] if idx < len(items) else None
310+
if raw is not None:
311+
have_any = True
312+
value = _coerce_float(raw)
313+
if value is None:
314+
value = 0.0
315+
numerator += clamp_score(value) * list(weights.values())[idx]
316+
if have_any:
317+
return clamp_score(numerator / denominator)
318+
319+
fallback_value = fallback if fallback is not None else 0.0
320+
return clamp_score(fallback_value)
321+
322+
286323
def _candidate_homepage(candidate: dict) -> str:
287324
homepage = candidate.get("homepage")
288325
if isinstance(homepage, str):
@@ -445,74 +482,17 @@ def score_candidate(self, candidate: Dict[str, Any]) -> Dict[str, Any]:
445482
}
446483
result["bio_subscores"] = bio_breakdown or {}
447484
result["documentation_subscores"] = doc_breakdown or {}
485+
doc_score_v2 = _documentation_score_v2(doc_breakdown, doc_score)
486+
if doc_score_v2 != doc_score:
487+
result["documentation_score_raw"] = doc_score
488+
result["doc_score_v2"] = doc_score_v2
489+
result["documentation_score"] = doc_score_v2
448490
return result
449491

450492
def _build_prompt(self, candidate: dict) -> str:
451493
template = self.config.get("scoring_prompt_template")
452494
if not template:
453-
template = """You are evaluating whether a software resource is worth getting registered in bio.tools, the registry for software resources in the life sciences.
454-
455-
Available material:
456-
457-
Title: {title}
458-
Description: {description}
459-
Homepage: {homepage}
460-
Homepage status: {homepage_status}
461-
Homepage error: {homepage_error}
462-
Documentation links: {documentation}
463-
Documentation keywords found on homepage: {documentation_keywords}
464-
Repository: {repository}
465-
Found keywords: {tags}
466-
Published: {published_at}
467-
Publication abstract: {publication_abstract}
468-
Publication full text: {publication_full_text}
469-
Known publication identifiers: {publication_ids}
470-
471-
Task:
472-
Score the resource using the rubric below. For every subcriterion assign exactly one of {{0, 0.5, 1}}. Base every decision only on the provided material. Do not invent facts or URLs. If the resource is not life-science software, set ALL bio subcriteria A1–A5 = 0 and explain why in the rationale.
473-
474-
Bio score rubric
475-
A1 Biological intent stated (explicit life-science task/domain).
476-
A2 Operations on biological data described
477-
A3 Software with biological data I/O: 0 = none; 0.5 = only generic; 1 = concrete datatypes/formats named.
478-
A4 Modality explicitly classifiable as one or more of: database portal, desktop application, web application, web API, web service, SPARQL endpoint, command-line tool (CLI), workbench, suite, plug-in, workflow, library, ontology. Include minimal usage context.
479-
A5 Evidence of bio use (examples on real bio data OR peer-reviewed/benchmark citation).
480-
481-
Documentation score rubric (subcriteria only; no overall score here)
482-
B1 Documentation completeness (e.g. manual, guide, readthedocs).
483-
B2 Installation pathways (e.g. installation/setup, config, container, package).
484-
B3 Reproducibility aids (e.g. doi, release).
485-
B4 Maintenance signal (e.g. commits, issue tracker, news).
486-
B5 Onboarding & support (e.g. quickstart/tutorial, contact, faq).
487-
488-
Selection/normalization rules:
489-
490-
Base every decision on the supplied material only.
491-
Normalize publication identifiers to prefixes: DOI:..., PMID:..., PMCID:... and remove duplicates (case-insensitive).
492-
For any subcriterion scored 0 due to missing evidence, mention "insufficient evidence: <item>" in the rationale.
493-
Record each bio subcriterion as numbers {{0,0.5,1}} in `bio_subscores` and each documentation subcriterion as numbers {{0,0.5,1}} in `documentation_subscores`.
494-
Provide `confidence_score` as a number between 0 and 1 summarizing your certainty in the assessment (higher means more confident).
495-
Do NOT compute aggregate scores; only fill the provided fields.
496-
Do not output any value outside [0.0, 1.0].
497-
Always emit every field in the output JSON exactly once.
498-
Emit ONLY the fields in the schema below. Use "" for unknown strings and [] if no publication identifiers are found. Do not output booleans/strings instead of numbers.
499-
500-
JSON schema describing the required output:
501-
{json_schema}
502-
503-
Before replying, validate your draft against this schema. If the JSON does not pass validation, fix it and revalidate until it does. Output only the validated JSON; never include commentary or surrounding text.
504-
505-
Output: respond ONLY with a single JSON object shaped as:
506-
{{
507-
"tool_name": "<derived display name>",
508-
"homepage": "<best homepage URL>",
509-
"publication_ids": ["DOI:...", "PMID:...", "PMCID:..."],
510-
"bio_subscores": {{"A1": <0|0.5|1>, "A2": <0|0.5|1>, "A3": <0|0.5|1>, "A4": <0|0.5|1>, "A5": <0|0.5|1>}},
511-
"documentation_subscores": {{"B1": <0|0.5|1>, "B2": <0|0.5|1>, "B3": <0|0.5|1>, "B4": <0|0.5|1>, "B5": <0|0.5|1>}},
512-
"confidence_score": <0–1 numeric confidence>,
513-
"concise_description": "<1–2 sentence rewritten summary>",
514-
"rationale": "<2–5 sentences citing specific evidence for both score groups; for each claim indicate the source as one of: homepage, documentation, repository, abstract, full_text, tags; explicitly name missing items as 'insufficient evidence: ...'>"
515-
}}"""
495+
template = DEFAULT_CONFIG_YAML["scoring_prompt_template"]
516496

517497
publication_ids = candidate.get("publication_ids") or []
518498
documentation_value = candidate.get("documentation")

build/lib/biotoolsllmannotate/cli/main.py

Lines changed: 83 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -75,19 +75,35 @@ def _run_impl(
7575
max=1.0,
7676
help="Legacy combined threshold applied to both bio and documentation scores when separate thresholds are not provided.",
7777
),
78-
min_bio_score: float | None = typer.Option(
78+
min_bio_score_add: float | None = typer.Option(
7979
None,
80+
"--min-bio-score-add",
8081
"--min-bio-score",
8182
min=0.0,
8283
max=1.0,
83-
help="Minimum bio score required for inclusion (overrides pipeline.min_bio_score).",
84+
help="Minimum bio score required for an automatic add decision (overrides pipeline.bio_score_thresholds.add).",
8485
),
85-
min_doc_score: float | None = typer.Option(
86+
min_bio_score_review: float | None = typer.Option(
8687
None,
88+
"--min-bio-score-review",
89+
min=0.0,
90+
max=1.0,
91+
help="Minimum bio score required to trigger a manual review decision (overrides pipeline.bio_score_thresholds.review).",
92+
),
93+
min_doc_score_add: float | None = typer.Option(
94+
None,
95+
"--min-doc-score-add",
8796
"--min-doc-score",
8897
min=0.0,
8998
max=1.0,
90-
help="Minimum documentation score required for inclusion (overrides pipeline.min_documentation_score).",
99+
help="Minimum documentation score required for an automatic add decision (overrides pipeline.documentation_score_thresholds.add).",
100+
),
101+
min_doc_score_review: float | None = typer.Option(
102+
None,
103+
"--min-doc-score-review",
104+
min=0.0,
105+
max=1.0,
106+
help="Minimum documentation score required to trigger a manual review decision (overrides pipeline.documentation_score_thresholds.review).",
91107
),
92108
limit: int | None = typer.Option(
93109
None, "--limit", help="Max candidates to process."
@@ -234,11 +250,10 @@ def _run_impl(
234250
registry_path = config_registry
235251

236252
if resume_from_pub2tools and input_path:
237-
typer.echo(
238-
"--resume-from-pub2tools cannot be used together with --input or pipeline.input_path",
239-
err=True,
253+
raise typer.BadParameter(
254+
"cannot be used together with --input or pipeline.input_path",
255+
param_hint="--resume-from-pub2tools",
240256
)
241-
raise typer.Exit(code=2)
242257

243258
# Determine score thresholds (CLI > legacy min-score > config > default)
244259
def _coerce_threshold(value, default):
@@ -247,24 +262,66 @@ def _coerce_threshold(value, default):
247262
except (TypeError, ValueError):
248263
return default
249264

250-
config_min_bio = pipeline_cfg.get("min_bio_score")
251-
config_min_doc = pipeline_cfg.get("min_documentation_score")
265+
bio_cfg = pipeline_cfg.get("bio_score_thresholds") or {}
266+
doc_cfg = pipeline_cfg.get("documentation_score_thresholds") or {}
267+
268+
config_bio_add = bio_cfg.get("add")
269+
config_bio_review = bio_cfg.get("review")
270+
config_doc_add = doc_cfg.get("add")
271+
config_doc_review = doc_cfg.get("review")
272+
273+
# Legacy key fallbacks
274+
if config_bio_add is None:
275+
config_bio_add = pipeline_cfg.get("min_bio_score")
276+
if config_doc_add is None:
277+
config_doc_add = pipeline_cfg.get("min_documentation_score")
278+
if config_bio_review is None:
279+
config_bio_review = pipeline_cfg.get("min_bio_score_review")
280+
if config_doc_review is None:
281+
config_doc_review = pipeline_cfg.get("min_documentation_score_review")
282+
283+
DEFAULT_BIO_ADD = 0.6
284+
DEFAULT_BIO_REVIEW = 0.5
285+
DEFAULT_DOC_ADD = 0.6
286+
DEFAULT_DOC_REVIEW = 0.5
252287

253288
if min_score is not None:
254-
if min_bio_score is None:
255-
min_bio_score = min_score
256-
if min_doc_score is None:
257-
min_doc_score = min_score
258-
259-
if min_bio_score is None:
260-
min_bio_score = _coerce_threshold(config_min_bio, 0.6)
261-
else:
262-
min_bio_score = _coerce_threshold(min_bio_score, 0.6)
263-
264-
if min_doc_score is None:
265-
min_doc_score = _coerce_threshold(config_min_doc, 0.6)
266-
else:
267-
min_doc_score = _coerce_threshold(min_doc_score, 0.6)
289+
if min_bio_score_add is None:
290+
min_bio_score_add = min_score
291+
if min_doc_score_add is None:
292+
min_doc_score_add = min_score
293+
if min_bio_score_review is None:
294+
min_bio_score_review = min_score
295+
if min_doc_score_review is None:
296+
min_doc_score_review = min_score
297+
298+
bio_add_threshold = _coerce_threshold(
299+
min_bio_score_add if min_bio_score_add is not None else config_bio_add,
300+
DEFAULT_BIO_ADD,
301+
)
302+
doc_add_threshold = _coerce_threshold(
303+
min_doc_score_add if min_doc_score_add is not None else config_doc_add,
304+
DEFAULT_DOC_ADD,
305+
)
306+
307+
bio_review_threshold = _coerce_threshold(
308+
min_bio_score_review if min_bio_score_review is not None else config_bio_review,
309+
DEFAULT_BIO_REVIEW,
310+
)
311+
doc_review_threshold = _coerce_threshold(
312+
min_doc_score_review if min_doc_score_review is not None else config_doc_review,
313+
DEFAULT_DOC_REVIEW,
314+
)
315+
316+
bio_review_threshold = max(0.0, min(bio_review_threshold, 1.0))
317+
bio_add_threshold = max(0.0, min(bio_add_threshold, 1.0))
318+
doc_review_threshold = max(0.0, min(doc_review_threshold, 1.0))
319+
doc_add_threshold = max(0.0, min(doc_add_threshold, 1.0))
320+
321+
if bio_review_threshold > bio_add_threshold:
322+
bio_review_threshold = bio_add_threshold
323+
if doc_review_threshold > doc_add_threshold:
324+
doc_review_threshold = doc_add_threshold
268325

269326
# Set logging level
270327
import logging
@@ -292,8 +349,8 @@ def _coerce_threshold(value, default):
292349
execute_run(
293350
from_date=from_date,
294351
to_date=to_date,
295-
min_bio_score=min_bio_score,
296-
min_doc_score=min_doc_score,
352+
bio_thresholds=(bio_review_threshold, bio_add_threshold),
353+
doc_thresholds=(doc_review_threshold, doc_add_threshold),
297354
limit=limit,
298355
dry_run=dry_run,
299356
model=model,
@@ -315,7 +372,6 @@ def _coerce_threshold(value, default):
315372
)
316373
except Exception as e:
317374
import traceback
318-
import typer
319375

320376
typer.echo("\nERROR: Unhandled exception in pipeline:", err=True)
321377
typer.echo(str(e), err=True)

0 commit comments

Comments (0)