Skip to content

Commit d063428

Browse files
authored
chore: add initial github ci & fix tests (#17)
* chore: add github ci fix: tests * chore: offline mode parsing code deduplication
1 parent a2bfad3 commit d063428

File tree

10 files changed

+364
-705
lines changed

10 files changed

+364
-705
lines changed

.github/workflows/ci.yml

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
name: Tests
2+
3+
on:
4+
push:
5+
pull_request:
6+
workflow_dispatch:
7+
8+
jobs:
9+
pytest:
10+
name: Pytest (${{ matrix.python-version }})
11+
runs-on: ubuntu-latest
12+
strategy:
13+
matrix:
14+
python-version: ["3.10", "3.11", "3.12", "3.13"]
15+
16+
steps:
17+
- uses: actions/checkout@v4
18+
19+
- name: Set up Python
20+
uses: actions/setup-python@v5
21+
with:
22+
python-version: ${{ matrix.python-version }}
23+
cache: "pip"
24+
25+
- name: Install dependencies
26+
run: |
27+
python -m pip install --upgrade pip
28+
python -m pip install -e . pytest
29+
30+
- name: Run tests
31+
run: python -m pytest tests

deepdoc/common/misc_utils.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
#
1616

1717
import logging
18+
import os
1819

1920
logger = logging.getLogger(__name__)
2021

@@ -31,3 +32,13 @@ def pip_install_torch():
3132
return True
3233
except ImportError:
3334
return False
35+
36+
37+
def parse_bool(value: str | None, default: bool = False) -> bool:
38+
if value is None:
39+
return default
40+
return value.strip().lower() in {"1", "true", "yes", "on"}
41+
42+
43+
def offline_mode_or_from_env(offline: bool | None = None) -> bool:
44+
return offline if offline is not None else parse_bool(os.getenv("DEEPDOC_OFFLINE"), default=False)

deepdoc/common/model_store.py

Lines changed: 18 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -22,17 +22,14 @@
2222
import logging
2323
import os
2424
from dataclasses import dataclass
25+
from importlib import resources
2526
from pathlib import Path
27+
from ..common.misc_utils import offline_mode_or_from_env
2628

2729

2830
GLOBAL_MODELSCOPE_REPO_ENV = "DEEPDOC_MODELSCOPE_REPO"
2931
GLOBAL_MODELSCOPE_REVISION_ENV = "DEEPDOC_MODELSCOPE_REVISION"
30-
31-
32-
def _parse_bool(value: str | None, default: bool = False) -> bool:
33-
if value is None:
34-
return default
35-
return value.strip().lower() in {"1", "true", "yes", "on"}
32+
TOKENIZER_MODEL_DIR_ENV = "DEEPDOC_TOKENIZER_MODEL_DIR"
3633

3734

3835
def _normalize_provider(provider: str | None) -> str:
@@ -56,6 +53,18 @@ def _model_home_path(model_home: str | None) -> Path:
5653
return Path.home().joinpath(".cache", "deepdoc")
5754

5855

56+
def _resolve_tokenizer_dict_path() -> Path:
57+
configured_dir = os.getenv(TOKENIZER_MODEL_DIR_ENV)
58+
if configured_dir:
59+
dictionary = Path(configured_dir).expanduser().resolve().joinpath("huqie.txt")
60+
else:
61+
dictionary = Path(str(resources.files("deepdoc").joinpath("dict", "huqie.txt"))).resolve()
62+
63+
if not dictionary.exists():
64+
raise FileNotFoundError("Tokenizer dictionary not found: {}. Set {} to a directory containing huqie.txt.".format(dictionary, TOKENIZER_MODEL_DIR_ENV))
65+
return dictionary
66+
67+
5968
@dataclass(frozen=True)
6069
class BundleSpec:
6170
name: str
@@ -243,7 +252,7 @@ def resolve_bundle_dir(
243252

244253
spec = BUNDLES[bundle]
245254
provider_name = _normalize_provider(provider)
246-
offline_mode = offline if offline is not None else _parse_bool(os.getenv("DEEPDOC_OFFLINE"), default=False)
255+
offline_mode = offline_mode_or_from_env(offline)
247256

248257
explicit_local = os.getenv(spec.local_dir_env)
249258
if explicit_local:
@@ -356,5 +365,5 @@ def resolve_tokenizer_dict_prefix(
356365
provider: str | None = None,
357366
offline: bool | None = None,
358367
) -> str:
359-
bundle_dir = Path(resolve_bundle_dir("tokenizer", model_home=model_home, provider=provider, offline=offline))
360-
return str(bundle_dir.joinpath("huqie"))
368+
del model_home, provider, offline
369+
return str(_resolve_tokenizer_dict_path().with_suffix(""))

deepdoc/config.py

Lines changed: 6 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -11,16 +11,11 @@
1111
resolve_xgb_model_dir,
1212
validate_bundle_dir,
1313
)
14+
from .common.misc_utils import offline_mode_or_from_env
1415

1516
ProviderType = Literal["local", "modelscope", "auto"]
1617

1718

18-
def _parse_bool(value: str | None, default: bool = False) -> bool:
19-
if value is None:
20-
return default
21-
return value.strip().lower() in {"1", "true", "yes", "on"}
22-
23-
2419
def _normalize_provider(provider: str) -> ProviderType:
2520
normalized = provider.strip().lower()
2621
aliases = {
@@ -31,9 +26,7 @@ def _normalize_provider(provider: str) -> ProviderType:
3126
}
3227
normalized = aliases.get(normalized, normalized)
3328
if normalized not in {"local", "modelscope", "auto"}:
34-
raise ValueError(
35-
"Unsupported model provider '{}'. Use one of: local, modelscope, auto.".format(provider)
36-
)
29+
raise ValueError("Unsupported model provider '{}'. Use one of: local, modelscope, auto.".format(provider))
3730
return normalized # type: ignore[return-value]
3831

3932

@@ -64,13 +57,10 @@ def resolve_dict_path(self) -> str:
6457
if dictionary.is_dir():
6558
dictionary = dictionary.joinpath("huqie.txt")
6659
if dictionary.suffix != ".txt":
67-
raise ValueError(
68-
"TokenizerConfig.dict_path must point to a '.txt' dictionary file, got: {}".format(dictionary)
69-
)
60+
raise ValueError("TokenizerConfig.dict_path must point to a '.txt' dictionary file, got: {}".format(dictionary))
7061
_require_file(
7162
dictionary,
72-
"Tokenizer dictionary not found: {}. Provide a valid TokenizerConfig.dict_path."
73-
.format(dictionary),
63+
"Tokenizer dictionary not found: {}. Provide a valid TokenizerConfig.dict_path.".format(dictionary),
7464
)
7565
return str(dictionary)
7666

@@ -86,7 +76,7 @@ def from_env(cls) -> "TokenizerConfig":
8676

8777
return cls(
8878
dict_path=dict_path,
89-
offline=_parse_bool(os.getenv("DEEPDOC_OFFLINE"), default=False),
79+
offline=offline_mode_or_from_env(None),
9080
nltk_data_dir=os.getenv("DEEPDOC_NLTK_DATA_DIR"),
9181
)
9282

@@ -107,10 +97,7 @@ def _resolve_bundle_dir(self, bundle: str, explicit_dir: str | None) -> str:
10797
candidate = Path(explicit_dir).expanduser().resolve()
10898
exists, missing = validate_bundle_dir(bundle, candidate)
10999
if not exists:
110-
raise FileNotFoundError(
111-
"Missing required files for '{}' bundle in {}: {}"
112-
.format(bundle, candidate, ", ".join(missing))
113-
)
100+
raise FileNotFoundError("Missing required files for '{}' bundle in {}: {}".format(bundle, candidate, ", ".join(missing)))
114101
return str(candidate)
115102

116103
model_provider = self.normalized_provider()

deepdoc/depend/nltk_manager.py

Lines changed: 5 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -2,19 +2,13 @@
22
import os
33
import threading
44
from pathlib import Path
5+
from ..common.misc_utils import offline_mode_or_from_env
56

67
logger = logging.getLogger(__name__)
78

89
_RESOURCE_SPECS: tuple[tuple[str, tuple[str, ...]], ...] = (
9-
(
10-
"punkt",
11-
(
12-
"tokenizers/punkt",
13-
"tokenizers/punkt.zip",
14-
"tokenizers/punkt_tab",
15-
"tokenizers/punkt_tab.zip",
16-
),
17-
),
10+
("punkt", ("tokenizers/punkt", "tokenizers/punkt.zip")),
11+
("punkt_tab", ("tokenizers/punkt_tab", "tokenizers/punkt_tab.zip")),
1812
("wordnet", ("corpora/wordnet", "corpora/wordnet.zip")),
1913
(
2014
"averaged_perceptron_tagger",
@@ -31,12 +25,6 @@
3125
_ensured_keys: set[tuple[str, bool]] = set()
3226

3327

34-
def _parse_bool(value: str | None, default: bool = False) -> bool:
35-
if value is None:
36-
return default
37-
return value.strip().lower() in {"1", "true", "yes", "on"}
38-
39-
4028
def _resolve_nltk_data_dir(data_dir: str | None) -> Path | None:
4129
# Resolution precedence:
4230
# 1) explicit arg
@@ -88,7 +76,7 @@ def ensure_nltk_data(
8876
import nltk
8977

9078
resolved_dir = _resolve_nltk_data_dir(data_dir)
91-
offline_mode = offline if offline is not None else _parse_bool(os.getenv("DEEPDOC_OFFLINE"), default=False)
79+
offline_mode = offline_mode_or_from_env(offline)
9280
auto_download_mode = not offline_mode
9381

9482
_ensure_search_path(nltk, resolved_dir)
@@ -117,10 +105,7 @@ def ensure_nltk_data(
117105
if missing_packages:
118106
searched_paths = ", ".join(nltk.data.path)
119107
raise RuntimeError(
120-
"Missing required NLTK packages: {}. Searched paths: {}. "
121-
"Set DEEPDOC_NLTK_DATA_DIR to a local NLTK data path, or disable offline mode by setting "
122-
"DEEPDOC_OFFLINE=0."
123-
.format(
108+
"Missing required NLTK packages: {}. Searched paths: {}. Set DEEPDOC_NLTK_DATA_DIR to a local NLTK data path, or disable offline mode by setting DEEPDOC_OFFLINE=0.".format(
124109
", ".join(missing_packages),
125110
searched_paths,
126111
)

pyproject.toml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,11 @@ extend-select = ["ASYNC", "ASYNC1"]
8989
ignore = ["E402"]
9090

9191
[tool.pytest.ini_options]
92+
addopts = [
93+
"--ignore=tests/test_data",
94+
"--ignore=tests/test_results",
95+
]
96+
testpaths = ["tests"]
9297
markers = [
9398
"p1: high priority test cases",
9499
"p2: medium priority test cases",

tests/test_model_store.py

Lines changed: 16 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -22,10 +22,6 @@ def _create_combined_repo_layout(root: Path) -> None:
2222
for name in ms.BUNDLES["xgb"].required_files:
2323
_touch(root / "xgb" / name)
2424

25-
# Tokenizer bundle
26-
for name in ms.BUNDLES["tokenizer"].required_files:
27-
_touch(root / "tokenizer" / name)
28-
2925

3026
class TestModelStoreSharedRepo(unittest.TestCase):
3127
def setUp(self) -> None:
@@ -68,12 +64,10 @@ def snapshot_download(
6864
with patch.object(ms, "_import_modelscope_snapshot_download", return_value=snapshot_download):
6965
vision_dir = Path(ms.resolve_bundle_dir("vision", model_home=tmp, provider="modelscope", offline=False))
7066
xgb_dir = Path(ms.resolve_bundle_dir("xgb", model_home=tmp, provider="modelscope", offline=False))
71-
tok_dir = Path(ms.resolve_bundle_dir("tokenizer", model_home=tmp, provider="modelscope", offline=False))
7267

7368
expected_root = (Path(tmp) / "modelscope" / "Xorbits__deepdoc" / "v1").resolve()
7469
self.assertEqual(vision_dir.resolve(), (expected_root / "vision").resolve())
7570
self.assertEqual(xgb_dir.resolve(), (expected_root / "xgb").resolve())
76-
self.assertEqual(tok_dir.resolve(), (expected_root / "tokenizer").resolve())
7771

7872
self.assertGreaterEqual(len(calls), 1)
7973
for call in calls:
@@ -134,8 +128,22 @@ def snapshot_download(*args, **kwargs) -> str: # pragma: no cover
134128
with patch.object(ms, "_import_modelscope_snapshot_download", return_value=snapshot_download):
135129
vision_dir = Path(ms.resolve_bundle_dir("vision", model_home=tmp, provider="auto", offline=False))
136130
xgb_dir = Path(ms.resolve_bundle_dir("xgb", model_home=tmp, provider="auto", offline=False))
137-
tok_dir = Path(ms.resolve_bundle_dir("tokenizer", model_home=tmp, provider="auto", offline=False))
138131

139132
self.assertEqual(vision_dir.resolve(), (expected_root / "vision").resolve())
140133
self.assertEqual(xgb_dir.resolve(), (expected_root / "xgb").resolve())
141-
self.assertEqual(tok_dir.resolve(), (expected_root / "tokenizer").resolve())
134+
135+
def test_resolve_tokenizer_dict_prefix_uses_packaged_dict_by_default(self) -> None:
136+
prefix = Path(ms.resolve_tokenizer_dict_prefix())
137+
138+
self.assertEqual(prefix.name, "huqie")
139+
self.assertTrue(prefix.with_suffix(".txt").exists())
140+
141+
def test_resolve_tokenizer_dict_prefix_uses_env_dir_when_set(self) -> None:
142+
with tempfile.TemporaryDirectory() as tmp:
143+
tokenizer_dir = Path(tmp) / "tokenizer"
144+
_touch(tokenizer_dir / "huqie.txt")
145+
os.environ[ms.TOKENIZER_MODEL_DIR_ENV] = str(tokenizer_dir)
146+
147+
prefix = Path(ms.resolve_tokenizer_dict_prefix())
148+
149+
self.assertEqual(prefix.resolve(), (tokenizer_dir / "huqie").resolve())

0 commit comments

Comments (0)