Skip to content

Commit bc0c780

Browse files
committed
chore: offline mode parsing code deduplication
1 parent b185b5a commit bc0c780

File tree

4 files changed

+23
-41
lines changed

4 files changed

+23
-41
lines changed

deepdoc/common/misc_utils.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
#
1616

1717
import logging
18+
import os
1819

1920
logger = logging.getLogger(__name__)
2021

@@ -31,3 +32,13 @@ def pip_install_torch():
3132
return True
3233
except ImportError:
3334
return False
35+
36+
37+
def parse_bool(value: str | None, default: bool = False) -> bool:
38+
if value is None:
39+
return default
40+
return value.strip().lower() in {"1", "true", "yes", "on"}
41+
42+
43+
def offline_mode_or_from_env(offline: bool | None = None) -> bool:
44+
return offline if offline is not None else parse_bool(os.getenv("DEEPDOC_OFFLINE"), default=False)

deepdoc/common/model_store.py

Lines changed: 3 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -24,19 +24,14 @@
2424
from dataclasses import dataclass
2525
from importlib import resources
2626
from pathlib import Path
27+
from ..common.misc_utils import offline_mode_or_from_env
2728

2829

2930
GLOBAL_MODELSCOPE_REPO_ENV = "DEEPDOC_MODELSCOPE_REPO"
3031
GLOBAL_MODELSCOPE_REVISION_ENV = "DEEPDOC_MODELSCOPE_REVISION"
3132
TOKENIZER_MODEL_DIR_ENV = "DEEPDOC_TOKENIZER_MODEL_DIR"
3233

3334

34-
def _parse_bool(value: str | None, default: bool = False) -> bool:
35-
if value is None:
36-
return default
37-
return value.strip().lower() in {"1", "true", "yes", "on"}
38-
39-
4035
def _normalize_provider(provider: str | None) -> str:
4136
normalized = (provider or os.getenv("DEEPDOC_MODEL_PROVIDER", "auto")).strip().lower()
4237
aliases = {
@@ -66,10 +61,7 @@ def _resolve_tokenizer_dict_path() -> Path:
6661
dictionary = Path(str(resources.files("deepdoc").joinpath("dict", "huqie.txt"))).resolve()
6762

6863
if not dictionary.exists():
69-
raise FileNotFoundError(
70-
"Tokenizer dictionary not found: {}. Set {} to a directory containing huqie.txt."
71-
.format(dictionary, TOKENIZER_MODEL_DIR_ENV)
72-
)
64+
raise FileNotFoundError("Tokenizer dictionary not found: {}. Set {} to a directory containing huqie.txt.".format(dictionary, TOKENIZER_MODEL_DIR_ENV))
7365
return dictionary
7466

7567

@@ -260,7 +252,7 @@ def resolve_bundle_dir(
260252

261253
spec = BUNDLES[bundle]
262254
provider_name = _normalize_provider(provider)
263-
offline_mode = offline if offline is not None else _parse_bool(os.getenv("DEEPDOC_OFFLINE"), default=False)
255+
offline_mode = offline_mode_or_from_env(offline)
264256

265257
explicit_local = os.getenv(spec.local_dir_env)
266258
if explicit_local:

deepdoc/config.py

Lines changed: 6 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -11,16 +11,11 @@
1111
resolve_xgb_model_dir,
1212
validate_bundle_dir,
1313
)
14+
from .common.misc_utils import offline_mode_or_from_env
1415

1516
ProviderType = Literal["local", "modelscope", "auto"]
1617

1718

18-
def _parse_bool(value: str | None, default: bool = False) -> bool:
19-
if value is None:
20-
return default
21-
return value.strip().lower() in {"1", "true", "yes", "on"}
22-
23-
2419
def _normalize_provider(provider: str) -> ProviderType:
2520
normalized = provider.strip().lower()
2621
aliases = {
@@ -31,9 +26,7 @@ def _normalize_provider(provider: str) -> ProviderType:
3126
}
3227
normalized = aliases.get(normalized, normalized)
3328
if normalized not in {"local", "modelscope", "auto"}:
34-
raise ValueError(
35-
"Unsupported model provider '{}'. Use one of: local, modelscope, auto.".format(provider)
36-
)
29+
raise ValueError("Unsupported model provider '{}'. Use one of: local, modelscope, auto.".format(provider))
3730
return normalized # type: ignore[return-value]
3831

3932

@@ -64,13 +57,10 @@ def resolve_dict_path(self) -> str:
6457
if dictionary.is_dir():
6558
dictionary = dictionary.joinpath("huqie.txt")
6659
if dictionary.suffix != ".txt":
67-
raise ValueError(
68-
"TokenizerConfig.dict_path must point to a '.txt' dictionary file, got: {}".format(dictionary)
69-
)
60+
raise ValueError("TokenizerConfig.dict_path must point to a '.txt' dictionary file, got: {}".format(dictionary))
7061
_require_file(
7162
dictionary,
72-
"Tokenizer dictionary not found: {}. Provide a valid TokenizerConfig.dict_path."
73-
.format(dictionary),
63+
"Tokenizer dictionary not found: {}. Provide a valid TokenizerConfig.dict_path.".format(dictionary),
7464
)
7565
return str(dictionary)
7666

@@ -86,7 +76,7 @@ def from_env(cls) -> "TokenizerConfig":
8676

8777
return cls(
8878
dict_path=dict_path,
89-
offline=_parse_bool(os.getenv("DEEPDOC_OFFLINE"), default=False),
79+
offline=offline_mode_or_from_env(None),
9080
nltk_data_dir=os.getenv("DEEPDOC_NLTK_DATA_DIR"),
9181
)
9282

@@ -107,10 +97,7 @@ def _resolve_bundle_dir(self, bundle: str, explicit_dir: str | None) -> str:
10797
candidate = Path(explicit_dir).expanduser().resolve()
10898
exists, missing = validate_bundle_dir(bundle, candidate)
10999
if not exists:
110-
raise FileNotFoundError(
111-
"Missing required files for '{}' bundle in {}: {}"
112-
.format(bundle, candidate, ", ".join(missing))
113-
)
100+
raise FileNotFoundError("Missing required files for '{}' bundle in {}: {}".format(bundle, candidate, ", ".join(missing)))
114101
return str(candidate)
115102

116103
model_provider = self.normalized_provider()

deepdoc/depend/nltk_manager.py

Lines changed: 3 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
import os
33
import threading
44
from pathlib import Path
5+
from ..common.misc_utils import offline_mode_or_from_env
56

67
logger = logging.getLogger(__name__)
78

@@ -24,12 +25,6 @@
2425
_ensured_keys: set[tuple[str, bool]] = set()
2526

2627

27-
def _parse_bool(value: str | None, default: bool = False) -> bool:
28-
if value is None:
29-
return default
30-
return value.strip().lower() in {"1", "true", "yes", "on"}
31-
32-
3328
def _resolve_nltk_data_dir(data_dir: str | None) -> Path | None:
3429
# Resolution precedence:
3530
# 1) explicit arg
@@ -81,7 +76,7 @@ def ensure_nltk_data(
8176
import nltk
8277

8378
resolved_dir = _resolve_nltk_data_dir(data_dir)
84-
offline_mode = offline if offline is not None else _parse_bool(os.getenv("DEEPDOC_OFFLINE"), default=False)
79+
offline_mode = offline_mode_or_from_env(offline)
8580
auto_download_mode = not offline_mode
8681

8782
_ensure_search_path(nltk, resolved_dir)
@@ -110,10 +105,7 @@ def ensure_nltk_data(
110105
if missing_packages:
111106
searched_paths = ", ".join(nltk.data.path)
112107
raise RuntimeError(
113-
"Missing required NLTK packages: {}. Searched paths: {}. "
114-
"Set DEEPDOC_NLTK_DATA_DIR to a local NLTK data path, or disable offline mode by setting "
115-
"DEEPDOC_OFFLINE=0."
116-
.format(
108+
"Missing required NLTK packages: {}. Searched paths: {}. Set DEEPDOC_NLTK_DATA_DIR to a local NLTK data path, or disable offline mode by setting DEEPDOC_OFFLINE=0.".format(
117109
", ".join(missing_packages),
118110
searched_paths,
119111
)

0 commit comments

Comments
 (0)