
Commit 7543331

Feat: implements better model files configuration (#10)

* feat: switch model artifacts to model store resolver
* refactor: add model config
* fix: exceptions on close
* refactor: publish to pypi
* feat: add model download script
* refactor: use default config for parsers
* feat: implement download and configure tiktoken cache

1 parent: 356098f


41 files changed (+1520, −7069 lines)

.github/workflows/publish.yml

Lines changed: 66 additions & 86 deletions
@@ -1,96 +1,76 @@
-name: Publish Python 🐍 distribution 📦 to GitHub Pages
+name: Publish Python 🐍 distribution 📦 to PyPI and TestPyPI
 
-on:
-  push:
-    branches:
-      - master
-  workflow_dispatch:
-
-permissions:
-  contents: write
-  pages: write
-  id-token: write
-
-concurrency:
-  group: "pages"
-  cancel-in-progress: false
+on: push
 
 jobs:
-  build:
-    name: Build distribution 📦
-    runs-on: ubuntu-latest
-
-    steps:
-    - uses: actions/checkout@v4
-      with:
-        persist-credentials: false
-        fetch-depth: 0
-
-    - name: Set up Python
-      uses: actions/setup-python@v5
-      with:
-        python-version: "3.x"
-
-    - name: Install pypa/build
-      run: |
-        python3 -m pip install build --user
-
-    - name: Build a binary wheel and a source tarball
-      run: python3 -m build
-
-    - name: Store the distribution packages
-      uses: actions/upload-artifact@v4
-      with:
-        name: python-package-distributions
-        path: dist/
-
-  publish-to-github-pages:
-    name: Publish 📦 to GitHub Pages
-    needs: build
-    runs-on: ubuntu-latest
-    environment:
-      name: github-pages
-      url: ${{ steps.deployment.outputs.page_url }}
+  build:
+    name: Build distribution 📦
+    runs-on: ubuntu-latest
 
-    steps:
-    - name: Checkout
-      uses: actions/checkout@v4
-      with:
-        fetch-depth: 0
+    steps:
+    - uses: actions/checkout@v4
+      with:
+        persist-credentials: false
+    - name: Set up Python
+      uses: actions/setup-python@v5
+      with:
+        python-version: "3.x"
+    - name: Install pypa/build
+      run: >-
+        python3 -m
+        pip install
+        build
+        --user
+    - name: Build a binary wheel and a source tarball
+      run: python3 -m build
+    - name: Store the distribution packages
+      uses: actions/upload-artifact@v4
+      with:
+        name: python-package-distributions
+        path: dist/
 
-    - name: Download all the dists
-      uses: actions/download-artifact@v4
-      with:
-        name: python-package-distributions
-        path: dist/
+  publish-to-pypi:
+    name: >-
+      Publish Python 🐍 distribution 📦 to PyPI
+    if: startsWith(github.ref, 'refs/tags/') # only publish to PyPI on tag pushes
+    needs:
+    - build
+    runs-on: ubuntu-latest
+    environment:
+      name: pypi
+      url: https://pypi.org/p/deepdoc-lib
+    permissions:
+      id-token: write # IMPORTANT: mandatory for trusted publishing
 
-    - name: Install dumb-pypi
-      run: pip install dumb-pypi
+    steps:
+    - name: Download all the dists
+      uses: actions/download-artifact@v4
+      with:
+        name: python-package-distributions
+        path: dist/
+    - name: Publish distribution 📦 to PyPI
+      uses: pypa/gh-action-pypi-publish@release/v1
 
-    - name: Create package index
-      run: |
-        # Put wheels in packages directory
-        mkdir -p index/packages
-        cp dist/*.whl index/packages/
-
-        # Create package list
-        ls index/packages/*.whl | xargs -n 1 basename > package_list.txt
-
-        # Generate index pointing to ../packages/
-        dumb-pypi \
-          --package-list package_list.txt \
-          --packages-url ../../packages/ \
-          --output-dir index \
-          --title "Deepdoc PyPI"
+  publish-to-testpypi:
+    name: Publish Python 🐍 distribution 📦 to TestPyPI
+    needs:
+    - build
+    runs-on: ubuntu-latest
 
-    - name: Setup Pages
-      uses: actions/configure-pages@v5
+    environment:
+      name: testpypi
+      url: https://test.pypi.org/p/deepdoc-lib
 
-    - name: Upload artifact
-      uses: actions/upload-pages-artifact@v3
-      with:
-        path: 'index'
+    permissions:
+      id-token: write # IMPORTANT: mandatory for trusted publishing
 
-    - name: Deploy to GitHub Pages
-      id: deployment
-      uses: actions/deploy-pages@v4
+    steps:
+    - name: Download all the dists
+      uses: actions/download-artifact@v4
+      with:
+        name: python-package-distributions
+        path: dist/
+    - name: Publish distribution 📦 to TestPyPI
+      uses: pypa/gh-action-pypi-publish@release/v1
+      with:
+        repository-url: https://test.pypi.org/legacy/
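The `if: startsWith(github.ref, 'refs/tags/')` condition is what lets TestPyPI receive every push while PyPI only receives tagged releases. A minimal Python sketch of that predicate (the function name is mine, not part of the workflow):

```python
def should_publish_to_pypi(github_ref: str) -> bool:
    """Mirror of the workflow gate `if: startsWith(github.ref, 'refs/tags/')`.

    GitHub Actions sets `github.ref` to `refs/tags/<tag>` for tag pushes and
    `refs/heads/<branch>` for branch pushes, so only tag builds pass the gate.
    """
    return github_ref.startswith("refs/tags/")
```

The `publish-to-testpypi` job carries no such condition, so it runs on every push that builds successfully.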

.gitignore

Lines changed: 3 additions & 0 deletions
@@ -219,3 +219,6 @@ __marimo__/
 .streamlit/secrets.toml
 
 .DS_Store
+
+# Tokenizer trie cache
+deepdoc/dict/*.trie
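The new ignore rule targets trie caches under `deepdoc/dict/` only. A rough stdlib approximation of the pattern (gitignore and `fnmatch` globbing differ in edge cases, and `example.trie` is a hypothetical file name):

```python
from fnmatch import fnmatch

PATTERN = "deepdoc/dict/*.trie"

# A generated tokenizer cache under deepdoc/dict/ matches the pattern...
assert fnmatch("deepdoc/dict/example.trie", PATTERN)
# ...while a trie file in another directory does not.
assert not fnmatch("deepdoc/parser/example.trie", PATTERN)
```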

MANIFEST.in

Lines changed: 3 additions & 4 deletions
@@ -11,10 +11,9 @@ recursive-include deepdoc *.json
 recursive-include deepdoc *.yaml
 recursive-include deepdoc *.yml
 recursive-include deepdoc *.csv
-recursive-include deepdoc *.onnx
-recursive-include deepdoc *.model
-recursive-include deepdoc *.res
-recursive-include deepdoc *.trie
+
+# Exclude heavyweight model artifacts (resolved at runtime)
+prune deepdoc/rag/res
 
 # Exclude unwanted files
 global-exclude *.pyc
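The four dropped extensions amount to a simple suffix check. An illustrative helper (not part of the package, and the example paths are hypothetical) showing which files the revised MANIFEST keeps out of source distributions:

```python
from pathlib import PurePosixPath

# Artifact types no longer shipped in the sdist; resolved at runtime instead
MODEL_ARTIFACT_SUFFIXES = {".onnx", ".model", ".res", ".trie"}

def is_model_artifact(path: str) -> bool:
    """True if the path looks like a heavyweight model artifact."""
    return PurePosixPath(path).suffix in MODEL_ARTIFACT_SUFFIXES
```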

README.md

Lines changed: 104 additions & 14 deletions
@@ -5,13 +5,13 @@
 CPU-only (default):
 
 ```bash
-pip install git+https://github.com/xorbitsai/deepdoc-lib
+pip install deepdoc-lib
 ```
 
 GPU (Linux x86_64 only):
 
 ```bash
-pip install "deepdoc-lib[gpu] @ git+https://github.com/xorbitsai/deepdoc-lib"
+pip install deepdoc-lib[gpu]
 ```
 
 Note: `onnxruntime` (CPU) and `onnxruntime-gpu` should not be installed together. If you're switching an existing environment to GPU, uninstall CPU ORT first:
@@ -24,22 +24,113 @@ pip install onnxruntime-gpu==1.19.2
 ### Parser Usage
 
 ```python
-from deepdoc import PdfParser, DocxParser, ExcelParser
-
-# 解析 PDF
-pdf_parser = PdfParser()
+from deepdoc import (
+    DocxParser,
+    ExcelParser,
+    HtmlParser,
+    PdfModelConfig,
+    PdfParser,
+    TokenizerConfig,
+)
+
+# Build configs
+# Method 1: Explicit configuration (offline mode)
+tokenizer_cfg = TokenizerConfig(
+    offline=True,
+    nltk_data_dir="/path/to/nltk_data",
+)
+pdf_model_cfg = PdfModelConfig(
+    vision_model_dir="/path/to/models/vision",
+    xgb_model_dir="/path/to/models/xgb",
+    model_provider="local",
+)
+
+# Method 2: Empty configuration (auto-download models and nltk_data)
+# tokenizer_cfg = TokenizerConfig()
+# pdf_model_cfg = PdfModelConfig()
+
+
+# Parse PDF
+pdf_parser = PdfParser(model_cfg=pdf_model_cfg, tokenizer_cfg=tokenizer_cfg)
 result = pdf_parser("document.pdf")
 
-# 解析 Word
-docx_parser = DocxParser()
-result = docx_parser("document.docx")
+# Parse DOCX / HTML (tokenizer only)
+docx_parser = DocxParser(tokenizer_cfg=tokenizer_cfg)
+html_parser = HtmlParser(tokenizer_cfg=tokenizer_cfg)
 
-# 解析 Excel
+# Parse Excel (no model/tokenizer dependency)
 excel_parser = ExcelParser()
 with open("data.xlsx", "rb") as f:
     result = excel_parser(f.read())
 ```
 
+Or use explicit env factories:
+
+```python
+tokenizer_cfg = TokenizerConfig.from_env()
+pdf_model_cfg = PdfModelConfig.from_env()
+pdf_parser = PdfParser(model_cfg=pdf_model_cfg, tokenizer_cfg=tokenizer_cfg)
+```
+
+Or rely on defaults (env + cache). Deepdoc will look for cached bundles under
+`$DEEPDOC_MODEL_HOME` (or `~/.cache/deepdoc`) and only download missing files
+when the provider allows remote access:
+
+```python
+pdf_parser = PdfParser()
+```
+
+env definitions:
+
+```bash
+# provider: auto | local | modelscope
+export DEEPDOC_MODEL_PROVIDER=auto
+
+# shared model cache root (default: ~/.cache/deepdoc)
+export DEEPDOC_MODEL_HOME=/path/to/deepdoc-models
+
+# optional bundle-specific local directories
+export DEEPDOC_VISION_MODEL_DIR=/path/to/vision
+export DEEPDOC_XGB_MODEL_DIR=/path/to/xgb
+
+# single combined ModelScope repo (all bundles in one repo)
+# (default: Xorbits/deepdoc)
+export DEEPDOC_MODELSCOPE_REPO=Xorbits/deepdoc
+# optional shared revision (default: master)
+export DEEPDOC_MODELSCOPE_REVISION=master
+
+# offline mode for tokenizer NLTK auto-download
+export DEEPDOC_OFFLINE=0
+
+# optional NLTK data controls for tokenizer
+export DEEPDOC_NLTK_DATA_DIR=/path/to/nltk_data
+```
+
+### Download model artifacts
+
+To pre-download all model bundles (vision/xgb/tokenizer) into the default cache directory (`~/.cache/deepdoc`), run:
+
+```bash
+deepdoc-download-models
+# or (from source checkout)
+python -m deepdoc.download_models
+```
+
+If you want to override the cache location, set `DEEPDOC_MODEL_HOME`:
+
+```bash
+export DEEPDOC_MODEL_HOME=./models
+deepdoc-download-models
+```
+
+By default this also downloads the required NLTK resources into `~/.cache/deepdoc/nltk_data` (or `$DEEPDOC_MODEL_HOME/nltk_data`) and the cached `cl100k_base` tiktoken file into `~/.cache/deepdoc/tiktoken_cache` (or `$DEEPDOC_MODEL_HOME/tiktoken_cache`). `deepdoc.common.token_utils` automatically points `TIKTOKEN_CACHE_DIR` at the same location unless you override it with `DEEPDOC_TIKTOKEN_CACHE_DIR` or `TIKTOKEN_CACHE_DIR`.
+
+If you want to skip either optional offline asset, use:
+
+```bash
+deepdoc-download-models --no-nltk --no-tiktoken
+```
+
 
 ### Vision Model Usage
 
@@ -50,15 +141,15 @@ from deepdoc import create_vision_model
 - Use Environment Variable
 
 ```bash
-# 视觉模型配置
+# Vision model configs
 export DEEPDOC_VISION_PROVIDER="qwen"
 export DEEPDOC_VISION_API_KEY="your-api-key"
 export DEEPDOC_VISION_MODEL="qwen-vl-max"
 export DEEPDOC_VISION_LANG="Chinese"
 export DEEPDOC_VISION_BASE_URL="http://your_base_url"
 
-# 其他配置
-export DEEPDOC_LIGHTEN=0 # 是否使用轻量模式
+# Other configs
+export DEEPDOC_LIGHTEN=0 # Whether to use lighten mode
 ```
 
 ``` python
@@ -99,4 +190,3 @@ vision_model = create_vision_model("/path/to/deepdoc_config.yaml")
 with open("image.jpg", "rb") as f:
     result = vision_model.describe_with_prompt(f.read())
 ```
-
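The README's documented lookup order (an explicit `$DEEPDOC_MODEL_HOME` wins, otherwise `~/.cache/deepdoc`) can be sketched in a few lines. The helper name is mine; the real resolver lives in `deepdoc.common.model_store` and may differ:

```python
import os
from pathlib import Path

def default_model_home() -> Path:
    """Sketch of the documented cache lookup: honor an explicit
    $DEEPDOC_MODEL_HOME override, else fall back to ~/.cache/deepdoc."""
    override = os.environ.get("DEEPDOC_MODEL_HOME")
    if override:
        return Path(override)
    return Path.home() / ".cache" / "deepdoc"
```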

deepdoc/__init__.py

Lines changed: 4 additions & 0 deletions
@@ -19,6 +19,7 @@
 
 from .parser import *
 from .depend.simple_cv_model import *
+from .config import PdfModelConfig, TokenizerConfig, ParserRuntimeConfig
 from .llm_adapter import LLMAdapter, LLMType, vision_llm_chunk
 
 __all__ = [
@@ -32,6 +33,9 @@
     "JsonParser",
     "MarkdownParser",
     "TxtParser",
+    "TokenizerConfig",
+    "PdfModelConfig",
+    "ParserRuntimeConfig",
     # LLM Adapter exports
     "LLMAdapter",
     "LLMType",
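The additions to `__all__` are what expose the new config classes through `from deepdoc import *`. A self-contained demonstration of that mechanism, using a synthetic module rather than deepdoc itself:

```python
import sys
import types

# Build a throwaway module: one name exported via __all__, one not
demo = types.ModuleType("allowlist_demo")
exec(
    "__all__ = ['TokenizerConfig']\n"
    "TokenizerConfig = object\n"
    "ParserRuntimeConfig = object  # defined, but not listed in __all__\n",
    demo.__dict__,
)
sys.modules["allowlist_demo"] = demo

ns = {}
exec("from allowlist_demo import *", ns)
assert "TokenizerConfig" in ns          # listed in __all__: imported
assert "ParserRuntimeConfig" not in ns  # unlisted: skipped by star-import
```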

deepdoc/common/__init__.py

Lines changed: 14 additions & 0 deletions
@@ -11,6 +11,13 @@
 from .connection_utils import timeout
 from .config_utils import get_base_config, get_config_value
 from .settings import PARALLEL_DEVICES, check_and_install_torch
+from .model_store import (
+    resolve_bundle_dir,
+    resolve_tokenizer_dict_prefix,
+    resolve_vision_model_dir,
+    resolve_xgb_model_dir,
+    validate_bundle_dir,
+)
 
 __all__ = [
     # file_utils
@@ -35,4 +42,11 @@
     # settings
     "PARALLEL_DEVICES",
     "check_and_install_torch",
+
+    # model_store
+    "resolve_bundle_dir",
+    "resolve_tokenizer_dict_prefix",
+    "resolve_vision_model_dir",
+    "resolve_xgb_model_dir",
+    "validate_bundle_dir",
 ]
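The newly exported resolver names suggest a precedence of explicit directories over the shared cache. A hypothetical sketch of that shape (the real `deepdoc.common.model_store` signatures may differ):

```python
import os
from pathlib import Path
from typing import Optional

def resolve_bundle_dir_sketch(bundle: str, explicit_dir: Optional[str] = None) -> Path:
    """Hypothetical resolver: an explicit directory wins; otherwise the
    bundle lives under $DEEPDOC_MODEL_HOME (default ~/.cache/deepdoc)."""
    if explicit_dir:
        return Path(explicit_dir)
    home = os.environ.get("DEEPDOC_MODEL_HOME") or str(Path.home() / ".cache" / "deepdoc")
    return Path(home) / bundle
```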
