deepdoc/deepdoc_cli.py at main · sqhyz55/deepdoc · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
#!/usr/bin/env python3
"""
DeepDoc command-line parsing tool.
Supports parsing multiple document formats such as PDF, Word, PPT, Excel, and images.
"""

import argparse
import sys
import os
import warnings
from pathlib import Path

# Ignore DeprecationWarning messages.
warnings.filterwarnings("ignore", category=DeprecationWarning)

# Append this script's directory to the Python import path; the `parser` and
# `depend` imports below are presumably resolved from it (verify the layout).
sys.path.append(os.path.dirname(os.path.abspath(__file__)))

from parser import TxtParser, MarkdownParser, JsonParser, HtmlParser, ExcelParser, PptParser, DocxParser, PdfParser
from depend.simple_cv_model import create_vision_model


# Limits for the console preview that is printed when no output file is given.
_PREVIEW_MAX_CHUNKS = 5
_PREVIEW_CHUNK_CHARS = 500
_PREVIEW_TEXT_CHARS = 1000

# Extensions that are routed to the vision model instead of a document parser.
_IMAGE_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.bmp', '.gif')


def _parse_by_extension(file_path, ext, vision_provider):
    """Run the parser matching ``ext`` and return ``(result, parser_name)``.

    Returns ``None`` when ``ext`` is not a supported extension. ``parser_name``
    is the class name of the object that produced the result; for images this
    is the vision model, so callers always have a name to report.
    """
    path_str = str(file_path)

    if ext == '.pdf':
        print("🔍 使用 PDF 解析器...")
        parser = PdfParser()
        result = parser(path_str)
    elif ext == '.docx':
        print("🔍 使用 Word 解析器...")
        parser = DocxParser()
        result = parser(path_str)
    elif ext in ('.ppt', '.pptx'):
        print("🔍 使用 PPT 解析器...")
        parser = PptParser()
        result = parser(path_str, from_page=0, to_page=100000)
    elif ext in ('.xlsx', '.xls', '.csv'):
        print("🔍 使用 Excel 解析器...")
        parser = ExcelParser()
        # The Excel parser is handed the raw file bytes rather than a path.
        with open(file_path, 'rb') as f:
            result = parser(f.read())
    elif ext == '.txt':
        print("🔍 使用文本解析器...")
        parser = TxtParser()
        result = parser(path_str)
    elif ext == '.md':
        print("🔍 使用 Markdown 解析器...")
        parser = MarkdownParser()
        result = parser(path_str)
    elif ext == '.json':
        print("🔍 使用 JSON 解析器...")
        parser = JsonParser()
        result = parser(path_str)
    elif ext in ('.html', '.htm'):
        print("🔍 使用 HTML 解析器...")
        parser = HtmlParser()
        result = parser(path_str)
    elif ext in _IMAGE_EXTENSIONS:
        print(f"🔍 使用视觉模型解析图片 (提供商: {vision_provider})...")
        # Bind the vision model to `parser` as well: the original left the name
        # unbound on this branch, which made saving image results crash.
        parser = create_vision_model(vision_provider)
        with open(file_path, 'rb') as f:
            result = parser.describe_with_prompt(f.read())
    else:
        return None

    return result, type(parser).__name__


def _write_result(result, output_file, file_path, parser_name):
    """Write the parse result to ``output_file`` as UTF-8 text with a short header."""
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write("# DeepDoc 解析结果\n")
        f.write(f"文件: {file_path}\n")
        if isinstance(result, list):
            f.write(f"解析器: {parser_name}\n")
            f.write(f"Chunk 数量: {len(result)}\n\n")
            for index, item in enumerate(result, start=1):
                f.write(f"## Chunk {index}\n")
                f.write(str(item))
                f.write("\n\n")
        else:
            f.write(f"解析器: {parser_name}\n\n")
            f.write(str(result))


def _truncate(text, limit):
    """Return ``text`` cut to ``limit`` characters plus "..." when it is longer."""
    return text[:limit] + "..." if len(text) > limit else text


def _print_result(result):
    """Print a truncated preview of the parse result to stdout."""
    rule = "=" * 50
    print("\n" + rule)
    print("📄 解析结果")
    print(rule)

    if isinstance(result, list):
        print(f"📊 共解析出 {len(result)} 个 chunk")
        # Only the first few chunks are shown, each cut to a fixed length.
        for index, item in enumerate(result[:_PREVIEW_MAX_CHUNKS], start=1):
            print(f"\n--- Chunk {index} ---")
            print(_truncate(str(item), _PREVIEW_CHUNK_CHARS))
        remaining = len(result) - _PREVIEW_MAX_CHUNKS
        if remaining > 0:
            print(f"\n... 还有 {remaining} 个chunk")
    else:
        print(_truncate(str(result), _PREVIEW_TEXT_CHARS))

    print("\n" + rule)
    print("✅ 解析完成")


def parse_document(file_path, output_file=None, vision_provider="qwen"):
    """Parse a single document and either save or preview the result.

    The parser is chosen from the lower-cased file extension. Image files are
    described by a vision model created for ``vision_provider``.

    Args:
        file_path: Path of the document to parse (``str`` or ``Path``).
        output_file: Optional path of a text file to write the full result to.
            When omitted (or empty), a truncated preview is printed instead.
        vision_provider: Provider name passed to ``create_vision_model``; only
            used for image files.

    Returns:
        None. Progress and errors are reported with ``print``. A missing file
        or an unsupported extension returns early. Any exception raised while
        parsing or writing is caught here, reported, and its traceback printed.
    """
    file_path = Path(file_path)

    if not file_path.exists():
        print(f"❌ 文件不存在: {file_path}")
        return

    # The extension (case-insensitive) decides which parser is used.
    ext = file_path.suffix.lower()

    print(f"📄 开始解析文件: {file_path}")
    print(f"📋 文件类型: {ext}")

    try:
        parsed = _parse_by_extension(file_path, ext, vision_provider)
        if parsed is None:
            print(f"❌ 不支持的文件格式: {ext}")
            print("✅ 支持的格式: PDF, DOCX, PPT/PPTX, XLSX/XLS/CSV, TXT, MD, JSON, HTML, JPG/PNG/BMP/GIF")
            return
        result, parser_name = parsed

        if output_file:
            print(f"💾 保存结果到: {output_file}")
            _write_result(result, output_file, file_path, parser_name)
            print(f"✅ 解析完成，结果已保存到: {output_file}")
        else:
            _print_result(result)

    except Exception as e:
        # Top-level CLI boundary: report the failure instead of crashing.
        print(f"❌ 解析失败: {e}")
        import traceback
        traceback.print_exc()


def main():
    """Command-line entry point.

    Builds the argument parser, reads the command line, prints the banner and
    passes the chosen file, output path and vision provider to
    ``parse_document``.
    """
    # Shown verbatim below the option list (raw formatter keeps the layout).
    usage_examples = """
使用示例:
  python deepdoc_cli.py document.pdf                    # 解析 PDF 文件
  python deepdoc_cli.py document.docx -o result.txt     # 解析 Word 并保存结果
  python deepdoc_cli.py image.jpg --vision-provider openai  # 解析图片
  python deepdoc_cli.py data.xlsx -o excel_result.txt   # 解析 Excel 文件
        """
    supported_providers = ['openai', 'qwen', 'zhipu', 'ollama', 'gemini', 'anthropic']

    cli = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description='DeepDoc 文档解析工具 - 支持多种文档格式的智能解析',
        epilog=usage_examples,
    )
    cli.add_argument('file', help='要解析的文件路径')
    cli.add_argument('-o', '--output', help='输出文件路径（可选）')
    cli.add_argument(
        '--vision-provider',
        choices=supported_providers,
        default='qwen',
        help='视觉模型提供商（用于图片解析，默认: qwen）',
    )
    cli.add_argument('--version', action='version', version='DeepDoc CLI v1.0.0')

    options = cli.parse_args()

    banner_rule = "=" * 50
    print("🚀 DeepDoc 文档解析工具")
    print(banner_rule)

    parse_document(options.file, options.output, options.vision_provider)


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()