Skip to content

Commit 8a9a072

Browse files
authored
feat: Enhance RAG module with document processing and retrieval improvements (#395)
* feat: extend UniversalDocLoader to support additional file formats including Excel and HTML
* feat: implement RAG module with document loading, splitting, and processing capabilities
* feat: implement RAG module with document loading, splitting, and processing capabilities
* feat: update Milvus configuration and enhance file processing logic in RAG service
* feat: enhance RAG infrastructure with document processing, vector storage, and retrieval capabilities
* feat: add progress tracking for RAG file processing and enhance worker pool status management
* feat: enhance retrieval service with advanced search ranking and filtering capabilities
* feat: enhance retrieval service with BM25 indexing and improved ranking parameters
* feat: enhance retrieval service with BM25 indexing and improved ranking parameters
* feat: implement Milvus client singleton management and refactor vector store interactions
* feat: refactor file processing to use async session for database interactions
* feat: enhance chunk processing with filtering, cleaning, and batch storage
* feat: enhance API response models with additional fields and configuration
1 parent e1d61cb commit 8a9a072

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

46 files changed

+5626
-114
lines changed

.claude/skills/backend-architect/SKILL.md

Lines changed: 556 additions & 25 deletions
Large diffs are not rendered by default.

.claude/skills/fastapi-templates/SKILL.md

Lines changed: 568 additions & 0 deletions
Large diffs are not rendered by default.

frontend/src/pages/KnowledgeBase/Detail/KnowledgeBaseDetail.tsx

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,11 @@ const KnowledgeBaseDetailPage: React.FC = () => {
9797
handleKeywordChange,
9898
} = useFetchData<KBFile>(
9999
(params) => id ? queryKnowledgeBaseFilesUsingGet(id, params) : Promise.resolve({ data: [] }),
100-
(file) => mapFileData(file, t)
100+
(file) => mapFileData(file, t),
101+
30000, // 30秒轮询间隔
102+
false, // 不自动轮询
103+
[], // 额外的轮询函数
104+
0 // pageOffset: Python 后端期望 page 从 1 开始,前端 current=1 时传 page=1
101105
);
102106

103107
// File table logic

frontend/src/pages/KnowledgeBase/Home/KnowledgeBasePage.tsx

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,11 @@ export default function KnowledgeBasePage() {
3131
handleKeywordChange,
3232
} = useFetchData<KnowledgeBaseItem>(
3333
queryKnowledgeBasesUsingPost,
34-
(kb) => mapKnowledgeBase(kb, false, t) // 在首页不显示索引模型和文本理解模型字段
34+
(kb) => mapKnowledgeBase(kb, false, t), // 在首页不显示索引模型和文本理解模型字段
35+
30000, // 30秒轮询间隔
36+
false, // 不自动轮询
37+
[], // 额外的轮询函数
38+
0 // pageOffset: Python 后端期望 page 从 1 开始,前端 current=1 时传 page=1
3539
);
3640

3741
useEffect(() => {

frontend/src/pages/KnowledgeBase/knowledge-base.api.ts

Lines changed: 24 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,13 @@
11
import { get, post, put, del } from "@/utils/request";
22

33
// 获取知识库列表
4-
export function queryKnowledgeBasesUsingPost(params: object) {
5-
return post("/api/knowledge-base/list", params);
4+
export function queryKnowledgeBasesUsingPost(params: any) {
5+
// 将前端的 size 参数映射为后端的 page_size
6+
const { size, ...rest } = params;
7+
return post("/api/knowledge-base/list", {
8+
...rest,
9+
page_size: size
10+
});
611
}
712

813
// 创建知识库
@@ -26,8 +31,22 @@ export function deleteKnowledgeBaseByIdUsingDelete(baseId: string) {
2631
}
2732

2833
// 获取知识生成文件列表
29-
export function queryKnowledgeBaseFilesUsingGet(baseId: string, params?: Record<string, string>) {
30-
return get(`/api/knowledge-base/${baseId}/files${params ? `?${new URLSearchParams(params).toString()}` : ""}`);
34+
export function queryKnowledgeBaseFilesUsingGet(baseId: string, params?: Record<string, any>) {
35+
if (!params) {
36+
return get(`/api/knowledge-base/${baseId}/files`);
37+
}
38+
// 将前端的 size 参数映射为后端的 page_size
39+
const { size, page, ...rest } = params;
40+
const queryParams = {
41+
page: page || 1,
42+
page_size: size || 10,
43+
...rest
44+
};
45+
return get(`/api/knowledge-base/${baseId}/files?${new URLSearchParams(
46+
Object.entries(queryParams)
47+
.filter(([_, v]) => v !== undefined && v !== null)
48+
.reduce((acc, [k, v]) => ({ ...acc, [k]: String(v) }), {})
49+
).toString()}`);
3150
}
3251

3352
// 添加文件到知识库
@@ -62,5 +81,5 @@ export function queryKnowledgeBaseFileDetailUsingGet(
6281
) {
6382
const page = params.page ?? 1;
6483
const size = params.size ?? 20;
65-
return get(`/api/knowledge-base/${knowledgeBaseId}/files/${ragFileId}?page=${page}&size=${size}`);
84+
return get(`/api/knowledge-base/${knowledgeBaseId}/files/${ragFileId}?page=${page}&page_size=${size}`);
6685
}

frontend/vite.config.ts

Lines changed: 44 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -13,25 +13,54 @@ export default defineConfig({
1313
},
1414
server: {
1515
host: "0.0.0.0",
16-
proxy: {
17-
"^/api": {
18-
target: "http://localhost:8080", // 本地后端服务地址
16+
proxy: (() => {
17+
const pythonProxyConfig = {
18+
target: "http://localhost:18000",
1919
changeOrigin: true,
2020
secure: false,
21-
rewrite: (path) => path.replace(/^\/api/, "/api"),
22-
configure: (proxy, options) => {
23-
// proxy 是 'http-proxy' 的实例
24-
proxy.on("proxyReq", (proxyReq, req, res) => {
25-
// 可以在这里修改请求头
26-
proxyReq.removeHeader("referer");
27-
proxyReq.removeHeader("origin");
21+
configure: (proxy: { on: (event: string, handler: (arg: unknown) => void) => void }) => {
22+
proxy.on("proxyReq", (proxyReq: unknown) => {
23+
(proxyReq as { removeHeader: (name: string) => void }).removeHeader("referer");
24+
(proxyReq as { removeHeader: (name: string) => void }).removeHeader("origin");
2825
});
29-
proxy.on("proxyRes", (proxyRes, req, res) => {
30-
delete proxyRes.headers["set-cookie"];
31-
proxyRes.headers["cookies"] = ""; // 清除 cookies 头
26+
proxy.on("proxyRes", (proxyRes: unknown) => {
27+
const res = proxyRes as { headers: Record<string, unknown> };
28+
delete res.headers["set-cookie"];
29+
res.headers["cookies"] = "";
3230
});
3331
},
34-
},
35-
},
32+
};
33+
34+
const javaProxyConfig = {
35+
target: "http://localhost:8080",
36+
changeOrigin: true,
37+
secure: false,
38+
configure: (proxy: { on: (event: string, handler: (arg: unknown) => void) => void }) => {
39+
proxy.on("proxyReq", (proxyReq: unknown) => {
40+
(proxyReq as { removeHeader: (name: string) => void }).removeHeader("referer");
41+
(proxyReq as { removeHeader: (name: string) => void }).removeHeader("origin");
42+
});
43+
proxy.on("proxyRes", (proxyRes: unknown) => {
44+
const res = proxyRes as { headers: Record<string, unknown> };
45+
delete res.headers["set-cookie"];
46+
res.headers["cookies"] = "";
47+
});
48+
},
49+
};
50+
51+
// Python service routes: rag, synthesis, annotation, knowledge-base, data-collection, evaluation, models
52+
const pythonPaths = ["rag", "synthesis", "annotation", "knowledge-base", "data-collection", "evaluation", "models"];
53+
// Java service routes: data-management, operators, cleansing
54+
const javaPaths = ["data-management", "operators", "cleansing"];
55+
56+
const proxy: Record<string, object> = {};
57+
for (const p of pythonPaths) {
58+
proxy[`/api/${p}`] = pythonProxyConfig;
59+
}
60+
for (const p of javaPaths) {
61+
proxy[`/api/${p}`] = javaProxyConfig;
62+
}
63+
return proxy;
64+
})(),
3665
},
3766
});

runtime/datamate-python/app/core/config.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,5 +77,12 @@ def build_database_url(self):
7777

7878
datamate_jwt_enable: bool = False
7979

80+
# Milvus 配置
81+
milvus_uri: str = "http://localhost:19530"
82+
milvus_token: str = ""
83+
84+
# 文件存储配置(共享文件系统)
85+
file_storage_path: str = "/data/files"
86+
8087
# 全局设置实例
8188
settings = Settings()

runtime/datamate-python/app/core/exception/codes.py

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -77,8 +77,17 @@ def __init__(self):
7777
# ========== RAG 模块 ==========
7878
RAG_CONFIG_ERROR: Final = ErrorCode("rag.0001", "RAG configuration error", 400)
7979
RAG_KNOWLEDGE_BASE_NOT_FOUND: Final = ErrorCode("rag.0002", "Knowledge base not found", 404)
80-
RAG_MODEL_NOT_FOUND: Final = ErrorCode("rag.0003", "RAG model not found", 404)
81-
RAG_QUERY_FAILED: Final = ErrorCode("rag.0004", "RAG query failed", 500)
80+
RAG_KNOWLEDGE_BASE_ALREADY_EXISTS: Final = ErrorCode("rag.0003", "Knowledge base already exists", 400)
81+
RAG_KNOWLEDGE_BASE_NAME_INVALID: Final = ErrorCode("rag.0004", "Knowledge base name is invalid", 400)
82+
RAG_FILE_NOT_FOUND: Final = ErrorCode("rag.0005", "RAG file not found", 404)
83+
RAG_FILE_PROCESS_FAILED: Final = ErrorCode("rag.0006", "File processing failed", 500)
84+
RAG_FILE_PARSE_FAILED: Final = ErrorCode("rag.0007", "File parsing failed", 500)
85+
RAG_CHUNK_NOT_FOUND: Final = ErrorCode("rag.0008", "Chunk not found", 404)
86+
RAG_MODEL_NOT_FOUND: Final = ErrorCode("rag.0009", "RAG model not found", 404)
87+
RAG_QUERY_FAILED: Final = ErrorCode("rag.0010", "RAG query failed", 500)
88+
RAG_MILVUS_ERROR: Final = ErrorCode("rag.0011", "Milvus operation failed", 500)
89+
RAG_COLLECTION_NOT_FOUND: Final = ErrorCode("rag.0012", "Milvus collection not found", 404)
90+
RAG_EMBEDDING_FAILED: Final = ErrorCode("rag.0013", "Embedding generation failed", 500)
8291

8392
# ========== 配比模块 ==========
8493
RATIO_TASK_NOT_FOUND: Final = ErrorCode("ratio.0001", "Ratio task not found", 404)

runtime/datamate-python/app/db/models/__init__.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,8 @@
3232
ChunkUploadPreRequest
3333
)
3434

35+
from .knowledge_gen import KnowledgeBase, RagFile
36+
3537
__all__ = [
3638
"Dataset",
3739
"DatasetTag",
@@ -48,4 +50,6 @@
4850
"CategoryRelation",
4951
"OperatorRelease",
5052
"ChunkUploadPreRequest",
53+
"KnowledgeBase",
54+
"RagFile",
5155
]

runtime/datamate-python/app/db/models/base_entity.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
from sqlalchemy import Column, String, TIMESTAMP, Text, JSON
1+
from sqlalchemy import Column, String, TIMESTAMP, Text
22
from sqlalchemy.orm import declarative_base
33
from sqlalchemy.sql import func
44

0 commit comments

Comments
 (0)