Skip to content

Commit c35838d

Browse files
committed
Fix bug in code_index
1 parent 0a339d2 commit c35838d

3 files changed

Lines changed: 140 additions & 83 deletions

File tree

tools/code_indexer.py

Lines changed: 123 additions & 70 deletions
Original file line numberDiff line numberDiff line change
@@ -6,10 +6,11 @@
66
77
Features:
88
- Recursive file traversal
9-
- LLM-powered code similarity analysis
9+
- LLM-powered code similarity analysis using augmented LLM classes
1010
- JSON-based relationship storage
1111
- Configurable matching strategies
1212
- Progress tracking and error handling
13+
- Automatic LLM provider selection based on API key availability
1314
"""
1415

1516
import asyncio
@@ -22,6 +23,77 @@
2223
from dataclasses import dataclass, asdict
2324
from typing import List, Dict, Any
2425

26+
# MCP Agent imports for LLM
27+
from mcp_agent.workflows.llm.augmented_llm_anthropic import AnthropicAugmentedLLM
28+
from mcp_agent.workflows.llm.augmented_llm_openai import OpenAIAugmentedLLM
29+
import yaml
30+
31+
def get_preferred_llm_class(config_path: str = "mcp_agent.secrets.yaml"):
    """
    Automatically select the LLM class based on API key availability in configuration.

    Reads the YAML secrets file and returns AnthropicAugmentedLLM when a
    non-blank ``anthropic.api_key`` is present; otherwise returns
    OpenAIAugmentedLLM. Any read/parse error also falls back to
    OpenAIAugmentedLLM, so the caller always receives a usable class.

    Args:
        config_path: Path to the YAML configuration file.

    Returns:
        class: AnthropicAugmentedLLM or OpenAIAugmentedLLM.
    """
    try:
        if not os.path.exists(config_path):
            print(f"🤖 Config file {config_path} not found, using OpenAIAugmentedLLM")
            return OpenAIAugmentedLLM

        with open(config_path, "r", encoding="utf-8") as f:
            # safe_load returns None for an empty file; normalize to a dict
            # so the lookups below cannot raise AttributeError.
            config = yaml.safe_load(f) or {}

        # The "anthropic" section may be present but explicitly null in YAML.
        anthropic_key = (config.get("anthropic") or {}).get("api_key", "")

        # The key counts as configured only if it is a non-blank string
        # (a bare `if key` plus `.strip()` check; the extra `== ""` test in
        # earlier versions was redundant).
        if isinstance(anthropic_key, str) and anthropic_key.strip():
            return AnthropicAugmentedLLM
        return OpenAIAugmentedLLM

    except Exception as e:
        # Best-effort selection: never propagate config problems to the caller.
        print(f"🤖 Error reading config file {config_path}: {e}")
        print("🤖 Falling back to OpenAIAugmentedLLM")
        return OpenAIAugmentedLLM
66+
67+
68+
def get_default_models(config_path: str = "mcp_agent.config.yaml"):
    """
    Get default model names from the configuration file.

    Falls back to hard-coded defaults when the file is missing, unreadable,
    empty, or does not define the relevant keys.

    Args:
        config_path: Path to the YAML configuration file.

    Returns:
        dict: Dictionary with 'anthropic' and 'openai' default model names.
    """
    # Single source of truth for the fallback values (previously duplicated
    # in three places).
    fallback = {"anthropic": "claude-sonnet-4-20250514", "openai": "o3-mini"}

    try:
        if not os.path.exists(config_path):
            print(f"Config file {config_path} not found, using default models")
            return dict(fallback)

        with open(config_path, "r", encoding="utf-8") as f:
            # safe_load returns None for an empty file; normalize to a dict so
            # an empty config falls through to the defaults instead of raising
            # AttributeError (which was previously mis-reported as a read error).
            config = yaml.safe_load(f) or {}

        # Sections may be present but explicitly null in YAML; guard with `or {}`.
        return {
            "anthropic": (config.get("anthropic") or {}).get(
                "default_model", fallback["anthropic"]
            ),
            "openai": (config.get("openai") or {}).get(
                "default_model", fallback["openai"]
            ),
        }

    except Exception as e:
        # Best-effort lookup: never propagate config problems to the caller.
        print(f"Error reading config file {config_path}: {e}")
        return dict(fallback)
96+
2597

2698
@dataclass
2799
class FileRelationship:
@@ -78,6 +150,7 @@ def __init__(
78150
self.indexer_config_path = indexer_config_path
79151
self.api_config = self._load_api_config()
80152
self.indexer_config = self._load_indexer_config()
153+
self.default_models = get_default_models("mcp_agent.config.yaml")
81154

82155
# Use config paths if not provided as parameters
83156
paths_config = self.indexer_config.get("paths", {})
@@ -301,7 +374,7 @@ def _load_indexer_config(self) -> Dict[str, Any]:
301374
return {}
302375

303376
async def _initialize_llm_client(self):
304-
"""Initialize LLM client based on configured provider"""
377+
"""Initialize LLM client (Anthropic or OpenAI) based on API key availability"""
305378
if self.llm_client is not None:
306379
return self.llm_client, self.llm_client_type
307380

@@ -312,89 +385,65 @@ async def _initialize_llm_client(self):
312385
self.llm_client_type = "mock"
313386
return "mock", "mock"
314387

315-
# Try configured provider first
316-
if self.model_provider.lower() == "anthropic":
317-
try:
318-
anthropic_key = self.api_config.get("anthropic", {}).get("api_key")
319-
if anthropic_key:
320-
from anthropic import AsyncAnthropic
321-
322-
client = AsyncAnthropic(api_key=anthropic_key)
323-
# Test connection
324-
await client.messages.create(
325-
model="claude-sonnet-4-20250514",
326-
max_tokens=10,
327-
messages=[{"role": "user", "content": "test"}],
328-
)
329-
self.logger.info("Using Anthropic API for code analysis")
330-
self.llm_client = client
331-
self.llm_client_type = "anthropic"
332-
return client, "anthropic"
333-
except Exception as e:
334-
self.logger.warning(f"Configured Anthropic API unavailable: {e}")
388+
# Check which API has available key and try that first
389+
anthropic_key = self.api_config.get("anthropic", {}).get("api_key", "")
390+
openai_key = self.api_config.get("openai", {}).get("api_key", "")
335391

336-
elif self.model_provider.lower() == "openai":
392+
# Try Anthropic API first if key is available
393+
if anthropic_key and anthropic_key.strip():
337394
try:
338-
openai_key = self.api_config.get("openai", {}).get("api_key")
339-
if openai_key:
340-
from openai import AsyncOpenAI
341-
342-
client = AsyncOpenAI(api_key=openai_key)
343-
# Test connection
344-
await client.chat.completions.create(
345-
model="gpt-3.5-turbo",
346-
max_tokens=10,
347-
messages=[{"role": "user", "content": "test"}],
348-
)
349-
self.logger.info("Using OpenAI API for code analysis")
350-
self.llm_client = client
351-
self.llm_client_type = "openai"
352-
return client, "openai"
353-
except Exception as e:
354-
self.logger.warning(f"Configured OpenAI API unavailable: {e}")
355-
356-
# Fallback: try other provider
357-
self.logger.info("Trying fallback provider...")
358-
359-
# Try Anthropic as fallback
360-
try:
361-
anthropic_key = self.api_config.get("anthropic", {}).get("api_key")
362-
if anthropic_key:
363395
from anthropic import AsyncAnthropic
364396

365397
client = AsyncAnthropic(api_key=anthropic_key)
398+
# Test connection with default model from config
366399
await client.messages.create(
367-
model="claude-sonnet-4-20250514",
400+
model=self.default_models["anthropic"],
368401
max_tokens=10,
369402
messages=[{"role": "user", "content": "test"}],
370403
)
371-
self.logger.info("Using Anthropic API as fallback")
404+
self.logger.info(
405+
f"Using Anthropic API with model: {self.default_models['anthropic']}"
406+
)
372407
self.llm_client = client
373408
self.llm_client_type = "anthropic"
374409
return client, "anthropic"
375-
except Exception as e:
376-
self.logger.warning(f"Anthropic fallback failed: {e}")
410+
except Exception as e:
411+
self.logger.warning(f"Anthropic API unavailable: {e}")
377412

378-
# Try OpenAI as fallback
379-
try:
380-
openai_key = self.api_config.get("openai", {}).get("api_key")
381-
if openai_key:
413+
# Try OpenAI API if Anthropic failed or key not available
414+
if openai_key and openai_key.strip():
415+
try:
382416
from openai import AsyncOpenAI
383417

384-
client = AsyncOpenAI(api_key=openai_key)
418+
# Handle custom base_url if specified
419+
openai_config = self.api_config.get("openai", {})
420+
base_url = openai_config.get("base_url")
421+
422+
if base_url:
423+
client = AsyncOpenAI(api_key=openai_key, base_url=base_url)
424+
else:
425+
client = AsyncOpenAI(api_key=openai_key)
426+
427+
# Test connection with default model from config
385428
await client.chat.completions.create(
386-
model="gpt-3.5-turbo",
429+
model=self.default_models["openai"],
387430
max_tokens=10,
388431
messages=[{"role": "user", "content": "test"}],
389432
)
390-
self.logger.info("Using OpenAI API as fallback")
433+
self.logger.info(
434+
f"Using OpenAI API with model: {self.default_models['openai']}"
435+
)
436+
if base_url:
437+
self.logger.info(f"Using custom base URL: {base_url}")
391438
self.llm_client = client
392439
self.llm_client_type = "openai"
393440
return client, "openai"
394-
except Exception as e:
395-
self.logger.warning(f"OpenAI fallback failed: {e}")
441+
except Exception as e:
442+
self.logger.warning(f"OpenAI API unavailable: {e}")
396443

397-
raise ValueError("No available LLM API for code analysis")
444+
raise ValueError(
445+
"No available LLM API - please check your API keys in configuration"
446+
)
398447

399448
async def _call_llm(
400449
self, prompt: str, system_prompt: str = None, max_tokens: int = None
@@ -426,7 +475,7 @@ async def _call_llm(
426475

427476
if client_type == "anthropic":
428477
response = await client.messages.create(
429-
model="claude-sonnet-4-20250514",
478+
model=self.default_models["anthropic"],
430479
system=system_prompt,
431480
messages=[{"role": "user", "content": prompt}],
432481
max_tokens=max_tokens,
@@ -451,7 +500,7 @@ async def _call_llm(
451500
]
452501

453502
response = await client.chat.completions.create(
454-
model="gpt-4-1106-preview",
503+
model=self.default_models["openai"],
455504
messages=messages,
456505
max_tokens=max_tokens,
457506
temperature=self.llm_temperature,
@@ -1043,7 +1092,7 @@ async def process_repository(self, repo_path: Path) -> RepoIndex:
10431092
if r.confidence_score > self.high_confidence_threshold
10441093
]
10451094
),
1046-
"analyzer_version": "1.3.0", # Updated version to reflect concurrent support
1095+
"analyzer_version": "1.4.0", # Updated version to reflect augmented LLM support
10471096
"pre_filtering_enabled": self.enable_pre_filtering,
10481097
"files_before_filtering": len(all_files),
10491098
"files_after_filtering": len(files_to_analyze),
@@ -1373,7 +1422,7 @@ def generate_statistics_report(self, statistics_data: List[Dict[str, Any]]) -> s
13731422
# Build statistics report
13741423
statistics_report = {
13751424
"report_generation_time": datetime.now().isoformat(),
1376-
"analyzer_version": "1.3.0",
1425+
"analyzer_version": "1.4.0",
13771426
"configuration_used": {
13781427
"config_file": self.indexer_config_path,
13791428
"concurrent_analysis_enabled": self.enable_concurrent_analysis,
@@ -1481,11 +1530,12 @@ async def main():
14811530
"""Main function to run the code indexer with full configuration support"""
14821531

14831532
# Configuration - can be overridden by config file
1484-
config_file = "deepcode-mcp/tools/indexer_config.yaml"
1533+
config_file = "DeepCode/tools/indexer_config.yaml"
1534+
api_config_file = "DeepCode/mcp_agent.secrets.yaml"
14851535

14861536
# You can override these parameters or let them be read from config
1487-
code_base_path = None # Will use config file value if None
1488-
output_dir = None # Will use config file value if None
1537+
code_base_path = "DeepCode/deepcode_lab/papers/1/code_base/" # Will use config file value if None
1538+
output_dir = "DeepCode/deepcode_lab/papers/1/indexes/" # Will use config file value if None
14891539

14901540
# Target structure - this should be customized for your specific project
14911541
target_structure = """
@@ -1526,21 +1576,24 @@ async def main():
15261576

15271577
print("🚀 Starting Code Indexer with Enhanced Configuration Support")
15281578
print(f"📋 Configuration file: {config_file}")
1579+
print(f"🔑 API configuration file: {api_config_file}")
15291580

15301581
# Create indexer with full configuration support
15311582
try:
15321583
indexer = CodeIndexer(
15331584
code_base_path=code_base_path, # None = read from config
15341585
target_structure=target_structure, # Required - project specific
15351586
output_dir=output_dir, # None = read from config
1587+
config_path=api_config_file, # API configuration file
15361588
indexer_config_path=config_file, # Configuration file
15371589
enable_pre_filtering=True, # Can be overridden in config
15381590
)
15391591

15401592
# Display configuration information
15411593
print(f"📁 Code base path: {indexer.code_base_path}")
15421594
print(f"📂 Output directory: {indexer.output_dir}")
1543-
print(f"🤖 Model provider: {indexer.model_provider}")
1595+
print(f"🤖 Default models: Anthropic={indexer.default_models['anthropic']}, OpenAI={indexer.default_models['openai']}")
1596+
print(f"🔧 Preferred LLM: {get_preferred_llm_class(api_config_file).__name__}")
15441597
print(
15451598
f"⚡ Concurrent analysis: {'enabled' if indexer.enable_concurrent_analysis else 'disabled'}"
15461599
)

workflows/agent_orchestration_engine.py

Lines changed: 16 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -414,7 +414,7 @@ async def github_repo_download(search_result: str, paper_dir: str, logger) -> st
414414
)
415415

416416

417-
async def paper_reference_analyzer(analysis_result: str, logger) -> str:
417+
async def paper_reference_analyzer(paper_dir: str, logger) -> str:
418418
"""
419419
Run the paper reference analysis and GitHub repository workflow.
420420
@@ -428,22 +428,26 @@ async def paper_reference_analyzer(analysis_result: str, logger) -> str:
428428
reference_analysis_agent = Agent(
429429
name="ReferenceAnalysisAgent",
430430
instruction=PAPER_REFERENCE_ANALYZER_PROMPT,
431-
server_names=["filesystem", "brave", "fetch"],
431+
server_names=["filesystem", "fetch"],
432432
)
433+
message = f"""Analyze the research paper in directory: {paper_dir}
434+
435+
Please locate and analyze the markdown (.md) file containing the research paper. **Focus specifically on the References/Bibliography section** to identify and analyze the 5 most relevant references that have GitHub repositories.
436+
437+
Focus on:
438+
1. **References section analysis** - Extract all citations from the References/Bibliography part
439+
2. References with high-quality GitHub implementations
440+
3. Papers cited for methodology, algorithms, or core techniques
441+
4. Related work that shares similar technical approaches
442+
5. Implementation references that could provide code patterns
443+
444+
Goal: Find the most valuable GitHub repositories from the paper's reference list for code implementation reference."""
433445

434446
async with reference_analysis_agent:
435447
print("Reference analyzer: Connected to server, analyzing references...")
436448
analyzer = await reference_analysis_agent.attach_llm(get_preferred_llm_class())
437449

438-
# Set higher token output for reference analysis
439-
reference_params = RequestParams(
440-
max_tokens=30000,
441-
temperature=0.2,
442-
)
443-
444-
reference_result = await analyzer.generate_str(
445-
message=analysis_result, request_params=reference_params
446-
)
450+
reference_result = await analyzer.generate_str(message=message)
447451
return reference_result
448452

449453

@@ -576,7 +580,7 @@ async def orchestrate_reference_intelligence_agent(
576580

577581
# Execute reference analysis
578582
reference_result = await paper_reference_analyzer(
579-
dir_info["standardized_text"], logger
583+
dir_info["paper_dir"], logger
580584
)
581585

582586
# Save reference analysis result

workflows/codebase_index_workflow.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -711,7 +711,7 @@ async def main():
711711
logger = logging.getLogger(__name__)
712712

713713
# 测试参数
714-
paper_dir = "./deepcode_lab/papers/2"
714+
paper_dir = "./deepcode_lab/papers/1"
715715
initial_plan_path = os.path.join(paper_dir, "initial_plan.txt")
716716

717717
# 运行工作流

0 commit comments

Comments
 (0)