namefi-openhands/summarize_reports.py at main · CambioML/namefi-openhands · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
#!/usr/bin/env python3

import argparse
import glob
import os
import sys
import yaml
# import tiktoken # No longer needed for Gemini summarization pre-check
# import openai # Switching to Google
import google.generativeai as genai
import json
import csv
import re
from datetime import datetime
from dotenv import load_dotenv
from collections import defaultdict
import time

# Load environment variables from .env
load_dotenv()

GEMINI_MODEL_NAME = "models/gemini-2.5-flash-preview-05-20"
# GEMINI_MODEL_NAME = "models/gemini-2.5-pro-preview-05-06"

# Read ClickUp Team ID from environment variable or use default
CLICKUP_TEAM_ID = os.environ.get('CLICKUP_TEAM_ID', "9009140026")

# Initialize Google Generative AI client
if not os.environ.get("GOOGLE_API_KEY"):
    print("Error: GOOGLE_API_KEY environment variable not set")
    sys.exit(1)

try:
    genai.configure(api_key=os.environ["GOOGLE_API_KEY"])
except Exception as e:
    print(f"Error configuring Google Generative AI: {e}")
    sys.exit(1)

# client = openai.OpenAI() # Remove OpenAI client

def count_gemini_model_tokens(text, model_name=GEMINI_MODEL_NAME):
    """Counts tokens using the google-generativeai library for a specific model."""
    try:
        model = genai.GenerativeModel(model_name)
        return model.count_tokens(text).total_tokens
    except Exception as e:
        print(f"Warning: Could not count Gemini tokens for model {model_name}: {e}")
        return -1 # Return an indicator of error

def load_user_map(user_map_path):
    """Load user mappings from YAML file."""
    try:
        with open(user_map_path, 'r') as file:
            user_map = yaml.safe_load(file)
        return user_map
    except Exception as e:
        print(f"Error loading user map: {e}")
        return None

def replace_user_mentions(text, user_map):
    """
    Replace GitHub and ClickUp user references with appropriate links (mailto for emails, Slack for others).
    Handles common formats while preserving existing links.
    """
    if not user_map:
        return text

    github_map = user_map.get('github_user_to_slack_id', {})
    clickup_map = user_map.get('clickup_user_to_slack_id', {})

    # Create a more structured user detail map, prioritizing ClickUp email for mailto links
    user_details_by_slack_id = {}
    for email, slack_id in clickup_map.items():
        if slack_id:
            name_part = email.split('@')[0]
            derived_name = f"{name_part.split('.')[0].capitalize()} {name_part.split('.', 1)[-1].capitalize() if '.' in name_part else ''}".strip()
            if not derived_name:
                 derived_name = name_part.capitalize()
            user_details_by_slack_id[slack_id] = {
                'slack_id': slack_id,
                'email': email,
                'derived_name': derived_name,
                'github_user': None
            }

    for github_user, slack_id in github_map.items():
        if slack_id:
            if slack_id in user_details_by_slack_id:
                user_details_by_slack_id[slack_id]['github_user'] = github_user
            else:
                # User only has GitHub mapping, no direct email from ClickUp map
                user_details_by_slack_id[slack_id] = {
                    'slack_id': slack_id,
                    'email': None, # No email to create mailto link
                    'derived_name': github_user, # Use GitHub handle as name
                    'github_user': github_user
                }

    # Process replacements iteratively
    for details in user_details_by_slack_id.values():
        # Determine the primary mention: mailto if email exists, otherwise Slack mention
        primary_mention_text = details['derived_name'] or details['github_user']
        if details['email']:
            mention_link = f"mailto:{details['email']}"
            replacement_string = f"[{primary_mention_text}]({mention_link})"
        else:
            mention_link = f"<@{details['slack_id']}>" # Fallback to Slack mention
            replacement_string = mention_link # Slack mentions are not typically hyperlinked text

        patterns_to_replace = []

        # Add pattern for ClickUp derived name (e.g., "First Last")
        if details['derived_name'] and details['derived_name'] != details['github_user']:
             patterns_to_replace.append(rf'\b{re.escape(details["derived_name"])}\b')

        # Add pattern for ClickUp email
        if details['email']:
            patterns_to_replace.append(rf'\b{re.escape(details["email"])}\b')

        # Add GitHub username patterns if they haven't been effectively covered by derived_name
        if details['github_user']:
            gh_user_escaped = re.escape(details['github_user'])
            # Standard @mention
            patterns_to_replace.append(rf'@{gh_user_escaped}\b')
            # "by username" format
            patterns_to_replace.append(rf'\bby {gh_user_escaped}\b')
            # Markdown linked @mention: e.g. [@username](github.com/username)
            # We want to replace "@username" part with the mailto/Slack link, keeping the GitHub URL
            # This specific replacement is complex due to nested links, handle with care or simplify if needed
            # For now, the generic replacement below might handle some cases if the name matches

        # Apply replacements, being careful not to corrupt URLs or existing markdown links
        parts = re.split(r'(\[[^\]]*\]\([^)]*\)|https?://\S+)', text) # Split by existing markdown links or raw URLs
        new_parts = []
        for i, part in enumerate(parts):
            is_link_or_url = i % 2 == 1 # Existing links/URLs are at odd indices
            if not is_link_or_url and part:
                processed_part = part
                for pattern in patterns_to_replace:
                    # Replace standalone occurrences of the pattern
                    processed_part = re.sub(pattern, replacement_string, processed_part)
                new_parts.append(processed_part)
            elif part: # Keep existing links/URLs as they are
                new_parts.append(part)
        text = ''.join(new_parts)

    return text

def extract_json_data(file_path):
    """Extract and summarize relevant data from JSON files."""
    file_name = os.path.basename(file_path)
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            try:
                data = json.load(f)
            except json.JSONDecodeError:
                return f"Could not decode JSON from {file_name}"

        summary = f"## Summary from {file_name}\\n\\n"

        if 'tasks_raw_data' in file_name:
            if isinstance(data, list):
                completed_tasks = []
                wip_tasks = []
                for task in data:
                    if isinstance(task, dict):
                        task_name = task.get('name', 'Untitled Task')
                        task_status = task.get('status', {}).get('status', 'Unknown').lower()
                        task_url = task.get('url', '')
                        task_id = task.get('id', 'Unknown')
                        item = f"- [{task_name}]({task_url}) (ID: {task_id})"
                        if 'done' in task_status or 'complete' in task_status or 'closed' in task_status:
                            completed_tasks.append(item)
                        else:
                            wip_tasks.append(item)

                if completed_tasks:
                    summary += "### Completed Tasks:\n" + '\n'.join(completed_tasks) + "\n\n"
                if wip_tasks:
                    summary += "### Other Tasks (WIP/Open):\n" + '\n'.join(wip_tasks[:30]) + "\n" # Limit WIP for brevity
                    if len(wip_tasks) > 30:
                        summary += f"*...and {len(wip_tasks) - 30} more open tasks*\n"
                if not completed_tasks and not wip_tasks:
                     summary += "No relevant task data found.\n"
            else:
                summary += "JSON structure not recognized as task list.\\n"

        elif 'commit_raw_data' in file_name: # Handles both *_commit_raw_data.json and *_repo_commits_raw_data.json
            section_title = "### All Repository Commits (from JSON):\\n" if 'repo_commits_raw_data' in file_name else "### Recent Commits (from PRs):\\n"
            summary += section_title
            if isinstance(data, list):
                commit_count = 0
                for i, commit_info in enumerate(data):
                    if isinstance(commit_info, dict) and 'commit' in commit_info:
                        commit = commit_info['commit']
                        sha = commit_info.get('sha', 'Unknown SHA')[:7] # Short SHA
                        message = commit.get('message', 'No message').split('\n')[0] # First line
                        author_name = commit.get('author', {}).get('name', 'Unknown Author')
                        html_url = commit_info.get('html_url', '')
                        summary += f"- [{sha}]({html_url}): {message} (by {author_name})\n"
                        commit_count += 1
                        if commit_count >= 30: # Limit commits shown
                            break
                if len(data) > 30:
                     summary += f"*...and {len(data) - 30} more commits*\n"
                if commit_count == 0:
                    summary += "No commit data found.\n"
            else:
                summary += "JSON structure not recognized as commit list.\n"

        elif 'pr_raw_data' in file_name:
            if isinstance(data, list):
                summary += "### Recent Pull Requests:\n"
                pr_count = 0
                for i, pr in enumerate(data):
                    if isinstance(pr, dict):
                        pr_number = pr.get('number', '#?')
                        title = pr.get('title', 'No Title')
                        user = pr.get('user', {}).get('login', 'Unknown User')
                        html_url = pr.get('html_url', '')
                        state = pr.get('state', 'unknown')
                        summary += f"- [PR #{pr_number}]({html_url}): {title} (by {user}, state: {state})\n"
                        pr_count += 1
                        if pr_count >= 30: # Limit PRs shown
                            break
                if len(data) > 30:
                     summary += f"*...and {len(data) - 30} more pull requests*\n"
                if pr_count == 0:
                    summary += "No pull request data found.\n"
            else:
                summary += "JSON structure not recognized as PR list.\n"
        else:
             summary += "Unrecognized JSON file type.\n"

        return summary
    except Exception as e:
        return f"Error extracting JSON data from {file_name}: {e}"

def extract_csv_data(file_path):
    """Extract and summarize relevant data from CSV files."""
    file_name = os.path.basename(file_path)
    try:
        rows = []
        with open(file_path, 'r', encoding='utf-8') as f:
            try:
                # Handle potential empty files or different dialects
                sample = f.read(1024)
                if not sample:
                    return f"CSV file {file_name} is empty."
                f.seek(0)
                dialect = csv.Sniffer().sniff(sample)
                reader = csv.DictReader(f, dialect=dialect)
                headers = reader.fieldnames
                if not headers:
                     return f"Could not determine headers for CSV {file_name}."
                rows = list(reader)
            except csv.Error as csv_e:
                 return f"Error reading CSV {file_name}: {csv_e}"
            except Exception as read_e: # Catch other file reading issues
                return f"Error opening or reading CSV {file_name}: {read_e}"

        if not rows:
            return f"No data rows found in CSV file {file_name}."

        summary = f"## Summary from {file_name}\\n\\n"
        processed_count = 0
        max_items = 30 # Limit items processed per CSV

        if 'tasks_updates' in file_name:
            summary += "### Recent Task Updates:\\n"
            # Adjust column names based on typical ClickUp CSV exports if needed
            task_id_col = next((h for h in headers if 'task id' in h.lower()), None)
            update_col = next((h for h in headers if 'comment' in h.lower() or 'update' in h.lower()), 'Unknown Update')
            author_col = next((h for h in headers if 'author' in h.lower()), 'Unknown Author')

            for row in rows:
                task_id = row.get(task_id_col, '')
                update_text = row.get(update_col, 'No update text')
                author = row.get(author_col, 'Unknown')
                if task_id and update_text:
                    summary += f"- Task {task_id}: {update_text[:150]}... (by {author})\n"
                    processed_count += 1
                    if processed_count >= max_items: break

        elif 'commit_data' in file_name: # Handles both *_commit_data.csv and *_repo_commits_data.csv
            section_title = "### All Repository Commits (from CSV):\\n" if 'repo_commits_data' in file_name else "### Commit Overview (from PRs):\\n"
            summary += section_title
            sha_col = next((h for h in headers if 'sha' in h.lower()), None)
            msg_col = next((h for h in headers if 'message' in h.lower() or 'title' in h.lower()), None) # Check for 'title' as well
            author_col = next((h for h in headers if 'author' in h.lower()), None)

            for row in rows:
                sha = row.get(sha_col, 'Unknown SHA')[:7]
                message = row.get(msg_col, 'No message').split('\n')[0]
                author = row.get(author_col, 'Unknown Author')
                if sha and message:
                    summary += f"- {sha}: {message} (by {author})\n"
                    processed_count += 1
                    if processed_count >= max_items: break

        elif 'pr_data' in file_name: # Simple PR list
            summary += "### Pull Request Overview:\n"
            num_col = next((h for h in headers if 'number' in h.lower()), None)
            title_col = next((h for h in headers if 'title' in h.lower()), None)
            user_col = next((h for h in headers if 'user' in h.lower()), None)
            state_col = next((h for h in headers if 'state' in h.lower()), None)

            for row in rows:
                number = row.get(num_col, '#?')
                title = row.get(title_col, 'No Title')
                user = row.get(user_col, 'Unknown User')
                state = row.get(state_col, 'unknown')
                if number and title:
                     summary += f"- PR #{number}: {title} (by {user}, state: {state})\n"
                     processed_count += 1
                     if processed_count >= max_items: break
        else:
            summary += f"Unrecognized CSV file type ({file_name}). Generic row sample:\n"
            for i, row in enumerate(rows):
                summary += f"- Row {i+1}: {str(dict(row))[:200]}...\n"
                processed_count += 1
                if processed_count >= 5: break # Limit generic sample

        if len(rows) > processed_count:
            summary += f"\n*...processed {processed_count} of {len(rows)} rows*\n"
        if processed_count == 0:
            summary += "No relevant data extracted.\n"

        return summary
    except Exception as e:
        return f"Error extracting CSV data from {file_name}: {e}"

# def count_tokens(text, model="gpt-4o-mini"): # Remove or replace with Gemini counter if needed
#     """Count tokens using tiktoken for OpenAI models."""
#     try:
#         encoding = tiktoken.encoding_for_model(model)
#         tokens = encoding.encode(text)
#         return len(tokens)
#     except Exception as e:
#         print(f"Warning: Error counting tokens: {e}")
#         return 0

def summarize_with_gemini(content, model=GEMINI_MODEL_NAME):
    """Generate a summary using the specified Gemini model, streaming the response."""
    global CLICKUP_TEAM_ID # Make sure we can access the global team ID
    try:
        gemini_model = genai.GenerativeModel(model)
        prompt = (
            "You are a highly skilled executive summary writer for a software development team. "
            "Your task is to create an extremely concise, structured summary of team updates from ClickUp reports and GitHub activity. "
            f"The ClickUp Team ID to use for constructing links is: {CLICKUP_TEAM_ID}. "
            "The ENTIRE summary, including all sections, MUST NOT exceed 800 words. Be aggressive in summarization and combination.\n\n"
            "Your summary MUST follow this exact structure, using unordered lists (hyphens or asterisks) for bullet points:\n\n"
            "# Executive Summary\n"
            "Provide a highly specific and concise 1-2 sentence summary of the most critical achievements and overall progress. Focus on unique, impactful outcomes.\n\n"
            "# Features\n"
            f"- Extremely brief description of the new feature or enhancement. [Firstname1](mailto:email1) [NFI-123](https://app.clickup.com/t/{CLICKUP_TEAM_ID}/NFI-123) [PR456](link_to_pr)\n"
            "  (Combine related items. Rank by importance. List all relevant links directly after contributors.)\n\n"
            "# Bugs Fixed\n"
            f"- Extremely brief description of the bug and its solution. [Firstname1](mailto:email1) [NFI-789](https://app.clickup.com/t/{CLICKUP_TEAM_ID}/NFI-789) [PR101](link_to_pr)\n"
            "  (Combine related items. Rank by importance. List all relevant links directly after contributors.)\n\n"
            "# Others\n"
            "- Extremely brief description of other important updates (infrastructure, documentation, research). [Firstname1](mailto:email1) [Link1](link) [Link2](link)\n"
            "  (Combine related items. Rank by importance. List all relevant links directly after contributors.)\n\n"
            "Guidelines for all sections:\n"
            "- AGGRESSIVELY COMBINE closely related points into single bullet points to save space.\n"
            "- Each bullet point must start with an extremely brief description. Immediately follow with all unique contributors involved, using their FIRST NAME ONLY (e.g., [Sami](mailto:sami@example.com)). Then, list relevant ClickUp Task IDs and GitHub PRs. All these on the same line.\n"
            f"- For ClickUp Task IDs: If an NFI-{{number}} style Custom ID (e.g., NFI-123) is present in the input for a task, YOU MUST use that Custom ID as the link text AND in the URL, like: [NFI-123](https://app.clickup.com/t/{CLICKUP_TEAM_ID}/NFI-123). If no NFI-{{number}} Custom ID is available, use its short alphanumeric ID (e.g., 86a8bd6tv) as the link text AND in the URL, like: [86a8bd6tv](https://app.clickup.com/t/{CLICKUP_TEAM_ID}/86a8bd6tv).\n"
            "- Do NOT use prefixes like 'ClickUp:', 'GitHub:', 'Contributors:', or 'Links:'.\n"
            "- For GitHub PRs, use format like [PR #123](github_pr_url). Preserve ALL original full URLs for GitHub links.\n"
            "- Ensure contributor FIRST NAMES are hyperlinked with their mailto:email if available from the input. If no email, just state their first name.\n"
            "- Do NOT use backticks around contributor names.\n"
            "- Use unordered lists (e.g., starting with '-' or '* ') for all items.\n"
            "- STRICT ADHERENCE TO THE 800-WORD LIMIT FOR THE ENTIRE SUMMARY IS CRITICAL.\n\n"
            "---\n"
            "Here is the combined report data to summarize:\n\n"
            f"{content}"
        )

        start_time = time.time()
        print("Sending request to Gemini model (streaming enabled)...")

        response_stream = gemini_model.generate_content(
            prompt,
            generation_config=genai.types.GenerationConfig(temperature=0.15),
            stream=True
        )

        full_response_text = ""
        print("\n--- Gemini Summary Stream --- QTLABIO ---")
        for chunk in response_stream:
            if chunk.text:
                print(chunk.text, end="", flush=True)
                full_response_text += chunk.text
        print("\n--- End of Gemini Summary Stream --- QTLABIO ---")

        end_time = time.time()
        print(f"\nGemini summarization (streaming) took {end_time - start_time:.2f} seconds.")

        if not full_response_text:
            feedback_info = "No specific feedback available from stream object."
            try:
                 if hasattr(response_stream, 'prompt_feedback') and response_stream.prompt_feedback:
                     safety_ratings = response_stream.prompt_feedback.safety_ratings
                     feedback_info = f"Safety Ratings: {safety_ratings}"
                 elif hasattr(response_stream, 'parts') and response_stream.parts and not response_stream.parts[0].text:
                     feedback_info = "Stream finished but produced no text, possibly blocked."
            except Exception as feedback_ex:
                print(f"Error accessing stream feedback: {feedback_ex}")

            error_message = (f"Error: Gemini response stream was empty or potentially blocked. {feedback_info}")
            print(error_message)
            return error_message

        return full_response_text

    except Exception as e:
        print(f"Error during Gemini summarization: {e}")
        return f"Error generating summary: {e}"

def determine_report_context(timestamp_keys):
    """Analyzes timestamp keys to determine a common report type and end date/time."""
    if not timestamp_keys:
        return "custom_reports", "Multiple Time Periods", "Multiple Reports"

    # Check for a common prefix like "Weekly", "Daily", or "Xh"
    # Example key: Weekly-ClickUp-2025-05-11-10-33-09
    # Example key: Weekly-2025-05-11-10-33-09
    # Example key: 168h-ClickUp-2025-05-11-10-33-09 (assuming h for hours)

    common_type = None
    report_dates = []
    type_counts = defaultdict(int)

    for key in timestamp_keys:
        # Try to extract type (Weekly, Daily, Xh) and date
        match_weekly = re.match(r'Weekly-(?:ClickUp-)?(\d{4}-\d{2}-\d{2}-\d{2}-\d{2}-\d{2})', key)
        match_daily = re.match(r'Daily-(?:ClickUp-)?(\d{4}-\d{2}-\d{2}-\d{2}-\d{2}-\d{2})', key)
        match_xh = re.match(r'(\d+)h-(?:ClickUp-)?(\d{4}-\d{2}-\d{2}-\d{2}-\d{2}-\d{2})', key)
        # Add other patterns as needed, e.g., specific hour ranges

        current_type = None
        current_date_str = None

        if match_weekly:
            current_type = "Weekly"
            current_date_str = match_weekly.group(1)
        elif match_daily:
            current_type = "Daily"
            current_date_str = match_daily.group(1)
        elif match_xh:
            hours = match_xh.group(1)
            current_type = f"{hours}H"
            current_date_str = match_xh.group(2)

        if current_type and current_date_str:
            type_counts[current_type] += 1
            try:
                # Convert to datetime for comparison, then back to desired string format
                dt_obj = datetime.strptime(current_date_str, '%Y-%m-%d-%H-%M-%S')
                report_dates.append(dt_obj)
            except ValueError:
                pass # Ignore if date parsing fails for a key

    # Determine the most common type
    if type_counts:
        primary_type = max(type_counts, key=type_counts.get)
        # If all keys are of this primary type, consider it consistent
        if type_counts[primary_type] == len(timestamp_keys) and report_dates:
            # Check if all dates are very close (e.g., within a small tolerance if needed)
            # For simplicity, if all are the same type, we'll use the latest date as the representative end date.
            latest_date = max(report_dates)
            end_date_str_for_title = latest_date.strftime('%Y-%m-%d %H:%M:%S')
            end_date_str_for_filename = latest_date.strftime('%Y_%m_%d_%H_%M_%S')

            filename_prefix = f"unified_summary_{primary_type}_{end_date_str_for_filename}"
            title_context = f"{primary_type} Report ending {end_date_str_for_title}"
            datasource_context = title_context # Use the same for datasources if consistent
            return filename_prefix, title_context, datasource_context

    # Fallback if not consistent or pattern not matched for all
    filename_prefix = "unified_summary_custom_range"
    title_context = "Reports from Multiple Time Periods"
    # Join the first few keys for a more descriptive datasource if diverse
    datasource_context = ", ".join(list(timestamp_groups.keys())[:3])
    if len(timestamp_groups.keys()) > 3:
        datasource_context += " and others"
    else:
        datasource_context = ", ".join(list(timestamp_groups.keys()))

    return filename_prefix, title_context, datasource_context

def process_files(
    glob_patterns,
    user_map,
    output_dir="output",
    model=GEMINI_MODEL_NAME,
    debug=False
):
    """Process all files matching the glob patterns, combining ALL timestamp groups into one summary."""
    global timestamp_groups # Make it accessible to determine_report_context fallback
    all_files = []

    # Collect all files matching the patterns
    for pattern in glob_patterns:
        matching_files = glob.glob(pattern)
        all_files.extend(matching_files)

    if not all_files:
        print(f"No files found matching patterns: {glob_patterns}")
        return

    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Group files by their timestamp patterns
    timestamp_groups = defaultdict(list)

    for file_path in all_files:
        file_name = os.path.basename(file_path)

        # Extract the timestamp part from filenames like Weekly-ClickUp-2025-05-11-10-33-09-
        # or Weekly-2025-05-11-15-53-50-
        match = re.search(r'(Weekly-(?:ClickUp-)?(\d{4}-\d{2}-\d{2}-\d{2}-\d{2}-\d{2}))', file_name)
        if match:
            timestamp_key = match.group(1)  # Full prefix like "Weekly-ClickUp-2025-05-11-10-33-09"
            timestamp_groups[timestamp_key].append(file_path)
        else:
            # If no timestamp pattern is found, use the file without grouping
            timestamp_groups[file_name].append(file_path)

    # Initialize an overall content string for ALL timestamp groups
    all_groups_content = f"# Combined Team Updates for All Periods\n\n"

    # Process each timestamp group and collect their content
    for timestamp_key, file_paths in timestamp_groups.items():
        print(f"Processing group: {timestamp_key}")

        # Add a section header for this timestamp group
        all_groups_content += f"# Team Updates for {timestamp_key}\n\n"

        # Process each file in the group
        for file_path in sorted(file_paths):
            file_name = os.path.basename(file_path)
            file_ext = os.path.splitext(file_name)[1].lower()

            print(f"  Including: {file_name}")

            try:
                # Extract data based on file type
                if file_ext == '.md':
                    with open(file_path, 'r', encoding='utf-8') as f:
                        file_content = f.read()

                    # Add section header for this file
                    all_groups_content += f"## Content from {file_name}\n\n{file_content}\n\n"

                elif file_ext == '.json':
                    json_summary = extract_json_data(file_path)
                    all_groups_content += f"## Data from {file_name}\n\n{json_summary}\n\n"

                elif file_ext == '.csv':
                    csv_summary = extract_csv_data(file_path)
                    all_groups_content += f"## Data from {file_name}\n\n{csv_summary}\n\n"

                # Add separator between files
                all_groups_content += "---\n\n"

            except Exception as e:
                print(f"  Error processing file {file_name}: {e}")

        # Add a major separator between timestamp groups
        all_groups_content += "\n\n## ==========================================\n\n"

    # Replace user mentions in the combined content first
    content_with_mentions = replace_user_mentions(all_groups_content, user_map)

    token_count = count_gemini_model_tokens(content_with_mentions, model_name=model)
    if token_count != -1:
        print(f"Total token count for Gemini model '{model}' before sending: {token_count}")
    else:
        print(f"Could not determine token count for model '{model}'. Proceeding with summarization...")

    if token_count > 150000:
        print(f"WARNING: Combined content token count ({token_count}) is very large. This may approach or exceed the model's context limit and could lead to incomplete results or errors.")

    print("Generating combined summary for ALL timestamp groups...")
    summary = summarize_with_gemini(content_with_mentions, model)

    # Determine filename and title context
    filename_base, title_text, datasource_text = determine_report_context(list(timestamp_groups.keys()))

    current_timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
    output_file = os.path.join(
        output_dir,
        f"{filename_base}_{current_timestamp}.md"
    )

    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(f"# Unified Summary for {title_text}\n\n")
        f.write(f"*Generated on {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}*\n\n")
        f.write(f"*Data sources: {datasource_text}*\n\n")
        f.write(summary)

    print(f"Unified summary for ALL groups saved to: {output_file}")

if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Summarize ClickUp weekly reports and GitHub activity using OpenAI 4.1-mini."
    )
    parser.add_argument(
        "--user-map",
        default="user_map.yml",
        help="Path to YAML file mapping ClickUp users to Slack IDs"
    )
    parser.add_argument(
        "--output-dir",
        default="output",
        help="Directory to save summary files"
    )
    parser.add_argument(
        "--model",
        default=GEMINI_MODEL_NAME,
        help="Gemini model to use for summarization"
    )
    parser.add_argument(
        "glob_patterns",
        nargs="+",
        help="Glob patterns to match files (e.g., 'output/Weekly-ClickUp-*.md')"
    )

    args = parser.parse_args()

    # Load user map
    user_map = load_user_map(args.user_map)
    if user_map is None:
        sys.exit(1)

    # Process matching files
    process_files(args.glob_patterns, user_map, args.output_dir, args.model)