#!/usr/bin/env python3
"""
ID: OPS-023

Summarization pipeline for codebase trace descriptions.

Traverses configured directories, preprocesses code files, summarizes each
file using a transformer model, handles short files via batching, and writes
the aggregated summaries as JSON to `trace_description_intermediate.json`.

This version uses token-based max_length to prevent "index out of range"
errors.
"""

import json
import re
from pathlib import Path

from transformers import pipeline, BartTokenizer

# -------------------------
# Configuration
# -------------------------
CODE_DIRS = [
    "api/src",
    "api/tests/unit",
    "scripts",
    "Gonk/GonkUI/views",
]

OUTPUT_FILE = "trace_description_intermediate.json"

# Minimum number of model tokens required to attempt summarization;
# shorter inputs are skipped (safe_summarize returns None).
MIN_TOKENS = 10

# Fraction of the input token count used as the summary max_length
MAX_LENGTH_RATIO = 0.6

# Word-count threshold below which files are collected and summarized
# together as a single batch
BATCH_THRESHOLD = 20

# Initialize summarizer and tokenizer
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")


# -------------------------
# Helper Functions
# -------------------------
def preprocess_code(code: str) -> str:
    """Remove comments, docstrings, and excess whitespace from code."""
    code = re.sub(r"#.*", "", code)
    code = re.sub(r'""".*?"""', "", code, flags=re.DOTALL)
    code = re.sub(r"\n\s*\n", "\n", code)
    return code.strip()


def safe_summarize(text: str) -> str | None:
    """
    Summarize text with a dynamic, token-based max_length.

    Returns None if summarization fails or the text is too short.
    """
    # Tokenize to get the actual model token count
    inputs = tokenizer(text, return_tensors="pt")
    input_len = inputs.input_ids.size(1)

    if input_len < MIN_TOKENS:
        return None

    max_len = max(MIN_TOKENS, int(input_len * MAX_LENGTH_RATIO))

    try:
        # truncation=True keeps over-length inputs within the model's
        # token limit instead of raising an index error
        result = summarizer(
            text,
            max_length=max_len,
            min_length=5,
            do_sample=False,
            truncation=True,
        )
        return result[0]["summary_text"]
    except Exception as e:
        print(f"[WARN] Failed to summarize text: {e}")
        return None


def gather_files() -> list[Path]:
    """Recursively gather all Python files from configured directories."""
    files = []
    for directory in CODE_DIRS:
        path = Path(directory)
        if path.exists():
            files.extend([f for f in path.rglob("*.py") if f.is_file()])
    return files


# -------------------------
# Main Pipeline
# -------------------------
def main():
    files = gather_files()
    summaries = {}
    short_batch = []

    # Process each file
    for file in files:
        try:
            content = preprocess_code(file.read_text(encoding="utf-8"))
        except Exception as e:
            print(f"[WARN] Failed to read {file}: {e}")
            continue

        if len(content.split()) < BATCH_THRESHOLD:
            short_batch.append((file, content))
            continue

        summary = safe_summarize(content)
        if summary:
            summaries[str(file)] = summary
        else:
            print(f"[WARN] Skipping {file} due to summarization failure")

    # Handle batched short files
    if short_batch:
        combined_text = "\n\n".join(c for f, c in short_batch)
        combined_summary = safe_summarize(combined_text)
        if combined_summary:
            for file, _ in short_batch:
                summaries[str(file)] = combined_summary
        else:
            for file, _ in short_batch:
                print(f"[WARN] Skipping short file {file} due to summarization failure")

    # Write JSON output
    try:
        with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
            json.dump(summaries, f, indent=2)
        print(f"[INFO] Trace summaries written to {OUTPUT_FILE} ({len(summaries)} files summarized)")
    except Exception as e:
        print(f"[ERROR] Failed to write JSON: {e}")


if __name__ == "__main__":
    main()
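

# -------------------------
# Optional output sanity check
# -------------------------
# A minimal sketch of how a downstream step might confirm the written file
# is usable, assuming the flat {file_path: summary_text} mapping produced by
# main(). The helper name `check_output` is illustrative, not part of the
# pipeline above; call it manually (e.g. from a REPL) after a run.
def check_output(path: str = OUTPUT_FILE) -> bool:
    """Return True if the summary file exists and every value is a non-empty string."""
    summary_path = Path(path)
    if not summary_path.exists():
        print(f"[WARN] {path} not found")
        return False
    data = json.loads(summary_path.read_text(encoding="utf-8"))
    bad = [k for k, v in data.items() if not isinstance(v, str) or not v.strip()]
    if bad:
        print(f"[WARN] {len(bad)} entries have empty or non-string summaries")
        return False
    print(f"[INFO] {len(data)} summaries look well-formed")
    return True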