Les enseignants ont besoin de moyennes à jour immédiatement après la publication ou modification des notes, sans attendre un batch nocturne. Le système recalcule via Domain Events synchrones : statistiques d'évaluation (min/max/moyenne/médiane), moyennes matières pondérées (normalisation /20), et moyenne générale par élève. Les résultats sont stockés dans des tables dénormalisées avec cache Redis (TTL 5 min). Trois endpoints API exposent les données avec contrôle d'accès par rôle. Une commande console permet le backfill des données historiques au déploiement.
286 lines
10 KiB
Python
Executable File
286 lines
10 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
"""Deterministic pre-pass for prompt craft scanner.
|
|
|
|
Extracts metrics and flagged patterns from SKILL.md and prompt files
|
|
so the LLM scanner can work from compact data instead of reading raw files.
|
|
|
|
Covers:
|
|
- SKILL.md line count and section inventory
|
|
- Overview section size
|
|
- Inline data detection (tables, fenced code blocks)
|
|
- Defensive padding pattern grep
|
|
- Meta-explanation pattern grep
|
|
- Back-reference detection ("as described above")
|
|
- Config header and progression condition presence per prompt
|
|
- File-level token estimates (chars / 4 rough approximation)
|
|
"""
|
|
|
|
# /// script
|
|
# requires-python = ">=3.9"
|
|
# ///
|
|
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import json
|
|
import re
|
|
import sys
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
|
|
|
|
# Phrases that add tokens without adding instruction value.
# Each entry is (regex, category, human-readable label).  The regexes carry
# their own first-letter case classes, so they are applied without flags.
WASTE_PATTERNS = [
    # --- defensive padding ---
    (
        r'\b[Mm]ake sure (?:to|you)\b',
        'defensive-padding',
        'Defensive: "make sure to/you"',
    ),
    (
        r"\b[Dd]on'?t forget (?:to|that)\b",
        'defensive-padding',
        "Defensive: \"don't forget\"",
    ),
    (
        r'\b[Rr]emember (?:to|that)\b',
        'defensive-padding',
        'Defensive: "remember to/that"',
    ),
    (
        r'\b[Bb]e sure to\b',
        'defensive-padding',
        'Defensive: "be sure to"',
    ),
    (
        r'\b[Pp]lease ensure\b',
        'defensive-padding',
        'Defensive: "please ensure"',
    ),
    (
        r'\b[Ii]t is important (?:to|that)\b',
        'defensive-padding',
        'Defensive: "it is important"',
    ),
    # --- meta-explanation ---
    (
        r'\b[Yy]ou are an AI\b',
        'meta-explanation',
        'Meta: "you are an AI"',
    ),
    (
        r'\b[Aa]s a language model\b',
        'meta-explanation',
        'Meta: "as a language model"',
    ),
    (
        r'\b[Aa]s an AI assistant\b',
        'meta-explanation',
        'Meta: "as an AI assistant"',
    ),
    (
        r'\b[Tt]his (?:workflow|skill|process) is designed to\b',
        'meta-explanation',
        'Meta: "this workflow is designed to"',
    ),
    (
        r'\b[Tt]he purpose of this (?:section|step) is\b',
        'meta-explanation',
        'Meta: "the purpose of this section is"',
    ),
    # --- filler ---
    (
        r"\b[Ll]et'?s (?:think about|begin|start)\b",
        'filler',
        "Filler: \"let's think/begin\"",
    ),
    (
        r'\b[Nn]ow we(?:\'ll| will)\b',
        'filler',
        "Filler: \"now we'll\"",
    ),
]
|
|
|
|
# Phrases that point at text elsewhere in the document, which is a risk when
# a prompt has to be understandable on its own.
# Each entry is (regex, human-readable label).
BACKREF_PATTERNS = [
    (
        r'\bas described above\b',
        'Back-reference: "as described above"',
    ),
    (
        r'\bper the overview\b',
        'Back-reference: "per the overview"',
    ),
    (
        r'\bas mentioned (?:above|in|earlier)\b',
        'Back-reference: "as mentioned above/in/earlier"',
    ),
    (
        r'\bsee (?:above|the overview)\b',
        'Back-reference: "see above/the overview"',
    ),
    (
        r'\brefer to (?:the )?(?:above|overview|SKILL)\b',
        'Back-reference: "refer to above/overview"',
    ),
]
|
|
|
|
|
|
def count_tables(content: str) -> tuple[int, int]:
    """Return ``(table_count, table_lines)`` for markdown tables in *content*.

    A line is treated as a table row when its first non-whitespace character
    is ``|``.  Each maximal run of consecutive row lines counts as one table;
    ``table_lines`` is the total number of row lines across all tables.
    """
    tables = 0
    rows = 0
    previous_was_row = False
    for text in content.split('\n'):
        is_row = re.match(r'^\s*\|', text) is not None
        if is_row:
            rows += 1
            # A row that does not follow another row opens a new table.
            if not previous_was_row:
                tables += 1
        previous_was_row = is_row
    return tables, rows
|
|
|
|
|
|
def count_fenced_blocks(content: str) -> tuple[int, int]:
    """Return ``(block_count, block_lines)`` for fenced code blocks.

    Any line whose stripped text starts with three backticks toggles the
    "inside a block" state.  ``block_count`` is the number of opening fences;
    ``block_lines`` is the number of lines seen while inside a block, not
    counting the fence lines themselves.  An unclosed block runs to the end
    of the text.
    """
    opened = 0
    body_lines = 0
    inside = False
    for raw in content.split('\n'):
        if raw.strip().startswith('```'):
            inside = not inside
            if inside:
                opened += 1
            continue
        if inside:
            body_lines += 1
    return opened, body_lines
|
|
|
|
|
|
def extract_overview_size(content: str) -> int:
    """Return the number of lines in the ``## Overview`` section.

    Counting starts after a line matching ``## Overview`` and stops at the
    next level-2 heading that is not itself an Overview heading.  Blank lines
    and deeper headings (``###``) inside the section are counted.  Returns 0
    when there is no Overview heading.
    """
    overview_heading = re.compile(r'^##\s+Overview\b')
    level2_heading = re.compile(r'^##\s')

    counted = 0
    started = False
    for text in content.split('\n'):
        if overview_heading.match(text):
            # The heading line itself is never counted.
            started = True
        elif not started:
            continue
        elif level2_heading.match(text):
            break
        else:
            counted += 1
    return counted
|
|
|
|
|
|
def scan_file_patterns(filepath: Path, rel_path: str) -> dict:
    """Extract metrics and pattern matches from a single file.

    Args:
        filepath: File to analyse; it is read as UTF-8 text.
        rel_path: Label stored under the ``'file'`` key of the result.

    Returns:
        A dict with the keys ``file``, ``line_count``, ``token_estimate``,
        ``sections``, ``table_count``, ``table_lines``,
        ``fenced_block_count``, ``fenced_block_lines``, ``waste_patterns``,
        ``back_references``, ``has_config_header`` and ``has_progression``.
        ``line_count`` does not count the empty string that follows a final
        newline, so an empty file has 0 lines and ``"a\\n"`` has 1.

    Raises:
        OSError: If the file cannot be read.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    from bisect import bisect_right

    content = filepath.read_text(encoding='utf-8')
    lines = content.split('\n')

    # split('\n') yields a trailing '' for newline-terminated text (and ['']
    # for empty text); exclude that artefact from the reported line count.
    newline_total = len(lines) - 1
    if not content or content.endswith('\n'):
        line_count = newline_total
    else:
        line_count = newline_total + 1

    # Token estimate (rough: chars / 4)
    token_estimate = len(content) // 4

    # Section inventory: level-2 and level-3 markdown headings.
    sections = []
    for number, text in enumerate(lines, 1):
        heading = re.match(r'^(#{2,3})\s+(.+)$', text)
        if heading:
            sections.append({
                'level': len(heading.group(1)),
                'title': heading.group(2).strip(),
                'line': number,
            })

    # Tables and code blocks
    table_count, table_lines = count_tables(content)
    block_count, block_lines = count_fenced_blocks(content)

    # Character offset at which each entry of `lines` starts.  Built once so
    # a match offset maps to its line by binary search instead of re-counting
    # newlines in the whole prefix for every match.
    line_starts = []
    offset = 0
    for text in lines:
        line_starts.append(offset)
        offset += len(text) + 1

    def locate(match):
        """Return (1-based line number, trimmed line text) for *match*."""
        line_num = bisect_right(line_starts, match.start())
        return line_num, lines[line_num - 1].strip()[:100]

    # Waste patterns are matched as written (no flags).
    waste_matches = []
    for pattern, category, label in WASTE_PATTERNS:
        for m in re.finditer(pattern, content):
            line_num, context = locate(m)
            waste_matches.append({
                'line': line_num,
                'category': category,
                'pattern': label,
                'context': context,
            })

    # Back-references are matched case-insensitively.
    backref_matches = []
    for pattern, label in BACKREF_PATTERNS:
        for m in re.finditer(pattern, content, re.IGNORECASE):
            line_num, context = locate(m)
            backref_matches.append({
                'line': line_num,
                'pattern': label,
                'context': context,
            })

    # Config header: presence of either language placeholder.
    has_config_header = (
        '{communication_language}' in content
        or '{document_output_language}' in content
    )

    # Progression condition: any keyword present, ignoring case.  The text is
    # lower-cased once rather than once per keyword.
    prog_keywords = ['progress', 'advance', 'move to', 'next stage',
                     'when complete', 'proceed to', 'transition', 'completion criteria']
    lowered = content.lower()
    has_progression = any(kw in lowered for kw in prog_keywords)

    return {
        'file': rel_path,
        'line_count': line_count,
        'token_estimate': token_estimate,
        'sections': sections,
        'table_count': table_count,
        'table_lines': table_lines,
        'fenced_block_count': block_count,
        'fenced_block_lines': block_lines,
        'waste_patterns': waste_matches,
        'back_references': backref_matches,
        'has_config_header': has_config_header,
        'has_progression': has_progression,
    }
|
|
|
|
|
|
def scan_prompt_metrics(skill_path: Path) -> dict:
    """Extract metrics from all prompt-relevant files in a skill directory.

    Scans ``SKILL.md`` (when it is a regular file), every other ``.md`` file
    directly under *skill_path*, and records sizes only for ``.md``,
    ``.json``, ``.yaml`` and ``.yml`` files directly under ``resources/``.

    Args:
        skill_path: Directory of the skill to scan.

    Returns:
        A JSON-serialisable dict with scanner metadata (including a UTC
        ISO-8601 ``timestamp``), ``skill_md_summary``, ``prompt_health``,
        ``aggregate`` totals, ``resource_sizes`` and the per-file ``files``
        list.  All ``skill_md_summary`` values are 0 when there is no
        ``SKILL.md`` file.

    Raises:
        OSError: If a directory cannot be listed or a file cannot be read.
        UnicodeDecodeError: If a scanned file is not valid UTF-8.
    """
    files_data = []

    # SKILL.md -- is_file() rather than exists(), so a directory that happens
    # to be named SKILL.md is skipped instead of failing in read_text().
    skill_md = skill_path / 'SKILL.md'
    if skill_md.is_file():
        data = scan_file_patterns(skill_md, 'SKILL.md')
        data['overview_lines'] = extract_overview_size(
            skill_md.read_text(encoding='utf-8')
        )
        data['is_skill_md'] = True
        files_data.append(data)

    # Prompt files at skill root (non-SKILL.md .md files)
    for entry in sorted(skill_path.iterdir()):
        if entry.is_file() and entry.suffix == '.md' and entry.name != 'SKILL.md':
            data = scan_file_patterns(entry, entry.name)
            data['is_skill_md'] = False
            files_data.append(data)

    # Resources (just sizes, for progressive disclosure assessment).
    # is_dir() rather than exists(), so a plain file named "resources" is
    # ignored instead of failing in iterdir().
    resources_dir = skill_path / 'resources'
    resource_sizes = {}
    if resources_dir.is_dir():
        for entry in sorted(resources_dir.iterdir()):
            if entry.is_file() and entry.suffix in ('.md', '.json', '.yaml', '.yml'):
                text = entry.read_text(encoding='utf-8')
                # Count real lines: a final newline terminates the last line
                # rather than starting a new one, and empty text has none.
                line_total = text.count('\n')
                if text and not text.endswith('\n'):
                    line_total += 1
                resource_sizes[entry.name] = {
                    'lines': line_total,
                    'tokens': len(text) // 4,
                }

    # Aggregate stats
    prompts = [item for item in files_data if not item.get('is_skill_md')]
    total_waste = sum(len(item['waste_patterns']) for item in files_data)
    total_backrefs = sum(len(item['back_references']) for item in files_data)
    total_tokens = sum(item['token_estimate'] for item in files_data)
    prompts_with_config = sum(1 for item in prompts if item['has_config_header'])
    prompts_with_progression = sum(1 for item in prompts if item['has_progression'])

    skill_md_data = next(
        (item for item in files_data if item.get('is_skill_md')), None
    )
    if skill_md_data:
        skill_md_summary = {
            'line_count': skill_md_data['line_count'],
            'token_estimate': skill_md_data['token_estimate'],
            'overview_lines': skill_md_data.get('overview_lines', 0),
            'table_count': skill_md_data['table_count'],
            'table_lines': skill_md_data['table_lines'],
            'fenced_block_count': skill_md_data['fenced_block_count'],
            'fenced_block_lines': skill_md_data['fenced_block_lines'],
            'section_count': len(skill_md_data['sections']),
        }
    else:
        skill_md_summary = {
            'line_count': 0,
            'token_estimate': 0,
            'overview_lines': 0,
            'table_count': 0,
            'table_lines': 0,
            'fenced_block_count': 0,
            'fenced_block_lines': 0,
            'section_count': 0,
        }

    return {
        'scanner': 'prompt-craft-prepass',
        'script': 'prepass-prompt-metrics.py',
        'version': '1.0.0',
        'skill_path': str(skill_path),
        'timestamp': datetime.now(timezone.utc).isoformat(),
        'status': 'info',
        'skill_md_summary': skill_md_summary,
        'prompt_health': {
            'total_prompts': len(prompts),
            'prompts_with_config_header': prompts_with_config,
            'prompts_with_progression': prompts_with_progression,
        },
        'aggregate': {
            'total_files_scanned': len(files_data),
            'total_token_estimate': total_tokens,
            'total_waste_patterns': total_waste,
            'total_back_references': total_backrefs,
        },
        'resource_sizes': resource_sizes,
        'files': files_data,
    }
|
|
|
|
|
|
def main() -> int:
    """Command-line entry point.

    Parses ``skill_path`` and the optional ``--output``/``-o`` file from the
    command line, scans the skill directory and emits the result as indented
    JSON, either to the output file (creating parent directories) or to
    stdout.

    Returns:
        0 on success, 2 when ``skill_path`` is not a directory.
    """
    cli = argparse.ArgumentParser(
        description='Extract prompt craft metrics for LLM scanner pre-pass',
    )
    cli.add_argument(
        'skill_path',
        type=Path,
        help='Path to the skill directory to scan',
    )
    cli.add_argument(
        '--output', '-o',
        type=Path,
        help='Write JSON output to file instead of stdout',
    )
    options = cli.parse_args()

    target = options.skill_path
    if not target.is_dir():
        print(f"Error: {target} is not a directory", file=sys.stderr)
        return 2

    payload = json.dumps(scan_prompt_metrics(target), indent=2)

    destination = options.output
    if not destination:
        print(payload)
        return 0

    destination.parent.mkdir(parents=True, exist_ok=True)
    destination.write_text(payload)
    # Status goes to stderr so stdout stays free of non-JSON text.
    print(f"Results written to {destination}", file=sys.stderr)
    return 0
|
|
|
|
|
|
# Script entry: exit the interpreter with main()'s return code.
if __name__ == '__main__':
    raise SystemExit(main())
|