"""Repository scanner for the human-centered platform.

The scanner is deliberately conservative. It extracts local evidence without
executing project code, without reading secrets, and without depending on a
particular package manager. The goal is not static analysis perfection; the
goal is repeatable operational context for human reports.
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
import os
|
|
import re
|
|
import subprocess
|
|
from dataclasses import dataclass
|
|
from pathlib import Path
|
|
from typing import Iterable, Iterator, Sequence
|
|
|
|
from .catalog import CATEGORY_KEYWORDS, PLATFORMS, categories_for_text
|
|
from .models import Evidence, EvidenceKind, FileMetric, PlatformDefinition, PlatformScan, ScriptCommand
|
|
|
|
|
|
# Directory names that are never descended into: VCS metadata, dependency
# trees, build outputs, and tool caches.
SKIP_DIRS = {
    ".git",
    ".test-tmp",
    ".hg",
    ".svn",
    "node_modules",
    "dist",
    "build",
    "coverage",
    ".next",
    ".nuxt",
    ".wrangler",
    ".turbo",
    ".cache",
    "vendor",
    "__pycache__",
    ".pytest_cache",
    ".mypy_cache",
}

# File extensions treated as readable text for metrics and evidence scans.
TEXT_EXTENSIONS = {
    ".ts",
    ".tsx",
    ".js",
    ".mjs",
    ".cjs",
    ".py",
    ".java",
    ".json",
    ".md",
    ".mdx",
    ".yml",
    ".yaml",
    ".toml",
    ".txt",
    ".sql",
    ".html",
    ".css",
    ".scss",
    ".xml",
}

# Subset of TEXT_EXTENSIONS considered program source code.
CODE_EXTENSIONS = {".ts", ".tsx", ".js", ".mjs", ".cjs", ".py", ".java"}

# Regexes recognizing HTTP route declarations and fetch calls in code.
# The route path is always captured by the pattern's last capture group.
ROUTE_PATTERNS = (
    re.compile(r"\bapp\.(get|post|put|patch|delete)\s*\(\s*['\"]([^'\"]+)", re.I),
    re.compile(r"\brouter\.(get|post|put|patch|delete)\s*\(\s*['\"]([^'\"]+)", re.I),
    re.compile(r"\bnew\s+URLPattern\s*\(\s*['\"]([^'\"]+)", re.I),
    re.compile(r"\b(path|route)\s*:\s*['\"](/[^'\"]+)['\"]", re.I),
    re.compile(r"\bfetch\s*\(\s*['\"](https?://[^'\"]+|/[^'\"]+)['\"]", re.I),
)

# Path fragments marking a file as potentially secret-bearing; such files
# are never read and never surfaced as evidence.
SENSITIVE_FILE_PARTS = (
    ".env",
    "secret",
    "secrets",
    "private",
    "key.pem",
    "id_rsa",
    "credential",
    "credentials",
)
|
|
|
|
|
|
@dataclass(slots=True)
class ScanOptions:
    """Options for local scan depth and safety."""

    # Files larger than this many bytes are not read or line-counted.
    max_file_bytes: int = 420_000
    # Maximum number of README characters retained in the scan result.
    max_readme_chars: int = 4_000
    # Cap on stored evidence items per evidence kind.
    max_evidence_per_kind: int = 40
    # Whether .md/.mdx files contribute to file metrics.
    include_markdown_metrics: bool = True
    # Whether .json files contribute to file metrics.
    include_json_metrics: bool = True
|
|
|
|
|
|
def is_sensitive_path(path: Path) -> bool:
    """Return True when *path* contains any fragment that suggests secrets."""
    haystack = str(path).lower()
    for fragment in SENSITIVE_FILE_PARTS:
        if fragment in haystack:
            return True
    return False
|
|
|
|
|
|
def should_skip_dir(path: Path) -> bool:
    """Tell whether *path* names a directory excluded from scanning."""
    return any(path.name == skipped for skipped in SKIP_DIRS)
|
|
|
|
|
|
def is_probably_text(path: Path) -> bool:
    """Heuristically decide whether *path* is a readable text file."""
    if path.suffix.lower() in TEXT_EXTENSIONS:
        return True
    # Well-known manifests are text even without a listed extension.
    return path.name.lower() in {"package.json", "wrangler.toml"}
|
|
|
|
|
|
def safe_relative(path: Path, base: Path) -> str:
    """Render *path* relative to *base* with forward slashes.

    Falls back to the original path when it is not under *base*.
    """
    try:
        rendered = path.relative_to(base)
    except ValueError:
        # Path lives outside base; keep it as-is rather than fail.
        rendered = path
    return str(rendered).replace("\\", "/")
|
|
|
|
|
|
def iter_files(root: Path) -> Iterator[Path]:
    """Walk *root* depth-first, yielding files and skipping excluded dirs."""
    if not root.exists():
        return
    pending = [root]
    while pending:
        folder = pending.pop()
        try:
            children = sorted(folder.iterdir(), key=lambda child: child.name.lower())
        except OSError:
            # Unreadable directory: scanning stays best-effort, move on.
            continue
        for child in children:
            if child.is_dir():
                if not should_skip_dir(child):
                    pending.append(child)
            elif child.is_file():
                yield child
|
|
|
|
|
|
def count_lines(path: Path, max_bytes: int) -> int:
    """Count text lines in *path*; oversized or unreadable files count as 0."""
    try:
        if path.stat().st_size > max_bytes:
            return 0
        with path.open("r", encoding="utf-8", errors="ignore") as handle:
            total = 0
            for _ in handle:
                total += 1
            return total
    except OSError:
        return 0
|
|
|
|
|
|
def read_text_limited(path: Path, max_bytes: int) -> str:
    """Read *path* as UTF-8 text; return "" when too large or unreadable."""
    try:
        too_big = path.stat().st_size > max_bytes
    except OSError:
        return ""
    if too_big:
        return ""
    try:
        return path.read_text(encoding="utf-8", errors="ignore")
    except OSError:
        return ""
|
|
|
|
|
|
def run_git(repo: Path, *args: str) -> str | None:
    """Run a read-only git command inside *repo*.

    Returns stripped stdout, or None on any failure (missing git, bad
    cwd, timeout, or non-zero exit) — callers treat None as "unknown".
    """
    try:
        result = subprocess.run(
            ["git", *args],
            cwd=str(repo),
            text=True,
            stdout=subprocess.PIPE,
            stderr=subprocess.DEVNULL,
            timeout=12,
            check=False,
        )
    except (OSError, subprocess.SubprocessError):
        return None
    if result.returncode != 0:
        return None
    output = result.stdout.strip()
    return output if output else None
|
|
|
|
|
|
def detect_git(repo: Path) -> tuple[bool, str | None, str | None, str | None]:
    """Return (git_present, branch, head_commit, origin_url) for *repo*."""
    if not (repo / ".git").exists():
        return False, None, None, None
    return (
        True,
        run_git(repo, "rev-parse", "--abbrev-ref", "HEAD"),
        run_git(repo, "rev-parse", "HEAD"),
        run_git(repo, "remote", "get-url", "origin"),
    )
|
|
|
|
|
|
def script_intent(name: str, command: str) -> str:
    """Classify a package script into a coarse operational intent.

    Checks run in priority order; the first matching bucket wins, and
    anything unrecognized falls back to "operation".
    """
    haystack = f"{name} {command}".lower()

    def mentions(*tokens: str) -> bool:
        return any(token in haystack for token in tokens)

    if mentions("test", "vitest", "pytest", "jest", "node --test"):
        return "test"
    if mentions("build", "tsc", "vite build", "rollup", "webpack"):
        return "build"
    if mentions("smoke", "health", "readiness"):
        return "validation"
    if mentions("deploy", "wrangler deploy", "pages deploy"):
        return "deploy"
    if mentions("generate", "contract", "schema"):
        return "generation"
    if mentions("lint", "format", "eslint", "prettier"):
        return "quality"
    return "operation"
|
|
|
|
|
|
def load_package_scripts(repo: Path) -> tuple[ScriptCommand, ...]:
    """Load package.json scripts from *repo* as ScriptCommand records.

    Returns an empty tuple when the manifest is missing, sensitive,
    unreadable, invalid JSON, or not shaped like an npm manifest.
    """
    package_path = repo / "package.json"
    if not package_path.exists() or is_sensitive_path(package_path):
        return ()
    try:
        data = json.loads(package_path.read_text(encoding="utf-8", errors="ignore"))
    except (OSError, json.JSONDecodeError):
        return ()
    # Bug fix: a manifest that is valid JSON but whose top level is not an
    # object (e.g. a list) used to raise AttributeError on .get().
    if not isinstance(data, dict):
        return ()
    scripts = data.get("scripts", {})
    if not isinstance(scripts, dict):
        return ()
    output: list[ScriptCommand] = []
    for name, command in sorted(scripts.items()):
        # Non-string script values (objects, numbers) are silently skipped.
        if isinstance(command, str):
            output.append(
                ScriptCommand(
                    name=name,
                    command=command,
                    source_file="package.json",
                    intent=script_intent(name, command),
                )
            )
    return tuple(output)
|
|
|
|
|
|
def read_readme(repo: Path, max_chars: int) -> str:
    """Return a trimmed excerpt from the first README-like file found."""
    for candidate in (
        repo / "README.md",
        repo / "readme.md",
        repo / "README.txt",
        repo / "docs" / "README.md",
    ):
        if not candidate.exists() or is_sensitive_path(candidate):
            continue
        # Read generously (8x the cap) then trim, so the byte limit does
        # not reject a README whose excerpt would still fit.
        excerpt = read_text_limited(candidate, max_chars * 8)
        return excerpt[:max_chars].strip()
    return ""
|
|
|
|
|
|
def metric_for_file(path: Path, base: Path, options: ScanOptions) -> FileMetric | None:
    """Build a FileMetric for *path*, or None when the file is out of scope."""
    suffix = path.suffix.lower()
    if suffix not in TEXT_EXTENSIONS and path.name.lower() not in {"package.json", "wrangler.toml"}:
        return None
    if not options.include_markdown_metrics and suffix in {".md", ".mdx"}:
        return None
    if not options.include_json_metrics and suffix == ".json":
        return None
    if is_sensitive_path(path):
        return None
    try:
        size = path.stat().st_size
    except OSError:
        return None
    return FileMetric(
        path=safe_relative(path, base),
        # Extension-less files fall back to the lowercase file name.
        extension=suffix or path.name.lower(),
        lines=count_lines(path, options.max_file_bytes),
        bytes_size=size,
    )
|
|
|
|
|
|
def add_limited(bucket: dict[EvidenceKind, list[Evidence]], evidence: Evidence, limit: int) -> None:
    """Append *evidence* under its kind unless the per-kind cap is reached."""
    existing = bucket.setdefault(evidence.kind, [])
    if len(existing) >= limit:
        return
    existing.append(evidence)
|
|
|
|
|
|
def evidence_from_filename(path: Path, base: Path) -> Evidence | None:
    """Classify a file purely by its name/path, returning one Evidence or None.

    Rules are evaluated in priority order; the first match wins.
    """
    relative = safe_relative(path, base)
    lowered = relative.lower()
    name = path.name.lower()
    # Sensitive files are never surfaced, not even as a hint.
    if is_sensitive_path(path):
        return None
    rules = (
        (
            name.startswith("readme"),
            Evidence(EvidenceKind.README, relative, "Documentacao inicial encontrada.", confidence=0.75, tags=("docs",)),
        ),
        (
            "openapi" in lowered or "swagger" in lowered,
            Evidence(EvidenceKind.OPENAPI, relative, "Arquivo com indicio de contrato OpenAPI.", confidence=0.8),
        ),
        (
            "test" in lowered or "spec" in lowered,
            Evidence(EvidenceKind.TEST, relative, "Arquivo de teste ou especificacao encontrado.", confidence=0.72),
        ),
        (
            "wrangler" in name or name in {"package.json", "pyproject.toml", "tsconfig.json"},
            Evidence(EvidenceKind.CONFIG, relative, "Configuracao operacional encontrada.", confidence=0.65),
        ),
        (
            "worker" in lowered or "cloudflare" in lowered,
            Evidence(EvidenceKind.WORKER, relative, "Indicador de Worker ou Cloudflare encontrado.", confidence=0.6),
        ),
        (
            "screen" in lowered or "view" in lowered or "ui" in lowered,
            Evidence(EvidenceKind.UI_SURFACE, relative, "Possivel superficie visual encontrada.", confidence=0.55),
        ),
        (
            "mcp" in lowered or "tool" in lowered,
            Evidence(EvidenceKind.MCP_TOOL, relative, "Possivel tool ou superficie MCP encontrada.", confidence=0.55),
        ),
    )
    for matched, hit in rules:
        if matched:
            return hit
    return None
|
|
|
|
|
|
def evidence_from_text(path: Path, base: Path, text: str, limit: int) -> tuple[Evidence, ...]:
    """Extract keyword and route evidence from file *text*.

    Records the first line mentioning each known keyword (capped at 100
    distinct keys per file), then route/fetch matches from ROUTE_PATTERNS.
    At most *limit* Evidence items are returned overall.
    """
    relative = safe_relative(path, base)
    output: list[Evidence] = []
    # Hoisted out of the per-line loop: the original rebuilt this tuple
    # for every line of every scanned file. (Also removed the unused
    # `lowered = text.lower()` local.)
    keywords = (
        "health",
        "readiness",
        "openapi",
        "audit",
        "trace",
        "rbac",
        "byok",
        "credentialref",
        "panelready",
        "samesource",
        "entitlement",
        "invoice",
        "incident",
        "support",
        "screen",
        "mcp",
    )
    line_index: dict[str, int] = {}
    for index, line in enumerate(text.splitlines(), start=1):
        if len(line_index) > 100:
            break
        normalized = line.lower()
        for key in keywords:
            if key in normalized and key not in line_index:
                line_index[key] = index
    for key, line in line_index.items():
        if len(output) >= limit:
            break
        output.append(
            Evidence(
                kind=kind_for_keyword(key),
                path=relative,
                line=line,
                summary=f"Texto menciona '{key}', sinalizando capacidade humana ou operacional.",
                confidence=confidence_for_keyword(key),
                tags=tuple(category.value for category in categories_for_text(key)),
            )
        )
    for pattern in ROUTE_PATTERNS:
        for match in pattern.finditer(text):
            if len(output) >= limit:
                break
            # Route patterns always capture the path in the last group.
            route = match.group(match.lastindex or 1)
            if not route:
                continue
            output.append(
                Evidence(
                    kind=EvidenceKind.ROUTE,
                    path=relative,
                    summary=f"Rota ou chamada HTTP detectada: {route}",
                    confidence=0.66,
                    tags=("route",),
                )
            )
        if len(output) >= limit:
            break
    return tuple(output)
|
|
|
|
|
|
def kind_for_keyword(keyword: str) -> EvidenceKind:
    """Map an evidence keyword to the EvidenceKind it signals."""
    mapping = {
        "openapi": EvidenceKind.OPENAPI,
        "audit": EvidenceKind.OBSERVABILITY,
        "trace": EvidenceKind.OBSERVABILITY,
        "health": EvidenceKind.OBSERVABILITY,
        "readiness": EvidenceKind.OBSERVABILITY,
        "rbac": EvidenceKind.SECURITY,
        "credentialref": EvidenceKind.SECURITY,
        "byok": EvidenceKind.SECURITY,
        "panelready": EvidenceKind.UI_SURFACE,
        "samesource": EvidenceKind.UI_SURFACE,
        "screen": EvidenceKind.UI_SURFACE,
        "mcp": EvidenceKind.MCP_TOOL,
        "entitlement": EvidenceKind.BUSINESS_RULE,
        "invoice": EvidenceKind.BUSINESS_RULE,
        # "incident"/"support" are intentionally UNKNOWN, same as the default.
        "incident": EvidenceKind.UNKNOWN,
        "support": EvidenceKind.UNKNOWN,
    }
    return mapping.get(keyword.lower(), EvidenceKind.UNKNOWN)
|
|
|
|
|
|
def confidence_for_keyword(keyword: str) -> float:
    """Confidence score for a keyword hit: strong, medium, or baseline."""
    lowered = keyword.lower()
    if lowered in {"openapi", "panelready", "samesource", "credentialref", "rbac", "byok"}:
        return 0.78
    if lowered in {"health", "readiness", "audit", "trace", "entitlement", "invoice"}:
        return 0.68
    return 0.55
|
|
|
|
|
|
def classify_warnings(scan: PlatformScan) -> tuple[str, ...]:
    """Combine recorded warnings with derived gaps, deduplicated in order."""
    warnings: list[str] = list(scan.warnings)
    if not scan.exists:
        warnings.append("repositorio real nao encontrado")
    else:
        # Derived checks only make sense for repositories that exist.
        if not scan.git_present:
            warnings.append("repositorio real existe sem .git")
        if not scan.readme_excerpt:
            warnings.append("README tecnico nao encontrado")
        if scan.code_lines == 0:
            warnings.append("nenhuma linha de codigo TS/JS/Python/Java encontrada")
        if not scan.has_tests:
            warnings.append("testes nao encontrados por varredura local")
        if not scan.has_openapi:
            warnings.append("contrato OpenAPI nao encontrado por varredura local")
    # dict.fromkeys keeps first-seen order while dropping duplicates.
    return tuple(dict.fromkeys(warnings))
|
|
|
|
|
|
def scan_platform(root: Path, platform: PlatformDefinition, options: ScanOptions | None = None) -> PlatformScan:
    """Scan one platform repository under *root* and return a PlatformScan.

    The scan is read-only: it gathers git metadata, a README excerpt,
    file metrics, package scripts, and keyword/route evidence, then
    derives warnings from the completed scan.
    """
    options = options or ScanOptions()
    repo = root / platform.repo_name
    exists = repo.exists()
    # Everything below degrades to empty results when the repo is missing.
    git_present, branch, head, remote = detect_git(repo) if exists else (False, None, None, None)
    readme = read_readme(repo, options.max_readme_chars) if exists else ""
    metrics: list[FileMetric] = []
    evidence_bucket: dict[EvidenceKind, list[Evidence]] = {}
    warnings: list[str] = []
    scripts = load_package_scripts(repo) if exists else ()
    # Each package script contributes one PACKAGE_SCRIPT evidence item.
    for script in scripts:
        add_limited(
            evidence_bucket,
            Evidence(
                EvidenceKind.PACKAGE_SCRIPT,
                script.source_file,
                f"Script '{script.name}' com intencao '{script.intent}'.",
                confidence=0.62,
                tags=(script.intent,),
            ),
            options.max_evidence_per_kind,
        )
    if exists:
        for file_path in iter_files(repo):
            metric = metric_for_file(file_path, repo, options)
            if metric is not None:
                metrics.append(metric)
            filename_evidence = evidence_from_filename(file_path, repo)
            if filename_evidence is not None:
                add_limited(evidence_bucket, filename_evidence, options.max_evidence_per_kind)
            # Only code plus markdown/json files are opened for text evidence,
            # and sensitive paths are never read.
            if file_path.suffix.lower() in CODE_EXTENSIONS | {".md", ".json"} and not is_sensitive_path(file_path):
                text = read_text_limited(file_path, options.max_file_bytes)
                if text:
                    for item in evidence_from_text(file_path, repo, text, limit=6):
                        add_limited(evidence_bucket, item, options.max_evidence_per_kind)
    # Flatten the per-kind buckets into one deterministic, kind-sorted list.
    evidence: list[Evidence] = []
    for kind in sorted(evidence_bucket, key=lambda item: item.value):
        evidence.extend(evidence_bucket[kind])
    # First pass: assemble the scan so classify_warnings can inspect it.
    scan = PlatformScan(
        platform=platform,
        repo_path=str(repo),
        exists=exists,
        git_present=git_present,
        branch=branch,
        head=head,
        remote_origin=remote,
        readme_excerpt=readme,
        file_metrics=tuple(metrics),
        scripts=scripts,
        evidence=tuple(evidence),
        warnings=tuple(warnings),
    )
    # Second pass: rebuild the scan with derived warnings attached.
    # NOTE(review): this copies every field by hand; if PlatformScan is a
    # dataclass, dataclasses.replace(scan, warnings=...) would be simpler —
    # confirm against .models before changing.
    return PlatformScan(
        platform=scan.platform,
        repo_path=scan.repo_path,
        exists=scan.exists,
        git_present=scan.git_present,
        branch=scan.branch,
        head=scan.head,
        remote_origin=scan.remote_origin,
        readme_excerpt=scan.readme_excerpt,
        file_metrics=scan.file_metrics,
        scripts=scan.scripts,
        evidence=scan.evidence,
        warnings=classify_warnings(scan),
        scanned_at=scan.scanned_at,
    )
|
|
|
|
|
|
def scan_ecosystem(root: Path, platforms: Sequence[PlatformDefinition] = PLATFORMS) -> tuple[PlatformScan, ...]:
    """Scan every platform in *platforms* under *root*, in catalog order."""
    results: list[PlatformScan] = []
    for platform in platforms:
        results.append(scan_platform(root, platform))
    return tuple(results)
|
|
|
|
|
|
def summarize_extensions(metrics: Iterable[FileMetric]) -> dict[str, int]:
    """Total line counts per extension, sorted by volume then name."""
    totals: dict[str, int] = {}
    for metric in metrics:
        totals[metric.extension] = totals.get(metric.extension, 0) + metric.lines
    ordered = sorted(totals.items(), key=lambda pair: (-pair[1], pair[0]))
    return dict(ordered)
|
|
|
|
|
|
def detect_human_keywords(scan: PlatformScan) -> dict[str, int]:
    """Count evidence items per human category, sorted by count then name."""
    counts: dict[str, int] = {}
    for item in scan.evidence:
        haystack = f"{item.summary} {' '.join(item.tags)}".lower()
        for category, keywords in CATEGORY_KEYWORDS.items():
            matched = any(word.lower() in haystack for word in keywords)
            if matched:
                counts[category.value] = counts.get(category.value, 0) + 1
    return dict(sorted(counts.items(), key=lambda pair: (-pair[1], pair[0])))
|
|
|
|
|
|
def list_candidate_roots(root: Path) -> tuple[str, ...]:
    """Names of direct child dirs with the platform prefix, sorted by name."""
    if not root.exists():
        return ()
    entries = sorted(root.iterdir(), key=lambda item: item.name.lower())
    return tuple(
        entry.name
        for entry in entries
        if entry.is_dir() and entry.name.startswith("tudo-para-ia-")
    )
|
|
|
|
|
|
def environment_summary(root: Path) -> dict[str, object]:
    """Describe the scan environment for diagnostics output."""
    summary: dict[str, object] = {}
    summary["root"] = str(root)
    summary["root_exists"] = root.exists()
    summary["candidate_repositories"] = list_candidate_roots(root)
    summary["platform_catalog_size"] = len(PLATFORMS)
    summary["skip_dirs"] = sorted(SKIP_DIRS)
    return summary
|