From cf5bc83b7519d9eabc86141083ad4d6965120547 Mon Sep 17 00:00:00 2001 From: Shaun Arman Date: Sun, 31 May 2026 14:41:47 -0500 Subject: [PATCH] fix(ci): add post-generation evidence verification to pr-review MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit qwen3-coder-next fabricates plausible-looking code in its Evidence blocks instead of quoting from the actual files provided. This adds a Python verification step that greps each fenced code block against the real changed files and tags any finding whose evidence cannot be found as UNVERIFIED. This is a safeguard, not a fix — the model is fundamentally unreliable for grounded code review. The longer-term fix is to replace qwen3-coder with a model that stays grounded to context (Claude Haiku, devstral, or deepseek-coder-v2 via the LiteLLM proxy / vLLM at 172.0.1.42). --- .gitea/workflows/pr-review.yml | 63 ++++++++++++++++++++++++++++++++++ 1 file changed, 63 insertions(+) diff --git a/.gitea/workflows/pr-review.yml b/.gitea/workflows/pr-review.yml index 065600a8..b99e6f6b 100644 --- a/.gitea/workflows/pr-review.yml +++ b/.gitea/workflows/pr-review.yml @@ -144,6 +144,69 @@ jobs: echo "Review length: ${#REVIEW} chars" echo "$REVIEW" > /tmp/pr_review.txt + - name: Verify findings against codebase + if: steps.analyze.outcome == 'success' + shell: bash + run: | + set -euo pipefail + # For each finding that contains a fenced code block under "Evidence:", + # grep at least one substantial line of that block against the actual changed + # files. If nothing matches, prepend a visible UNVERIFIED tag so reviewers + # know the model fabricated the evidence. 
+          python3 - << 'PYEOF'
+          """Tag review findings whose quoted evidence is absent from the changed files.
+
+          Reads the model's review from /tmp/pr_review.txt and the list of changed
+          paths from /tmp/pr_files.txt, then rewrites the review in place. A finding
+          whose first fenced code block shares no significant line with any readable
+          changed file gets an UNVERIFIED note added to its severity tag.
+          """
+          import os
+          import re
+          import sys
+
+          REVIEW_PATH = '/tmp/pr_review.txt'
+          FILES_PATH = '/tmp/pr_files.txt'
+
+          # Lines shorter than this are too generic ("}", "return x") to prove anything.
+          MIN_LINE_LEN = 20
+
+          SEVERITY_RE = re.compile(r'\[(BLOCKER|WARNING|SUGGESTION)\]')
+          # Zero-width split point in front of every line that opens a finding.
+          FINDING_START_RE = re.compile(
+              r'(?=^\[(?:BLOCKER|WARNING|SUGGESTION)\])', re.MULTILINE
+          )
+          FENCE_RE = re.compile(r'```[^\n]*\n(.*?)```', re.DOTALL)
+
+          # Unified-diff file/hunk headers, recognised on the line before +/- removal.
+          DIFF_HEADERS = ('--- ', '+++ ', '@@ ')
+          # Comment and label prefixes that carry no code to verify.
+          NOISE_PREFIXES = ('//', '#', '/*', '*', 'Fix:', 'Evidence:', '---', '+++')
+
+
+          def load_changed_files(paths):
+              """Return the newline-joined text of every changed file that can be read.
+
+              Paths that are not regular files are ignored. Undecodable bytes are
+              replaced rather than raising, and files that fail with OSError are
+              reported on stderr and skipped.
+              """
+              chunks = []
+              for path in paths:
+                  if not os.path.isfile(path):
+                      continue
+                  try:
+                      with open(path, encoding='utf-8', errors='replace') as fh:
+                          chunks.append(fh.read())
+                  except OSError as exc:
+                      print(f"warning: skipping unreadable file {path}: {exc}",
+                            file=sys.stderr)
+              return '\n'.join(chunks)
+
+
+          def evidence_exists(block, corpus):
+              """Return True if >=1 significant line of block occurs verbatim in corpus.
+
+              block is the body of a fenced code block; corpus is the text returned by
+              load_changed_files(). Leading diff markers (+/-) and surrounding
+              whitespace are removed before the substring test.
+              """
+              for raw in block.splitlines():
+                  # Reject diff headers *before* stripping markers: once the dashes
+                  # are gone, '--- a/x.py' is just 'a/x.py' and would be searched
+                  # for as if it were code.
+                  if raw.strip().startswith(DIFF_HEADERS):
+                      continue
+                  line = raw.lstrip('+-').strip()
+                  if len(line) < MIN_LINE_LEN:
+                      continue
+                  if line.startswith(NOISE_PREFIXES):
+                      continue
+                  if line in corpus:
+                      return True
+              return False
+
+
+          def tag_if_unverified(finding_text, corpus):
+              """Return finding_text, annotated if its first code block is unmatched.
+
+              Findings without a fenced code block are returned unchanged. Otherwise
+              the first severity tag, e.g. [BLOCKER], is rewritten so the UNVERIFIED
+              note sits inside the bracket, after the severity word.
+              """
+              fenced = FENCE_RE.search(finding_text)
+              if fenced is None or evidence_exists(fenced.group(1), corpus):
+                  return finding_text
+              return SEVERITY_RE.sub(
+                  lambda m: f'[{m.group(1)} — ⚠️ UNVERIFIED: evidence not found in PR files]',
+                  finding_text, count=1
+              )
+
+
+          def main():
+              """Verify every finding in the review and rewrite the review file."""
+              with open(REVIEW_PATH, encoding='utf-8') as fh:
+                  review = fh.read()
+              with open(FILES_PATH, encoding='utf-8') as fh:
+                  paths = [entry.strip() for entry in fh if entry.strip()]
+
+              corpus = load_changed_files(paths)
+              if not corpus.strip():
+                  # With nothing to compare against, every finding would be tagged;
+                  # that says nothing about the model, so leave the review untouched.
+                  print("Verification skipped — no readable changed files to compare against.")
+                  return
+
+              # First element is the preamble (Summary, etc.); each remaining element
+              # starts at a severity marker and runs up to the next one.
+              preamble, *findings = FINDING_START_RE.split(review)
+              verified = ''.join(tag_if_unverified(item, corpus) for item in findings)
+
+              with open(REVIEW_PATH, 'w', encoding='utf-8') as fh:
+                  fh.write(preamble + verified)
+              print(f"Verification complete — {len(findings)} finding(s) checked.")
+
+
+          if __name__ == '__main__':
+              main()
+          PYEOF
+
       - name: Post review comment
         if: always() && steps.context.outputs.diff_size != '0'
         shell: bash