From cf5bc83b7519d9eabc86141083ad4d6965120547 Mon Sep 17 00:00:00 2001
From: Shaun Arman <shaun.arman@motorolasolutions.com>
Date: Sun, 31 May 2026 14:41:47 -0500
Subject: [PATCH] fix(ci): add post-generation evidence verification to
 pr-review
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

qwen3-coder-next fabricates plausible-looking code in its Evidence
blocks instead of quoting from the actual files provided. This adds a
Python verification step that searches the changed files for at least
one substantial line from each finding's first fenced code block and
tags any finding whose evidence cannot be found as UNVERIFIED.

This is a safeguard, not a fix — the model is fundamentally unreliable
for grounded code review. The longer-term fix is to replace
qwen3-coder-next with a model that stays grounded to context (Claude
Haiku, devstral, or deepseek-coder-v2 via the LiteLLM proxy / vLLM at
172.0.1.42).
---
 .gitea/workflows/pr-review.yml | 63 ++++++++++++++++++++++++++++++++++
 1 file changed, 63 insertions(+)

diff --git a/.gitea/workflows/pr-review.yml b/.gitea/workflows/pr-review.yml
index 065600a8..b99e6f6b 100644
--- a/.gitea/workflows/pr-review.yml
+++ b/.gitea/workflows/pr-review.yml
@@ -144,6 +144,69 @@ jobs:
           echo "Review length: ${#REVIEW} chars"
           echo "$REVIEW" > /tmp/pr_review.txt
 
+      - name: Verify findings against codebase
+        if: steps.analyze.outcome == 'success'
+        shell: bash
+        run: |
+          set -euo pipefail
+          # For each finding, take its first fenced code block and look for at least
+          # one substantial line of it, verbatim, in the changed files that exist on
+          # disk. If such lines exist but none is found, tag the finding UNVERIFIED
+          # so reviewers treat the quoted evidence with suspicion.
+          python3 - << 'PYEOF'
+          import re
+
+          def read_text(path):
+              """Return path's text decoded as UTF-8 (bad bytes replaced), or '' if unreadable."""
+              try:
+                  # Explicit encoding: the runner's locale must not decide which files load.
+                  with open(path, encoding='utf-8', errors='replace') as fh:
+                      return fh.read()
+              except (OSError, ValueError) as exc:  # e.g. a listed path that is missing or a directory
+                  print(f"Skipping {path}: {exc}")
+                  return ''
+
+          with open('/tmp/pr_review.txt', encoding='utf-8') as fh:
+              review = fh.read()
+          with open('/tmp/pr_files.txt', encoding='utf-8') as fh:
+              filelist = [f.strip() for f in fh if f.strip()]
+          all_content = '\n'.join(read_text(path) for path in filelist)
+
+          def evidence_exists(block: str) -> bool:
+              """False only when the block has checkable lines and none occurs in all_content."""
+              checked = False
+              for raw in block.splitlines():
+                  line = raw.lstrip('+-').strip()   # drop leading diff markers, then whitespace
+                  # Skip blank, very short, pure-comment, label, or diff-header lines
+                  if len(line) < 20 or line.startswith(('//','#','/*','*','Fix:','Evidence:','---','+++')):
+                      continue
+                  if line in all_content:
+                      return True
+                  checked = True
+              return not checked   # nothing checkable was searched, so do not claim "not found"
+
+          severity_re = re.compile(r'\[(BLOCKER|WARNING|SUGGESTION)\]')
+
+          def tag_if_unverified(finding_text: str) -> str:
+              """Rewrite the first severity tag if the finding's first code block fails the check."""
+              code_match = re.search(r'```[^\n]*\n(.*?)```', finding_text, re.DOTALL)
+              if code_match and not evidence_exists(code_match.group(1)):
+                  return severity_re.sub(
+                      lambda m: f'[{m.group(1)} — ⚠️ UNVERIFIED: evidence not found in PR files]',
+                      finding_text, count=1
+                  )
+              return finding_text
+
+          # Zero-width split: parts[0] is the preamble (Summary, etc.); every later
+          # part begins on a line that starts with a severity marker.
+          parts = re.split(r'(?=^\[(?:BLOCKER|WARNING|SUGGESTION)\])', review, flags=re.MULTILINE)
+          result = parts[0] + ''.join(tag_if_unverified(block) for block in parts[1:])
+
+          with open('/tmp/pr_review.txt', 'w', encoding='utf-8') as fh:
+              fh.write(result)
+          print(f"Verification complete — {len(parts)-1} finding(s) checked.")
+          PYEOF
+
       - name: Post review comment
         if: always() && steps.context.outputs.diff_size != '0'
         shell: bash