tftsr-devops_investigation/node_modules/parse5/dist/tokenizer/preprocessor.js

197 lines
6.9 KiB
JavaScript
Raw Normal View History

feat: initial implementation of TFTSR IT Triage & RCA application Implements Phases 1-8 of the TFTSR implementation plan. Rust backend (Tauri 2.x, src-tauri/): - Multi-provider AI: OpenAI-compatible, Anthropic, Gemini, Mistral, Ollama - PII detection engine: 11 regex patterns with overlap resolution - SQLCipher AES-256 encrypted database with 10 versioned migrations - 28 Tauri IPC commands for triage, analysis, document, and system ops - Ollama: hardware probe, model recommendations, pull/delete with events - RCA and blameless post-mortem Markdown document generators - PDF export via printpdf - Audit log: SHA-256 hash of every external data send - Integration stubs for Confluence, ServiceNow, Azure DevOps (v0.2) Frontend (React 18 + TypeScript + Vite, src/): - 9 pages: full triage workflow NewIssue→LogUpload→Triage→Resolution→RCA→Postmortem→History+Settings - 7 components: ChatWindow, TriageProgress, PiiDiffViewer, DocEditor, HardwareReport, ModelSelector, UI primitives - 3 Zustand stores: session, settings (persisted), history - Type-safe tauriCommands.ts matching Rust backend types exactly - 8 IT domain system prompts (Linux, Windows, Network, K8s, DB, Virt, HW, Obs) DevOps: - .woodpecker/test.yml: rustfmt, clippy, cargo test, tsc, vitest on every push - .woodpecker/release.yml: linux/amd64 + linux/arm64 builds, Gogs release upload Verified: - cargo check: zero errors - tsc --noEmit: zero errors - vitest run: 13/13 unit tests passing Co-Authored-By: Claude Sonnet 4.6 (1M context) <noreply@anthropic.com>
2026-03-15 03:36:25 +00:00
import { CODE_POINTS as $, getSurrogatePairCodePoint, isControlCodePoint, isSurrogate, isSurrogatePair, isUndefinedCodePoint, } from '../common/unicode.js';
import { ERR } from '../common/error-codes.js';
//Const
const DEFAULT_BUFFER_WATERLINE = 1 << 16;
//Preprocessor
//NOTE: HTML input preprocessing
//(see: http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#preprocessing-the-input-stream)
export class Preprocessor {
constructor(handler) {
this.handler = handler;
this.html = '';
this.pos = -1;
// NOTE: Initial `lastGapPos` is -2, to ensure `col` on initialisation is 0
this.lastGapPos = -2;
this.gapStack = [];
this.skipNextNewLine = false;
this.lastChunkWritten = false;
this.endOfChunkHit = false;
this.bufferWaterline = DEFAULT_BUFFER_WATERLINE;
this.isEol = false;
this.lineStartPos = 0;
this.droppedBufferSize = 0;
this.line = 1;
//NOTE: avoid reporting errors twice on advance/retreat
this.lastErrOffset = -1;
}
/** The column on the current line. If we just saw a gap (eg. a surrogate pair), return the index before. */
get col() {
return this.pos - this.lineStartPos + Number(this.lastGapPos !== this.pos);
}
get offset() {
return this.droppedBufferSize + this.pos;
}
getError(code, cpOffset) {
const { line, col, offset } = this;
const startCol = col + cpOffset;
const startOffset = offset + cpOffset;
return {
code,
startLine: line,
endLine: line,
startCol,
endCol: startCol,
startOffset,
endOffset: startOffset,
};
}
_err(code) {
if (this.handler.onParseError && this.lastErrOffset !== this.offset) {
this.lastErrOffset = this.offset;
this.handler.onParseError(this.getError(code, 0));
}
}
_addGap() {
this.gapStack.push(this.lastGapPos);
this.lastGapPos = this.pos;
}
_processSurrogate(cp) {
//NOTE: try to peek a surrogate pair
if (this.pos !== this.html.length - 1) {
const nextCp = this.html.charCodeAt(this.pos + 1);
if (isSurrogatePair(nextCp)) {
//NOTE: we have a surrogate pair. Peek pair character and recalculate code point.
this.pos++;
//NOTE: add a gap that should be avoided during retreat
this._addGap();
return getSurrogatePairCodePoint(cp, nextCp);
}
}
//NOTE: we are at the end of a chunk, therefore we can't infer the surrogate pair yet.
else if (!this.lastChunkWritten) {
this.endOfChunkHit = true;
return $.EOF;
}
//NOTE: isolated surrogate
this._err(ERR.surrogateInInputStream);
return cp;
}
willDropParsedChunk() {
return this.pos > this.bufferWaterline;
}
dropParsedChunk() {
if (this.willDropParsedChunk()) {
this.html = this.html.substring(this.pos);
this.lineStartPos -= this.pos;
this.droppedBufferSize += this.pos;
this.pos = 0;
this.lastGapPos = -2;
this.gapStack.length = 0;
}
}
write(chunk, isLastChunk) {
if (this.html.length > 0) {
this.html += chunk;
}
else {
this.html = chunk;
}
this.endOfChunkHit = false;
this.lastChunkWritten = isLastChunk;
}
insertHtmlAtCurrentPos(chunk) {
this.html = this.html.substring(0, this.pos + 1) + chunk + this.html.substring(this.pos + 1);
this.endOfChunkHit = false;
}
startsWith(pattern, caseSensitive) {
// Check if our buffer has enough characters
if (this.pos + pattern.length > this.html.length) {
this.endOfChunkHit = !this.lastChunkWritten;
return false;
}
if (caseSensitive) {
return this.html.startsWith(pattern, this.pos);
}
for (let i = 0; i < pattern.length; i++) {
const cp = this.html.charCodeAt(this.pos + i) | 0x20;
if (cp !== pattern.charCodeAt(i)) {
return false;
}
}
return true;
}
peek(offset) {
const pos = this.pos + offset;
if (pos >= this.html.length) {
this.endOfChunkHit = !this.lastChunkWritten;
return $.EOF;
}
const code = this.html.charCodeAt(pos);
return code === $.CARRIAGE_RETURN ? $.LINE_FEED : code;
}
advance() {
this.pos++;
//NOTE: LF should be in the last column of the line
if (this.isEol) {
this.isEol = false;
this.line++;
this.lineStartPos = this.pos;
}
if (this.pos >= this.html.length) {
this.endOfChunkHit = !this.lastChunkWritten;
return $.EOF;
}
let cp = this.html.charCodeAt(this.pos);
//NOTE: all U+000D CARRIAGE RETURN (CR) characters must be converted to U+000A LINE FEED (LF) characters
if (cp === $.CARRIAGE_RETURN) {
this.isEol = true;
this.skipNextNewLine = true;
return $.LINE_FEED;
}
//NOTE: any U+000A LINE FEED (LF) characters that immediately follow a U+000D CARRIAGE RETURN (CR) character
//must be ignored.
if (cp === $.LINE_FEED) {
this.isEol = true;
if (this.skipNextNewLine) {
// `line` will be bumped again in the recursive call.
this.line--;
this.skipNextNewLine = false;
this._addGap();
return this.advance();
}
}
this.skipNextNewLine = false;
if (isSurrogate(cp)) {
cp = this._processSurrogate(cp);
}
//OPTIMIZATION: first check if code point is in the common allowed
//range (ASCII alphanumeric, whitespaces, big chunk of BMP)
//before going into detailed performance cost validation.
const isCommonValidRange = this.handler.onParseError === null ||
(cp > 0x1f && cp < 0x7f) ||
cp === $.LINE_FEED ||
cp === $.CARRIAGE_RETURN ||
(cp > 0x9f && cp < 64976);
if (!isCommonValidRange) {
this._checkForProblematicCharacters(cp);
}
return cp;
}
_checkForProblematicCharacters(cp) {
if (isControlCodePoint(cp)) {
this._err(ERR.controlCharacterInInputStream);
}
else if (isUndefinedCodePoint(cp)) {
this._err(ERR.noncharacterInInputStream);
}
}
retreat(count) {
this.pos -= count;
while (this.pos < this.lastGapPos) {
this.lastGapPos = this.gapStack.pop();
this.pos--;
}
this.isEol = false;
}
}