From 8dc443b2b7994f1e72f62dbabc77c444dfb05d21 Mon Sep 17 00:00:00 2001 From: chengkai3 Date: Mon, 8 Jun 2026 23:39:37 +0800 Subject: [PATCH] =?UTF-8?q?[fix]:[FL-52][ATP=E6=96=87=E6=9C=AC=E8=BD=AC?= =?UTF-8?q?=E6=8D=A2=E4=B8=8E=E9=A2=84=E8=A7=88=E5=AF=BC=E5=85=A5ATP?= =?UTF-8?q?=E5=90=8E=E4=B8=AD=E6=96=87=E4=B9=B1=E7=A0=81]?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: multica-agent --- memory/2026-06-08.md | 24 ++++ .../app/admin/power-lines/atp-viewer/page.tsx | 3 +- web/src/lib/text-file.test.js | 28 ++++ web/src/lib/text-file.ts | 120 ++++++++++++++++++ 4 files changed, 174 insertions(+), 1 deletion(-) create mode 100644 web/src/lib/text-file.test.js create mode 100644 web/src/lib/text-file.ts diff --git a/memory/2026-06-08.md b/memory/2026-06-08.md index da7fa17..6bbb65b 100644 --- a/memory/2026-06-08.md +++ b/memory/2026-06-08.md @@ -92,3 +92,27 @@ - 风险与关注点: - 当前本地环境仍不具备完整后端依赖,无法直接回归所有 FastAPI/SQLAlchemy 相关测试;本次验证聚焦在 Wine 探测逻辑和语法层面。 + +## Work Log - 修复 ATP 文本上传中文乱码(2026-06-08) + +- 背景: + - `ATP文本转换与预览` 页面上传 `.atp/.txt` 文件时直接使用 `file.text()`,浏览器会按 UTF-8 解码。 + - 现场 ATP 文件常见为 Windows 导出的 GBK/GB18030 文本,导致导入后编辑区和转换预览中的中文注释/名称乱码。 + +- 本次处理: + - `web/src/lib/text-file.ts` + - 新增前端文本解码工具,优先识别 BOM、兼容无 BOM 的 UTF-16,并在 UTF-8 严格解码失败时回退到 `GB18030`。 + - `web/src/app/admin/power-lines/atp-viewer/page.tsx` + - 上传 ATP 文本时改为基于 `arrayBuffer + TextDecoder` 自动判定编码,不再固定走 UTF-8。 + - `web/src/lib/text-file.test.js` + - 补充最小测试,覆盖 `UTF-8`、`GB18030(兼容 GBK)`、无 BOM `UTF-16LE` 三类输入。 + +- 验证: + - 基线:`npm_config_cache=/tmp/npm-cache npm --workspace web exec tsc --noEmit` 通过;`npm_config_cache=/tmp/npm-cache npm --workspace web exec eslint src/app/admin/power-lines/atp-viewer/page.tsx` 通过。 + - 修改后: + - `npm_config_cache=/tmp/npm-cache npm --workspace web exec tsc --noEmit` + - `npm_config_cache=/tmp/npm-cache npm --workspace web exec eslint src/app/admin/power-lines/atp-viewer/page.tsx src/lib/text-file.ts 
src/lib/text-file.test.js` + - `node --test web/src/lib/text-file.test.js` + +- 风险与关注点: + - 已经以错误编码写入数据库的历史 ATP 文本不会被自动修复;本次修复只覆盖后续上传与预览入口。 diff --git a/web/src/app/admin/power-lines/atp-viewer/page.tsx b/web/src/app/admin/power-lines/atp-viewer/page.tsx index f4be174..8b3206e 100644 --- a/web/src/app/admin/power-lines/atp-viewer/page.tsx +++ b/web/src/app/admin/power-lines/atp-viewer/page.tsx @@ -30,6 +30,7 @@ import { useTopicSubscription } from "@/hooks/use-topic-subscription"; import { readApiError } from "@/lib/api"; import { parseAtpTextToGraphJson, stringifyAtpGraphJson } from "@/lib/atp/parse-atp-text"; import { ATP_SAMPLE_TEXT } from "@/lib/atp/sample"; +import { readTextFile } from "@/lib/text-file"; import type { AtpGraphJson } from "@/lib/atp/types"; import type { AtpEngineStatusResponse, @@ -611,7 +612,7 @@ export default function PowerLinesAtpViewerPage() { const handleFileSelected = async (file: File) => { try { - const content = await file.text(); + const { text: content } = await readTextFile(file); setSourceText(content); versionForm.setFieldValue("atp_text", content); setParseError(""); diff --git a/web/src/lib/text-file.test.js b/web/src/lib/text-file.test.js new file mode 100644 index 0000000..113a663 --- /dev/null +++ b/web/src/lib/text-file.test.js @@ -0,0 +1,28 @@ +import assert from "node:assert/strict"; +import test from "node:test"; + +import { decodeTextBytes } from "./text-file.ts"; + +test("decodeTextBytes keeps utf-8 ATP text intact", () => { + const bytes = new TextEncoder().encode("中文ATP线路"); + const decoded = decodeTextBytes(bytes); + + assert.equal(decoded.encoding, "utf-8"); + assert.equal(decoded.text, "中文ATP线路"); +}); + +test("decodeTextBytes falls back to gb18030 for gbk ATP text", () => { + const bytes = Uint8Array.from([214, 208, 206, 196, 65, 84, 80, 207, 223, 194, 183]); + const decoded = decodeTextBytes(bytes); + + assert.equal(decoded.encoding, "gb18030"); + assert.equal(decoded.text, "中文ATP线路"); +}); + 
+test("decodeTextBytes detects utf-16le ATP text without bom", () => {
+  const bytes = Uint8Array.from(Buffer.from("ATP线路", "utf16le"));
+  const decoded = decodeTextBytes(bytes);
+
+  assert.equal(decoded.encoding, "utf-16le");
+  assert.equal(decoded.text, "ATP线路");
+});
diff --git a/web/src/lib/text-file.ts b/web/src/lib/text-file.ts
new file mode 100644
index 0000000..33081a1
--- /dev/null
+++ b/web/src/lib/text-file.ts
@@ -0,0 +1,157 @@
+/** Encodings that `decodeTextBytes` can report for decoded text. */
+export type DecodedTextEncoding = "utf-8" | "utf-16le" | "utf-16be" | "gb18030";
+
+/** Decoded text together with the encoding that was used to decode it. */
+export type DecodedTextFile = {
+  text: string;
+  encoding: DecodedTextEncoding;
+};
+
+const UTF8_BOM = Uint8Array.from([0xef, 0xbb, 0xbf]);
+const UTF16_LE_BOM = Uint8Array.from([0xff, 0xfe]);
+const UTF16_BE_BOM = Uint8Array.from([0xfe, 0xff]);
+
+// BOM signatures paired with the encoding they announce, in checking order.
+const BOM_ENCODINGS: ReadonlyArray<readonly [Uint8Array, DecodedTextEncoding]> = [
+  [UTF8_BOM, "utf-8"],
+  [UTF16_LE_BOM, "utf-16le"],
+  [UTF16_BE_BOM, "utf-16be"],
+];
+
+// Shown to the user whenever no usable decoding could be produced.
+const UNRECOGNIZED_ENCODING_MESSAGE =
+  "无法识别文件编码,请将 ATP 文本另存为 UTF-8 或 GB18030 后重试。";
+
+/** Returns true when `bytes` begins with every byte of `bom`. */
+function startsWithBom(bytes: Uint8Array, bom: Uint8Array): boolean {
+  if (bytes.length < bom.length) {
+    return false;
+  }
+  return bom.every((value, index) => bytes[index] === value);
+}
+
+/**
+ * Decodes `bytes` with `encoding`, returning null instead of throwing.
+ *
+ * `TextDecoder` throws when the runtime does not support the encoding label, or
+ * when `options.fatal` is set and the input is malformed; both map to null here.
+ */
+function tryDecode(
+  bytes: Uint8Array,
+  encoding: DecodedTextEncoding,
+  options?: TextDecoderOptions,
+): string | null {
+  try {
+    return new TextDecoder(encoding, options).decode(bytes);
+  } catch {
+    return null;
+  }
+}
+
+/**
+ * Guesses BOM-less UTF-16 from the share of NUL bytes in the first 512 bytes.
+ *
+ * Returns null for inputs shorter than 8 bytes or when neither byte parity is
+ * dominated by NUL bytes.
+ */
+function detectUtf16WithoutBom(bytes: Uint8Array): DecodedTextEncoding | null {
+  if (bytes.length < 8) {
+    return null;
+  }
+
+  const sampleSize = Math.min(bytes.length, 512);
+  let evenZeroCount = 0;
+  let oddZeroCount = 0;
+  let evenCount = 0;
+  let oddCount = 0;
+
+  // ATP text usually starts with ASCII keywords, so alternating NUL bytes are a
+  // clear sign of BOM-less UTF-16.
+  for (let index = 0; index < sampleSize; index += 1) {
+    if (index % 2 === 0) {
+      evenCount += 1;
+      if (bytes[index] === 0) {
+        evenZeroCount += 1;
+      }
+      continue;
+    }
+
+    oddCount += 1;
+    if (bytes[index] === 0) {
+      oddZeroCount += 1;
+    }
+  }
+
+  const evenZeroRatio = evenCount === 0 ? 0 : evenZeroCount / evenCount;
+  const oddZeroRatio = oddCount === 0 ? 0 : oddZeroCount / oddCount;
+
+  // ASCII code units have their NUL high byte at odd offsets in little-endian
+  // order and at even offsets in big-endian order.
+  if (oddZeroRatio >= 0.3 && evenZeroRatio <= 0.05) {
+    return "utf-16le";
+  }
+  if (evenZeroRatio >= 0.3 && oddZeroRatio <= 0.05) {
+    return "utf-16be";
+  }
+  return null;
+}
+
+/**
+ * Decodes raw file bytes, picking the encoding in this order: BOM, BOM-less
+ * UTF-16 heuristic, strict UTF-8, then GB18030 as the fallback.
+ *
+ * @param bytes Raw file content.
+ * @returns The decoded text and the encoding used; empty input yields "" as utf-8.
+ * @throws Error when a BOM announces an encoding that cannot be decoded here, or
+ *   when every candidate decoder fails.
+ */
+export function decodeTextBytes(bytes: Uint8Array): DecodedTextFile {
+  if (bytes.length === 0) {
+    return { text: "", encoding: "utf-8" };
+  }
+
+  for (const [bom, encoding] of BOM_ENCODINGS) {
+    if (!startsWithBom(bytes, bom)) {
+      continue;
+    }
+    const text = tryDecode(bytes, encoding);
+    if (text === null) {
+      // A BOM is authoritative: guessing another encoding or returning "" would
+      // silently replace the user's content, so report the failure instead.
+      throw new Error(UNRECOGNIZED_ENCODING_MESSAGE);
+    }
+    return { text, encoding };
+  }
+
+  const utf16Encoding = detectUtf16WithoutBom(bytes);
+  if (utf16Encoding) {
+    const utf16Text = tryDecode(bytes, utf16Encoding);
+    if (utf16Text !== null) {
+      return { text: utf16Text, encoding: utf16Encoding };
+    }
+  }
+
+  // Strict decoding makes malformed UTF-8 fail so that GBK/GB18030 files reach
+  // the fallback below.
+  const utf8Text = tryDecode(bytes, "utf-8", { fatal: true });
+  if (utf8Text !== null) {
+    return { text: utf8Text, encoding: "utf-8" };
+  }
+
+  const gb18030Text = tryDecode(bytes, "gb18030");
+  if (gb18030Text !== null) {
+    return { text: gb18030Text, encoding: "gb18030" };
+  }
+
+  throw new Error(UNRECOGNIZED_ENCODING_MESSAGE);
+}
+
+/**
+ * Reads a `Blob`/`File` as bytes and decodes it with `decodeTextBytes`.
+ *
+ * @throws Error under the same conditions as `decodeTextBytes`.
+ */
+export async function readTextFile(file: Blob): Promise<DecodedTextFile> {
+  const buffer = new Uint8Array(await file.arrayBuffer());
+  return decodeTextBytes(buffer);
+}