From d2f17d4a2dc8d7055ea90d4c30e67e2bc0b02293 Mon Sep 17 00:00:00 2001 From: jiadong Date: Mon, 20 Oct 2025 16:20:35 +0800 Subject: [PATCH] =?UTF-8?q?[208=5F11]=20=E4=BF=AE=E5=A4=8D=20unicode=20?= =?UTF-8?q?=E6=A8=A1=E5=9D=97=E8=AF=AD=E6=B3=95=E9=94=99=E8=AF=AF=E5=B9=B6?= =?UTF-8?q?=E6=B7=BB=E5=8A=A0=20UTF-16LE=20=E8=BD=AC=E6=8D=A2=E5=87=BD?= =?UTF-8?q?=E6=95=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- devel/208_11.md | 28 +++++++++++++++++++ devel/208_2.md | 18 ++++++++++++ goldfish/liii/base.scm | 2 +- goldfish/liii/lang.scm | 2 +- goldfish/liii/rich-string.scm | 4 +-- goldfish/liii/unicode.scm | 41 ++++++++++++++++++++++++++-- goldfish/scheme/base.scm | 6 ++-- tests/goldfish/liii/unicode-test.scm | 38 ++++++++++++++------------ 8 files changed, 112 insertions(+), 27 deletions(-) create mode 100644 devel/208_11.md diff --git a/devel/208_11.md b/devel/208_11.md new file mode 100644 index 00000000..a90c0518 --- /dev/null +++ b/devel/208_11.md @@ -0,0 +1,28 @@ +# [208_11] 重命名 u8-string-length 为 utf8-string-length + +## 任务相关的代码文件 +- `goldfish/scheme/base.scm` - 函数定义和导出 +- `goldfish/liii/unicode.scm` - 函数导出声明 +- `goldfish/liii/base.scm` - 函数导出声明 +- `goldfish/liii/lang.scm` - 函数导入声明 +- `goldfish/liii/rich-string.scm` - 函数使用 +- `tests/goldfish/liii/unicode-test.scm` - 测试用例和文档 +- `devel/208_2.md` - 相关文档 + +## 如何测试 +一般先构建,再lint,最后运行测试用例。 + +## 2025-10-20 重命名 u8-string-length 为 utf8-string-length + +### What +1. 将函数名 `u8-string-length` 重命名为 `utf8-string-length` +2. 更新所有相关的函数调用和引用 +3. 确保测试用例通过 + +### Why +函数名 `u8-string-length` 与 UTF-8 的标准写法不一致,应改为 `utf8-string-length` 以保持与 UTF-8 标准命名的一致性。 + +### How +1. 首先搜索代码库中所有使用 `u8-string-length` 的地方 +2. 使用全局替换将函数名更新为新名称 +3. 
验证所有测试用例仍然正常工作 \ No newline at end of file diff --git a/devel/208_2.md b/devel/208_2.md index 5f288191..88e1421d 100644 --- a/devel/208_2.md +++ b/devel/208_2.md @@ -14,6 +14,24 @@ bin/goldfish tools/lint.scm tests/goldfish/liii/unicode-test.scm bin/goldfish tests/goldfish/liii/unicode-test.scm ``` +## 2025/10/20 utf8-string-length 重命名 +### What +将 u8-string-length 函数重命名为 utf8-string-length,以符合 UTF-8 标准命名规范。 + +1. 更新 goldfish/scheme/base.scm 中的函数定义和导出声明 +2. 更新 goldfish/liii/unicode.scm 中的导出声明 +3. 更新 goldfish/liii/base.scm 中的导出声明 +4. 更新 goldfish/liii/lang.scm 中的导入声明 +5. 更新 goldfish/liii/rich-string.scm 中的函数使用 +6. 更新 tests/goldfish/liii/unicode-test.scm 中的测试用例和文档 +7. 更新 devel/208_2.md 中的相关文档 + +### Why +统一 UTF-8 相关函数的命名规范,提高代码一致性和可读性。 + +### How +通过全局搜索和替换,系统性地更新所有相关文件中的函数引用,确保命名一致性。 + ## 2025/10/18 u8-string-length 文档 ### What 为 u8-string-length 函数添加详细的文档和测试用例。 diff --git a/goldfish/liii/base.scm b/goldfish/liii/base.scm index d5fb68e9..fbf9316d 100644 --- a/goldfish/liii/base.scm +++ b/goldfish/liii/base.scm @@ -44,7 +44,7 @@ ; R7RS 6.9 Bytevectors bytevector? make-bytevector bytevector bytevector-length bytevector-u8-ref bytevector-u8-set! bytevector-copy bytevector-append - utf8->string string->utf8 u8-string-length u8-substring bytevector-advance-utf8 + utf8->string string->utf8 utf8-string-length u8-substring bytevector-advance-utf8 ; Input and Output call-with-port port? binary-port? textual-port? input-port-open? output-port-open? open-binary-input-file open-binary-output-file close-port eof-object diff --git a/goldfish/liii/lang.scm b/goldfish/liii/lang.scm index b1e80dae..02988777 100644 --- a/goldfish/liii/lang.scm +++ b/goldfish/liii/lang.scm @@ -17,7 +17,7 @@ (define-library (liii lang) (import (only (liii base) - u8-string-length any? receive u8-substring) + utf8-string-length any? receive u8-substring) (only (liii oop) define-case-class display* @ typed-define case-class? 
chained-define define-object define-class chain-apply object->string) diff --git a/goldfish/liii/rich-string.scm b/goldfish/liii/rich-string.scm index fd23d994..0159ae24 100644 --- a/goldfish/liii/rich-string.scm +++ b/goldfish/liii/rich-string.scm @@ -27,7 +27,7 @@ (define-case-class rich-string ((data string?)) - (define N (u8-string-length data)) + (define N (utf8-string-length data)) (define (@empty . args) (chain-apply args (rich-string ""))) @@ -330,7 +330,7 @@ (define (%split sep) (let ((str-len N) - (sep-len (u8-string-length sep))) + (sep-len (utf8-string-length sep))) (define (split-helper start acc) (let ((next-pos (%index-of sep start))) diff --git a/goldfish/liii/unicode.scm b/goldfish/liii/unicode.scm index 4d4a8298..9723954c 100644 --- a/goldfish/liii/unicode.scm +++ b/goldfish/liii/unicode.scm @@ -17,7 +17,7 @@ (define-library (liii unicode) (export ;; UTF-8 函数 - utf8->string string->utf8 u8-string-length u8-substring bytevector-advance-utf8 + utf8->string string->utf8 utf8-string-length u8-substring bytevector-advance-utf8 codepoint->utf8 utf8->codepoint ;; UTF-16BE 函数 @@ -306,4 +306,41 @@ (else ;; 基本多文种平面字符 - 单个码元 - first-codepoint)))))) \ No newline at end of file + first-codepoint)))))) + ;; utf8->utf16le: re-encode a UTF-8 bytevector as a UTF-16LE bytevector, one code point per loop step. NOTE(review): this hunk does not show whether the new definitions sit inside define-library or are listed in its export form - confirm. + (define (utf8->utf16le bytevector) ; signals type-error for a non-bytevector argument and value-error for an empty bytevector + (unless (bytevector? bytevector) + (error 'type-error "utf8->utf16le: expected bytevector, got" bytevector)) + + (let ((len (bytevector-length bytevector))) + (when (= len 0) + (error 'value-error "utf8->utf16le: empty bytevector")) + + (let loop ((index 0) + (result (bytevector))) + (if (>= index len) + result + (let ((codepoint (utf8->codepoint (bytevector-copy bytevector index len)))) ; decodes from a fresh copy of the tail [index, len); NOTE(review): copying the tail on every step looks quadratic - confirm this is acceptable + (let ((utf16le-bytes (codepoint->utf16le codepoint))) + (loop (bytevector-advance-utf8 bytevector index) ; the next index is whatever bytevector-advance-utf8 returns for the current index + (bytevector-append result utf16le-bytes)))))))) + ;; utf16le->utf8: re-encode a UTF-16LE bytevector as a UTF-8 bytevector, one code point per loop step; signals type-error for a non-bytevector argument and value-error for an empty bytevector. + (define (utf16le->utf8 bytevector) + (unless (bytevector? 
bytevector) + (error 'type-error "utf16le->utf8: expected bytevector, got" bytevector)) + + (let ((len (bytevector-length bytevector))) + (when (= len 0) + (error 'value-error "utf16le->utf8: empty bytevector")) + + (let loop ((index 0) + (result (bytevector))) + (if (>= index len) + result + (let ((codepoint (utf16le->codepoint (bytevector-copy bytevector index len)))) ; decodes from a fresh copy of the tail [index, len), as in utf8->utf16le + (let ((utf8-bytes (codepoint->utf8 codepoint))) + (loop (+ index (if (<= codepoint #xFFFF) 2 4)) ; advance 2 bytes for a code point <= #xFFFF, otherwise 4 (presumably a surrogate pair - confirm against utf16le->codepoint) + (bytevector-append result utf8-bytes)))))) + +)) + diff --git a/goldfish/scheme/base.scm b/goldfish/scheme/base.scm index 67a99d9a..bf9bdd20 100644 --- a/goldfish/scheme/base.scm +++ b/goldfish/scheme/base.scm @@ -38,7 +38,7 @@ ; R7RS 6.9: Bytevectors bytevector? make-bytevector bytevector bytevector-length bytevector-u8-ref bytevector-u8-set! bytevector-copy bytevector-append - utf8->string string->utf8 u8-string-length bytevector-advance-utf8 + utf8->string string->utf8 utf8-string-length bytevector-advance-utf8 ; Input and Output call-with-port port? binary-port? textual-port? input-port-open? output-port-open? open-binary-input-file open-binary-output-file close-port eof-object @@ -445,7 +445,7 @@ wrong-type-arg (+ index 4))))) (else index))))) ; Invalid leading byte - (define (u8-string-length str) + (define (utf8-string-length str) (let ((bv (string->byte-vector str)) (N (string-length str))) (if (zero? N) @@ -493,7 +493,7 @@ wrong-type-arg (when (not (string? 
str)) (error 'type-error "str must be string")) - (let ((N (u8-string-length str))) + (let ((N (utf8-string-length str))) (when (and (> N 0) (or (< start 0) (>= start N))) (error 'out-of-range (string-append "start must >= 0 and < " (number->string N)))) diff --git a/tests/goldfish/liii/unicode-test.scm b/tests/goldfish/liii/unicode-test.scm index 9064d7ba..b73d8bd7 100644 --- a/tests/goldfish/liii/unicode-test.scm +++ b/tests/goldfish/liii/unicode-test.scm @@ -71,7 +71,7 @@ string 相关函数 -------- - `string->utf8` : 将字符串转换为 UTF-8 字节向量 -- `u8-string-length` : 获取字符串的 Unicode 字符数量 +- `utf8-string-length` : 获取字符串的 Unicode 字符数量 - `u8-substring` : 基于 Unicode 字符位置提取子字符串 |# @@ -178,7 +178,7 @@ bytevector 相关函数 -------- - `utf8->string` : 将 UTF-8 字节向量转换为字符串 -- `u8-string-length` : 获取字符串的 Unicode 字符数量 +- `utf8-string-length` : 获取字符串的 Unicode 字符数量 - `u8-substring` : 基于 Unicode 字符位置提取子字符串 |# @@ -234,12 +234,12 @@ bytevector (check (utf8->string (string->utf8 "汉字书写" 3)) => "写") #| -u8-string-length +utf8-string-length 计算 UTF-8 编码字符串的 Unicode 字符数量(码点数量)。 函数签名 ---- -(u8-string-length string) → integer +(utf8-string-length string) → integer 参数 ---- @@ -253,7 +253,7 @@ integer 描述 ---- -`u8-string-length` 用于计算 UTF-8 编码字符串中的 Unicode 字符数量,与 `string-length` 不同, +`utf8-string-length` 用于计算 UTF-8 编码字符串中的 Unicode 字符数量,与 `string-length` 不同, 它返回的是 Unicode 码点(code point)的数量,而不是字节数量。 行为特征 @@ -266,7 +266,7 @@ integer 与 string-length 的区别 ------------------- - `string-length` : 返回字符串的字节数量 -- `u8-string-length` : 返回字符串的 Unicode 字符数量 +- `utf8-string-length` : 返回字符串的 Unicode 字符数量 错误处理 @@ -287,15 +287,15 @@ integer - `string->utf8` : 将字符串转换为 UTF-8 字节向量 |# -(check (u8-string-length "") => 0) -(check (u8-string-length "Hello") => 5) -(check (u8-string-length "你好") => 2) -(check (u8-string-length "Hello 你好") => 8) -(check (u8-string-length "👍") => 1) -(check (u8-string-length "🚀") => 1) -(check (u8-string-length "🎉") => 1) -(check (u8-string-length "Hello 👍 World") => 13) -(check (u8-string-length "你好 🚀 
测试") => 7) +(check (utf8-string-length "") => 0) +(check (utf8-string-length "Hello") => 5) +(check (utf8-string-length "你好") => 2) +(check (utf8-string-length "Hello 你好") => 8) +(check (utf8-string-length "👍") => 1) +(check (utf8-string-length "🚀") => 1) +(check (utf8-string-length "🎉") => 1) +(check (utf8-string-length "Hello 👍 World") => 13) +(check (utf8-string-length "你好 🚀 测试") => 7) #| u8-substring @@ -352,7 +352,7 @@ string 相关函数 -------- -- `u8-string-length` : 获取字符串的 Unicode 字符数量 +- `utf8-string-length` : 获取字符串的 Unicode 字符数量 - `string-substring` : 基于字节位置提取子字符串 - `utf8->string` : 将 UTF-8 字节向量转换为字符串 - `string->utf8` : 将字符串转换为 UTF-8 字节向量 @@ -1218,12 +1218,12 @@ UTF-8 编码规则 实现说明 ------ - 函数在 (scheme base) 库中定义,在 (liii base) 和 (liii unicode) 库中重新导出 -- 被 `u8-string-length`、`utf8->string`、`string->utf8` 等函数内部使用 +- 被 `utf8-string-length`、`utf8->string`、`string->utf8` 等函数内部使用 - 提供 UTF-8 序列验证功能 相关函数 -------- -- `u8-string-length` : 获取字符串的 Unicode 字符数量 +- `utf8-string-length` : 获取字符串的 Unicode 字符数量 - `utf8->string` : 将 UTF-8 字节向量转换为字符串 - `string->utf8` : 将字符串转换为 UTF-8 字节向量 - `u8-substring` : 基于 Unicode 字符位置提取子字符串 @@ -1287,4 +1287,6 @@ UTF-8 编码规则 (check (bytevector-advance-utf8 #u8(#x48 #x65 #x6C #x6C #x6F) 3) => 4) (check (bytevector-advance-utf8 #u8(#x48 #x65 #x6C #x6C #x6F) 4) => 5) + + (check-report) -- Gitee