到目前为止发布的答案都无法正确转码包含完整 Unicode 范围内的代码点的输入字符串,例如“????????????????????????????????????????????????????????????????UnicodeSupport????est ????????????????????????????????♀️????????♂️????❤️????????????♀️」。
这就是我编写以下函数的原因,仅使用 Windows 和 MacOS 上都可用的 VBA 内置函数/语句。
此函数可跨平台和跨应用程序运行,并且适用于整个 Unicode 范围。
codePoints > 65535
即使内置 VBA,也受支持ChrW()
and AscW
不支持它们,因为转码完全是“手动”完成的,包括代理对。由于该函数在单个字节数组缓冲区上工作,因此性能也应该相对较好。如果有人发现错误或改进,请告诉我!
这段代码得到了改进,因为这个答案 https://codereview.stackexchange.com/a/284102/234277在 CodeReview 上,非常感谢克里斯蒂安·布斯 https://stackoverflow.com/users/8488913为了那个原因!
'Function transcoding an UTF-8 encoded string to the VBA-native UTF-16-LE
'Author: Guido Witt-Dörring, https://stackoverflow.com/a/75787820/12287457
' https://github.com/guwidoe/VBA-StringTools
Public Function DecodeUTF8(ByRef utf8Str As String, _
Optional ByVal raiseErrors As Boolean = False) As String
Const methodName As String = "DecodeUTF8"
Dim i As Long
Dim numBytesOfCodePoint As Byte
Static numBytesOfCodePoints(0 To 255) As Byte
Static mask(2 To 4) As Long
Static minCp(2 To 4) As Long
If numBytesOfCodePoints(0) = 0 Then
For i = &H0& To &H7F&: numBytesOfCodePoints(i) = 1: Next i '0xxxxxxx
'110xxxxx - C0 and C1 are invalid (overlong encoding)
For i = &HC2& To &HDF&: numBytesOfCodePoints(i) = 2: Next i
For i = &HE0& To &HEF&: numBytesOfCodePoints(i) = 3: Next i '1110xxxx
'11110xxx - 11110100, 11110101+ (= &HF5+) outside of valid Unicode range
For i = &HF0& To &HF4&: numBytesOfCodePoints(i) = 4: Next i
For i = 2 To 4: mask(i) = (2 ^ (7 - i) - 1): Next i
minCp(2) = &H80&: minCp(3) = &H800&: minCp(4) = &H10000
End If
Dim codepoint As Long
Dim currByte As Byte
Dim utf8() As Byte: utf8 = utf8Str
Dim utf16() As Byte: ReDim utf16(0 To (UBound(utf8) - LBound(utf8) + 1) * 2)
Dim j As Long: j = 0
Dim k As Long
i = LBound(utf8)
Do While i <= UBound(utf8)
codepoint = utf8(i)
numBytesOfCodePoint = numBytesOfCodePoints(codepoint)
If numBytesOfCodePoint = 0 Then
If raiseErrors Then Err.Raise 5, methodName, "Invalid byte"
GoTo insertErrChar
ElseIf numBytesOfCodePoint = 1 Then
utf16(j) = codepoint
j = j + 2
ElseIf i + numBytesOfCodePoint - 1 > UBound(utf8) Then
If raiseErrors Then Err.Raise 5, methodName, _
"Incomplete UTF-8 codepoint at end of string."
GoTo insertErrChar
Else
codepoint = utf8(i) And mask(numBytesOfCodePoint)
For k = 1 To numBytesOfCodePoint - 1
currByte = utf8(i + k)
If (currByte And &HC0&) = &H80& Then
codepoint = (codepoint * &H40&) + (currByte And &H3F)
Else
If raiseErrors Then _
Err.Raise 5, methodName, "Invalid continuation byte"
GoTo insertErrChar
End If
Next k
'Convert the Unicode codepoint to UTF-16LE bytes
If codepoint < minCp(numBytesOfCodePoint) Then
If raiseErrors Then Err.Raise 5, methodName, "Overlong encoding"
GoTo insertErrChar
ElseIf codepoint < &HD800& Then
utf16(j) = codepoint And &HFF&
utf16(j + 1) = codepoint \ &H100&
j = j + 2
ElseIf codepoint < &HE000& Then
If raiseErrors Then Err.Raise 5, methodName, _
"Invalid Unicode codepoint.(Range reserved for surrogate pairs)"
GoTo insertErrChar
ElseIf codepoint < &H10000 Then
If codepoint = &HFEFF& Then GoTo nextCp '(BOM - will be ignored)
utf16(j) = codepoint And &HFF&
utf16(j + 1) = codepoint \ &H100&
j = j + 2
ElseIf codepoint < &H110000 Then 'Calculate surrogate pair
Dim m As Long: m = codepoint - &H10000
Dim loSurrogate As Long: loSurrogate = &HDC00& Or (m And &H3FF)
Dim hiSurrogate As Long: hiSurrogate = &HD800& Or (m \ &H400&)
utf16(j) = hiSurrogate And &HFF&
utf16(j + 1) = hiSurrogate \ &H100&
utf16(j + 2) = loSurrogate And &HFF&
utf16(j + 3) = loSurrogate \ &H100&
j = j + 4
Else
If raiseErrors Then Err.Raise 5, methodName, _
"Codepoint outside of valid Unicode range"
insertErrChar: utf16(j) = &HFD
utf16(j + 1) = &HFF
j = j + 2
If numBytesOfCodePoint = 0 Then numBytesOfCodePoint = 1
End If
End If
nextCp: i = i + numBytesOfCodePoint 'Move to the next UTF-8 codepoint
Loop
DecodeUTF8 = MidB$(utf16, 1, j)
End Function