-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsimdutf.go
More file actions
125 lines (112 loc) · 3.61 KB
/
simdutf.go
File metadata and controls
125 lines (112 loc) · 3.61 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
// Package simdutf is a Go wrapper around the [simdutf] Unicode validation and
// transcoding library.
//
// [simdutf]: https://github.com/simdutf/simdutf/
package simdutf
// #cgo CFLAGS: -O2 -g
//
// #cgo noescape validate_ascii
// #cgo nocallback validate_ascii
// #cgo noescape validate_utf8
// #cgo nocallback validate_utf8
//
// #include "simdutf_ext.h"
import "C"
import (
"sync"
"unicode/utf8"
"unsafe"
)
// versionOnce returns the simdutf version string. It is built with
// sync.OnceValue, so the cgo call to simdutf_version and the conversion to a
// Go string happen on the first invocation only; later invocations return the
// cached result.
var versionOnce = sync.OnceValue(func() string {
	cVersion := C.simdutf_version()
	return C.GoString(cVersion)
})
// Version returns the version of the linked simdutf library.
//
// The value is obtained through versionOnce, which computes it on first use
// and caches it afterwards.
func Version() string {
	version := versionOnce()
	return version
}
// IsASCII reports whether p consists entirely of valid ASCII-encoded runes.
//
// Inputs of at most cutoffIsASCII bytes are checked by the pure Go isASCII;
// longer inputs are handed to validateASCII.
func IsASCII(p []byte) bool {
	n := len(p)
	if n > cutoffIsASCII {
		// &p[0] is only evaluated on this branch.
		// NOTE(review): assumes cutoffIsASCII is non-negative, so p is
		// non-empty here — confirm against its definition.
		return validateASCII(&p[0], n)
	}
	return isASCII(p)
}
// IsASCIIString reports whether s consists entirely of valid ASCII-encoded runes.
//
// Strings of at most cutoffIsASCII bytes are checked by the pure Go
// isASCIIString; longer strings are handed to validateASCII using the
// string's underlying bytes, without copying.
func IsASCIIString(s string) bool {
	n := len(s)
	if n > cutoffIsASCII {
		return validateASCII(unsafe.StringData(s), n)
	}
	return isASCIIString(s)
}
// Valid reports whether p consists entirely of valid UTF-8-encoded runes.
//
// Inputs of at most cutoffValid bytes are validated with utf8.Valid; longer
// inputs are handed to validateUTF8.
func Valid(p []byte) bool {
	n := len(p)
	if n > cutoffValid {
		// &p[0] is only evaluated on this branch.
		// NOTE(review): assumes cutoffValid is non-negative, so p is
		// non-empty here — confirm against its definition.
		return validateUTF8(&p[0], n)
	}
	return utf8.Valid(p)
}
// ValidString reports whether s consists entirely of valid UTF-8-encoded runes.
//
// Strings of at most cutoffValid bytes are validated with utf8.ValidString;
// longer strings are handed to validateUTF8 using the string's underlying
// bytes, without copying.
func ValidString(s string) bool {
	n := len(s)
	if n > cutoffValid {
		return validateUTF8(unsafe.StringData(s), n)
	}
	return utf8.ValidString(s)
}
// isASCII is the pure Go implementation behind IsASCII. It is used when the
// input is too small to justify the overhead of calling into the simdutf
// library.
//
// It returns true when no byte of p has its high bit set (that is, every byte
// is below utf8.RuneSelf), and true for an empty slice.
func isASCII(p []byte) bool {
	// A mask with the high bit of each of four packed bytes set.
	const highBits = 0x80808080

	// Cap the capacity at the length so that the compiler does not need to
	// recompute the capacity when generating code for the re-slice by 8
	// below.
	rest := p[:len(p):len(p)]

	// Fast path: examine 8 bytes per iteration. The bytes are combined as
	// two 32-bit little-endian words so that the same code serves 32-bit
	// and 64-bit platforms; on many platforms the compiler can turn each
	// word into a single 32-bit load (see test/codegen/memcombine.go).
	for ; len(rest) >= 8; rest = rest[8:] {
		lo := uint32(rest[0]) | uint32(rest[1])<<8 | uint32(rest[2])<<16 | uint32(rest[3])<<24
		hi := uint32(rest[4]) | uint32(rest[5])<<8 | uint32(rest[6])<<16 | uint32(rest[7])<<24
		if (lo|hi)&highBits != 0 {
			// At least one of these 8 bytes is >= utf8.RuneSelf.
			return false
		}
	}

	// Tail: fewer than 8 bytes remain, check them one at a time.
	for _, b := range rest {
		if b >= utf8.RuneSelf {
			return false
		}
	}
	return true
}
// isASCIIString is the pure Go implementation behind IsASCIIString. It is used
// when the input is too small to justify the overhead of calling into the
// simdutf library.
//
// It returns true when no byte of s has its high bit set (that is, every byte
// is below utf8.RuneSelf), and true for the empty string.
func isASCIIString(s string) bool {
	// A mask with the high bit of each of four packed bytes set.
	const highBits = 0x80808080

	// Fast path: examine 8 bytes per iteration. The bytes are combined as
	// two 32-bit little-endian words so that the same code serves 32-bit
	// and 64-bit platforms; on many platforms the compiler can turn each
	// word into a single 32-bit load (see test/codegen/memcombine.go).
	for ; len(s) >= 8; s = s[8:] {
		lo := uint32(s[0]) | uint32(s[1])<<8 | uint32(s[2])<<16 | uint32(s[3])<<24
		hi := uint32(s[4]) | uint32(s[5])<<8 | uint32(s[6])<<16 | uint32(s[7])<<24
		if (lo|hi)&highBits != 0 {
			// At least one of these 8 bytes is >= utf8.RuneSelf.
			return false
		}
	}

	// Tail: fewer than 8 bytes remain, check them one at a time. Index by
	// byte; ranging over the string directly would decode runes instead.
	for i, n := 0, len(s); i < n; i++ {
		if s[i] >= utf8.RuneSelf {
			return false
		}
	}
	return true
}
// validateASCII calls the C function validate_ascii on the n bytes starting
// at p and returns its result as a Go bool.
//
// The cgo preamble of this file declares validate_ascii as noescape and
// nocallback.
func validateASCII(p *byte, n int) bool {
	data := (*C.char)(unsafe.Pointer(p))
	ok := C.validate_ascii(data, C.size_t(n))
	return bool(ok)
}
// validateUTF8 calls the C function validate_utf8 on the n bytes starting at
// p and returns its result as a Go bool.
//
// The cgo preamble of this file declares validate_utf8 as noescape and
// nocallback.
func validateUTF8(p *byte, n int) bool {
	data := (*C.char)(unsafe.Pointer(p))
	ok := C.validate_utf8(data, C.size_t(n))
	return bool(ok)
}