simdutf/simdutf.go at master · charlievieth/simdutf · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
// Package simdutf is a Go wrapper around the [simdutf] Unicode validation and
// transcoding library.
//
// [simdutf]: https://github.com/simdutf/simdutf/
package simdutf

// #cgo CFLAGS: -O2 -g
//
// #cgo noescape validate_ascii
// #cgo nocallback validate_ascii
// #cgo noescape validate_utf8
// #cgo nocallback validate_utf8
//
// #include "simdutf_ext.h"
import "C"

import (
	"sync"
	"unicode/utf8"
	"unsafe"
)

// versionOnce fetches the version string from the simdutf C library the first
// time it is invoked and returns the cached Go string on every later call, so
// the cgo call and string copy happen at most once.
var versionOnce = sync.OnceValue(func() string {
	cVersion := C.simdutf_version()
	return C.GoString(cVersion)
})

// Version reports the version string of the simdutf library this package is
// linked against. The value is computed by versionOnce on first use and reused
// afterwards.
func Version() string {
	version := versionOnce()
	return version
}

// IsASCII reports whether every byte of p is an ASCII character.
//
// Inputs longer than cutoffIsASCII are handed to the simdutf library via
// validateASCII; anything shorter is checked by the pure Go isASCII.
func IsASCII(p []byte) bool {
	n := len(p)
	if n > cutoffIsASCII {
		return validateASCII(&p[0], n)
	}
	return isASCII(p)
}

// IsASCIIString reports whether every byte of s is an ASCII character.
//
// Inputs longer than cutoffIsASCII are handed to the simdutf library via
// validateASCII; anything shorter is checked by the pure Go isASCIIString.
func IsASCIIString(s string) bool {
	n := len(s)
	if n > cutoffIsASCII {
		return validateASCII(unsafe.StringData(s), n)
	}
	return isASCIIString(s)
}

// Valid reports whether p is made up entirely of valid UTF-8-encoded runes.
//
// Inputs longer than cutoffValid are handed to the simdutf library via
// validateUTF8; anything shorter is checked with utf8.Valid.
func Valid(p []byte) bool {
	n := len(p)
	if n > cutoffValid {
		return validateUTF8(&p[0], n)
	}
	return utf8.Valid(p)
}

// ValidString reports whether s is made up entirely of valid UTF-8-encoded
// runes.
//
// Inputs longer than cutoffValid are handed to the simdutf library via
// validateUTF8; anything shorter is checked with utf8.ValidString.
func ValidString(s string) bool {
	n := len(s)
	if n > cutoffValid {
		return validateUTF8(unsafe.StringData(s), n)
	}
	return utf8.ValidString(s)
}

// isASCII is a pure Go implementation of IsASCII and is used when the input
// is too small to justify the overhead of calling into the simdutf library.
//
// It reports whether every byte of p is below utf8.RuneSelf. A nil or empty
// slice is reported as ASCII.
func isASCII(p []byte) bool {
	// Capping the capacity at the length avoids the need to recompute the
	// capacity when generating code for p[8:].
	p = p[:len(p):len(p)]

	// Fast path. Check for and skip 8 bytes of ASCII characters per iteration.
	for len(p) >= 8 {
		// Combining two 32 bit loads allows the same code to be used
		// for 32 and 64 bit platforms.
		// The compiler can generate a 32bit load for first32 and second32
		// on many platforms. See test/codegen/memcombine.go.
		first32 := uint32(p[0]) | uint32(p[1])<<8 | uint32(p[2])<<16 | uint32(p[3])<<24
		second32 := uint32(p[4]) | uint32(p[5])<<8 | uint32(p[6])<<16 | uint32(p[7])<<24
		if (first32|second32)&0x80808080 != 0 {
			// At least one of these 8 bytes is >= RuneSelf. The answer is
			// already known, so return right away (as isASCIIString does)
			// instead of rescanning the chunk byte by byte.
			return false
		}
		p = p[8:]
	}

	// Fewer than 8 bytes remain; check them one at a time.
	for _, c := range p {
		if c >= utf8.RuneSelf {
			return false
		}
	}
	return true
}

// isASCIIString is a pure Go implementation of IsASCIIString and is used when
// the input is too small to justify the overhead of calling into the simdutf
// library.
//
// It reports whether every byte of s is below utf8.RuneSelf. The empty string
// is reported as ASCII.
func isASCIIString(s string) bool {
	// A set high bit in any byte of a 32-bit word marks a non-ASCII byte.
	const highBits = 0x80808080

	// Fast path: consume the string in 8-byte chunks.
	for ; len(s) >= 8; s = s[8:] {
		// Each chunk is assembled as two 32-bit words so the same code works
		// on 32 and 64 bit platforms.
		lo := uint32(s[0]) | uint32(s[1])<<8 | uint32(s[2])<<16 | uint32(s[3])<<24
		hi := uint32(s[4]) | uint32(s[5])<<8 | uint32(s[6])<<16 | uint32(s[7])<<24
		if (lo|hi)&highBits != 0 {
			// Some byte in this chunk is >= RuneSelf.
			return false
		}
	}

	// Fewer than 8 bytes remain; check them individually.
	for i := 0; i < len(s); i++ {
		if s[i] >= utf8.RuneSelf {
			return false
		}
	}
	return true
}

// validateASCII calls the C function validate_ascii with the pointer p and
// the length n, and converts its result to a Go bool.
//
// Callers in this file pass a pointer to the first byte of the input and the
// input's length.
func validateASCII(p *byte, n int) bool {
	data := (*C.char)(unsafe.Pointer(p))
	size := C.size_t(n)
	ok := C.validate_ascii(data, size)
	return bool(ok)
}

// validateUTF8 calls the C function validate_utf8 with the pointer p and the
// length n, and converts its result to a Go bool.
//
// Callers in this file pass a pointer to the first byte of the input and the
// input's length.
func validateUTF8(p *byte, n int) bool {
	data := (*C.char)(unsafe.Pointer(p))
	size := C.size_t(n)
	ok := C.validate_utf8(data, size)
	return bool(ok)
}