This repository was archived on 2024-06-20. You can view files and clone it, but you cannot make any changes to its state, such as pushing or creating new issues, pull requests, or comments.
coffee.pygments/tests/examplefiles/julia/string.jl
jmert ffac8bde66
Update lists of operators/keywords in Julia and expand/refine highlighting (#1715)
* [julia] Update operators, keywords, and literal lists

* [julia] Support symbol macros

* [julia] Parse '..' operator juxtaposed with integers

* [julia] Identify Symbol literals

* [julia] Consume strings/commands faster & add triple-quoted command

* [julia] Support identifying operators with custom suffixes

* [julia] Add parsing for raw strings

* [julia] Share definition of interpolation

* [julia] Identify escaped ` and $ in commands

* [julia] Support non-standard string and command literals with flags

* [julia] Support variable names with interior exclamations

* [julia] Fix matching floats starting with decimal

* [julia] Compress nearly duplicate number matches with optional group

* [julia] Match double-underscored float literal

* [julia] Match hex float literals

* [julia] Test more non-numerical literal expressions

* [julia] Tag types in type contexts

* [julia] Identify console via `julia-repl` as well

* [julia] Be more conservative in identifying symbols

* [julia] Update example file to v1.6 `base/strings/string.jl`

* Address one CI failure

* Switch to non-emoji Unicode category So example

Hopefully fixes pypy3 CI failure

* fixup: remove duplicate operators already in DOTTED_OPERATORS_LIST

* [julia] Fix backslash operator

* [julia] List `true`, `false` with builtin names, not keywords
2021-03-05 08:31:42 +01:00

364 lines
12 KiB
Julia
Raw Permalink Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# This file is a part of Julia. License is MIT: https://julialang.org/license
"""
    StringIndexError(str, i)

Exception raised when the string `str` is accessed at an index `i` that is not
a valid index into it. The offending string and index are stored in the
`string` and `index` fields.
"""
struct StringIndexError <: Exception
    string::AbstractString
    index::Integer
end
# Throw a `StringIndexError` for string `s` at index `i` (converted to `Int`).
# `@noinline` keeps this throwing helper from being inlined into its callers.
@noinline function string_index_err(s::AbstractString, i::Integer)
    throw(StringIndexError(s, Int(i)))
end
# Print a description of `exc` to `io`. When the bad index lies within the
# code-unit range of the string, the message also lists the valid index at or
# before it and, if one exists inside the string, the valid index after that.
function Base.showerror(io::IO, exc::StringIndexError)
    s = exc.string
    print(io, "StringIndexError: ", "invalid index [$(exc.index)]")
    # No hint is possible for indices outside the string's code units.
    firstindex(s) <= exc.index <= ncodeunits(s) || return nothing
    iprev = thisind(s, exc.index)
    inext = nextind(s, iprev)
    if inext > ncodeunits(s)
        print(io, ", valid nearby index [$iprev]=>'$(s[iprev])'")
    else
        print(io, ", valid nearby indices [$iprev]=>'$(s[iprev])', [$inext]=>'$(s[inext])'")
    end
    return nothing
end
# Alias for the two byte-vector types: `Vector{UInt8}` or `Vector{Int8}`.
const ByteArray = Union{Vector{UInt8},Vector{Int8}}
# Return `true` when `lo <= b <= hi` for integers of one common type `T`.
# The two comparisons are joined with the non-short-circuiting `&`, so both are
# always evaluated.
@inline between(b::T, lo::T, hi::T) where {T<:Integer} = (lo <= b) & (b <= hi)
## constructors and conversions ##
# String constructor docstring from boot.jl, workaround for #16730
# and the unavailability of @doc in boot.jl context.
"""
    String(v::AbstractVector{UInt8})

Create a new `String` object from a byte vector `v` containing UTF-8 encoded
characters. If `v` is `Vector{UInt8}` it will be truncated to zero length and
future modification of `v` cannot affect the contents of the resulting string.
To avoid truncation of `Vector{UInt8}` data, use `String(copy(v))`; for other
`AbstractVector` types, `String(v)` already makes a copy.

When possible, the memory of `v` will be used without copying when the `String`
object is created. This is guaranteed to be the case for byte vectors returned
by [`take!`](@ref) on a writable [`IOBuffer`](@ref) and by calls to
[`read(io, nb)`](@ref). This allows zero-copy conversion of I/O data to strings.
In other cases, `Vector{UInt8}` data may be copied, but `v` is truncated anyway
to guarantee consistent behavior.
"""
function String(v::AbstractVector{UInt8})
    # Copy into a freshly allocated `StringVector`, then convert that buffer.
    buf = StringVector(length(v))
    return String(copyto!(buf, v))
end

# `Vector{UInt8}` is handed straight to the runtime's `jl_array_to_string`.
function String(v::Vector{UInt8})
    return ccall(:jl_array_to_string, Ref{String}, (Any,), v)
end
"""
    unsafe_string(p::Ptr{UInt8}, [length::Integer])

Copy a string from the address of a C-style (NUL-terminated) string encoded as UTF-8.
(The pointer can be safely freed afterwards.) If `length` is specified
(the length of the data in bytes), the string does not have to be NUL-terminated.

This function is labeled "unsafe" because it will crash if `p` is not
a valid memory address to data of the requested length.

Throws an `ArgumentError` when `p` is `C_NULL`.
"""
function unsafe_string(p::Union{Ptr{UInt8},Ptr{Int8}}, len::Integer)
    if p == C_NULL
        throw(ArgumentError("cannot convert NULL to string"))
    end
    return ccall(:jl_pchar_to_string, Ref{String}, (Ptr{UInt8}, Int), p, len)
end

function unsafe_string(p::Union{Ptr{UInt8},Ptr{Int8}})
    if p == C_NULL
        throw(ArgumentError("cannot convert NULL to string"))
    end
    return ccall(:jl_cstr_to_string, Ref{String}, (Ptr{UInt8},), p)
end
# Allocate a `String` of `n` code units via the runtime's `jl_alloc_string`.
function _string_n(n::Integer)
    return ccall(:jl_alloc_string, Ref{String}, (Csize_t,), n)
end
"""
    String(s::AbstractString)

Convert a string to a contiguous byte array representation encoded as UTF-8 bytes.
This representation is often appropriate for passing strings to C.
"""
function String(s::AbstractString)
    return print_to_string(s)
end
# Build a `String` from the bytes behind a `Symbol`.
@pure function String(s::Symbol)
    return unsafe_string(unsafe_convert(Ptr{UInt8}, s))
end

# Obtain a `Vector{UInt8}` for `s` from the runtime's `jl_string_to_array`.
function unsafe_wrap(::Type{Vector{UInt8}}, s::String)
    return ccall(:jl_string_to_array, Ref{Vector{UInt8}}, (Any,), s)
end

# Copy the code units of a `String` into a newly allocated byte vector.
function Vector{UInt8}(s::CodeUnits{UInt8,String})
    bytes = Vector{UInt8}(undef, length(s))
    return copyto!(bytes, s)
end

function Vector{UInt8}(s::String)
    return Vector{UInt8}(codeunits(s))
end

function Array{UInt8}(s::String)
    return Vector{UInt8}(codeunits(s))
end

# A `CodeUnits` wrapper over a `String` converts back by returning its `s` field.
function String(s::CodeUnits{UInt8,String})
    return s.s
end
## low-level functions ##
# Pointer to the first byte of `s`.
function pointer(s::String)
    return unsafe_convert(Ptr{UInt8}, s)
end

# Pointer to the `i`-th byte of `s` (1-based); no bounds checking is done here.
function pointer(s::String, i::Integer)
    offset = Int(i)::Int
    return pointer(s) + offset - 1
end

# Number of code units of a `String`, taken from `Core.sizeof`.
@pure function ncodeunits(s::String)
    return Core.sizeof(s)
end

# The code unit type of `String` is `UInt8`.
function codeunit(s::String)
    return UInt8
end
# Load the `i`-th code unit of `s`. The index is validated with `checkbounds`
# unless bounds checks are elided; `s` is kept rooted while its memory is read.
@inline function codeunit(s::String, i::Integer)
    @boundscheck checkbounds(s, i)
    return GC.@preserve s unsafe_load(pointer(s, i))
end
## comparison ##
# Compare the first `len` bytes of `a` and `b` with C `memcmp`, returning its
# result converted to `Int` (negative, zero, or positive).
function _memcmp(a::Union{Ptr{UInt8},AbstractString}, b::Union{Ptr{UInt8},AbstractString}, len)
    nbytes = len % Csize_t
    outcome = ccall(:memcmp, Cint, (Ptr{UInt8}, Ptr{UInt8}, Csize_t), a, b, nbytes)
    return outcome % Int
end
# Three-way comparison of two `String`s: compare the shared byte prefix with
# `_memcmp`, and fall back to comparing the byte lengths when the prefixes match.
function cmp(a::String, b::String)
    na, nb = sizeof(a), sizeof(b)
    order = _memcmp(a, b, min(na, nb))
    if order < 0
        return -1
    elseif order > 0
        return +1
    else
        return cmp(na, nb)
    end
end
# Equality of two `String`s: identical objects are equal immediately; otherwise
# the byte sizes must match and the bytes must compare equal under `_memcmp`.
function ==(a::String, b::String)
    if pointer_from_objref(a) == pointer_from_objref(b)
        return true
    end
    nbytes = sizeof(a)
    nbytes == sizeof(b) || return false
    return 0 == _memcmp(a, b, nbytes)
end
# The minimum `String` value is the empty string.
function typemin(::Type{String})
    return ""
end

# For a `String` instance, defer to the type-level method.
function typemin(::String)
    return typemin(String)
end
## thisind, nextind ##
@propagate_inbounds function thisind(s::String, i::Int)
    return _thisind_str(s, i)
end

# s should be String or SubString{String}
#
# Return the index of the code unit that starts the character containing code
# unit `i`. `0` and `ncodeunits(s) + 1` are returned unchanged; any other index
# outside `1:ncodeunits(s)` throws a `BoundsError` while bounds checks are on.
@inline function _thisind_str(s, i::Int)
    i == 0 && return 0
    len = ncodeunits(s)
    i == len + 1 && return i
    @boundscheck between(i, 1, len) || throw(BoundsError(s, i))
    # Look back over at most three bytes. Each step proceeds only while the byte
    # just read has the continuation pattern (top bits `10`) and a byte precedes it.
    @inbounds cu = codeunit(s, i)
    (cu & 0xc0 == 0x80) & (i - 1 > 0) || return i
    @inbounds cu = codeunit(s, i - 1)
    between(cu, 0xc0, 0xf7) && return i - 1
    (cu & 0xc0 == 0x80) & (i - 2 > 0) || return i
    @inbounds cu = codeunit(s, i - 2)
    between(cu, 0xe0, 0xf7) && return i - 2
    (cu & 0xc0 == 0x80) & (i - 3 > 0) || return i
    @inbounds cu = codeunit(s, i - 3)
    between(cu, 0xf0, 0xf7) && return i - 3
    return i
end
@propagate_inbounds nextind(s::String, i::Int) = _nextind_str(s, i)

# s should be String or SubString{String}
#
# Return the index following the character that contains code unit `i`.
# `i == 0` yields `1`; any other index outside `1:ncodeunits(s)` throws a
# `BoundsError` while bounds checks are on.
@inline function _nextind_str(s, i::Int)
    i == 0 && return 1
    n = ncodeunits(s)
    @boundscheck between(i, 1, n) || throw(BoundsError(s, i))
    @inbounds l = codeunit(s, i)
    # Bytes below 0x80 or at/above 0xf8 are treated as one-byte characters.
    (l < 0x80) | (0xf8 <= l) && return i+1
    if l < 0xc0
        # `i` points at a continuation byte: find the start of its character and,
        # if that start lies before `i`, step forward from there instead.
        istart = @inbounds thisind(s, i)
        return istart < i ? @inbounds(nextind(s, istart)) : i+1
    end
    # first continuation byte
    (i += 1) > n && return i
    @inbounds b = codeunit(s, i)
    b & 0xc0 != 0x80 && return i
    ((i += 1) > n) | (l < 0xe0) && return i
    # second continuation byte
    @inbounds b = codeunit(s, i)
    b & 0xc0 != 0x80 && return i
    ((i += 1) > n) | (l < 0xf0) && return i
    # third continuation byte
    @inbounds b = codeunit(s, i)
    return ifelse(b & 0xc0 != 0x80, i, i+1)
end
## checking UTF-8 & ASCII validity ##
# Classify the bytes of `s` with the runtime's `u8_isvalid`. Result codes:
# 0: neither valid ASCII nor UTF-8
# 1: valid ASCII
# 2: valid UTF-8
function byte_string_classify(s::Union{String,Vector{UInt8},FastContiguousSubArray{UInt8,1,Vector{UInt8}}})
    return ccall(:u8_isvalid, Int32, (Ptr{UInt8}, Int), s, sizeof(s))
end
# `s` holds valid `String` data when `byte_string_classify` reports anything
# other than 0, i.e. valid ASCII (1) or valid UTF-8 (2).
isvalid(::Type{String}, s::Union{Vector{UInt8},FastContiguousSubArray{UInt8,1,Vector{UInt8}},String}) = byte_string_classify(s) != 0
# Validity of an existing `String`'s contents.
isvalid(s::String) = isvalid(String, s)
# A continuation byte has its top two bits equal to `10`.
function is_valid_continuation(c)
    return c & 0xc0 == 0x80
end
## required core functionality ##
# Iteration protocol for `String`: return `(char, next_index)` for the character
# starting at code unit `i`, or `nothing` once `i` is outside `1:ncodeunits(s)`.
@inline function iterate(s::String, i::Int=firstindex(s))
    # The unsigned wrap-around turns `i < 1` into a huge value, so this one
    # comparison rejects indices on both sides of the valid range.
    (i % UInt) - 1 < ncodeunits(s) || return nothing
    lead = @inbounds codeunit(s, i)
    bits = UInt32(lead) << 24
    if between(lead, 0x80, 0xf7)
        # Possible multi-byte sequence: defer to the slow path.
        return iterate_continued(s, i, bits)
    end
    return reinterpret(Char, bits), i+1
end
# Slow path of `iterate(s::String, i)`, reached when the byte at `i` lies in
# 0x80:0xf7. `u` arrives with that byte in its top 8 bits; each continuation
# byte that follows is OR-ed into the next lower 8 bits. Returns the tuple
# `(reinterpret(Char, u), i)` where `i` is the index after the bytes consumed.
function iterate_continued(s::String, i::Int, u::UInt32)
# first byte below 0xc0: emit it on its own and step past it
u < 0xc0000000 && (i += 1; @goto ret)
n = ncodeunits(s)
# first continuation byte
(i += 1) > n && @goto ret
@inbounds b = codeunit(s, i)
# stop without consuming `b` when it is not a continuation byte
b & 0xc0 == 0x80 || @goto ret
u |= UInt32(b) << 16
# second continuation byte
# (only read when the first byte is at least 0xe0)
((i += 1) > n) | (u < 0xe0000000) && @goto ret
@inbounds b = codeunit(s, i)
b & 0xc0 == 0x80 || @goto ret
u |= UInt32(b) << 8
# third continuation byte
# (only read when the first byte is at least 0xf0)
((i += 1) > n) | (u < 0xf0000000) && @goto ret
@inbounds b = codeunit(s, i)
b & 0xc0 == 0x80 || @goto ret
u |= UInt32(b); i += 1
@label ret
return reinterpret(Char, u), i
end
# Return the character of `s` that starts at code unit `i`. Bytes outside
# 0x80:0xf7 become a `Char` directly; anything else goes through
# `getindex_continued`.
@propagate_inbounds function getindex(s::String, i::Int)
    lead = codeunit(s, i)
    bits = UInt32(lead) << 24
    if between(lead, 0x80, 0xf7)
        return getindex_continued(s, i, bits)
    end
    return reinterpret(Char, bits)
end
# Slow path of `getindex(s::String, i::Int)`, reached when the byte at `i` lies
# in 0x80:0xf7. `u` arrives with that byte in its top 8 bits; up to three
# continuation bytes are OR-ed into the lower bits. Returns the resulting `Char`,
# or raises via `string_index_err` when `i` is not a valid character index.
function getindex_continued(s::String, i::Int, u::UInt32)
# first byte below 0xc0: acceptable only if `i` is itself a valid index
if u < 0xc0000000
# called from `getindex` which checks bounds
@inbounds isvalid(s, i) && @goto ret
string_index_err(s, i)
end
n = ncodeunits(s)
(i += 1) > n && @goto ret
@inbounds b = codeunit(s, i) # cont byte 1
# stop as soon as a byte is not a continuation byte
b & 0xc0 == 0x80 || @goto ret
u |= UInt32(b) << 16
# a second continuation byte is only read when the first byte is at least 0xe0
((i += 1) > n) | (u < 0xe0000000) && @goto ret
@inbounds b = codeunit(s, i) # cont byte 2
b & 0xc0 == 0x80 || @goto ret
u |= UInt32(b) << 8
# a third continuation byte is only read when the first byte is at least 0xf0
((i += 1) > n) | (u < 0xf0000000) && @goto ret
@inbounds b = codeunit(s, i) # cont byte 3
b & 0xc0 == 0x80 || @goto ret
u |= UInt32(b)
@label ret
return reinterpret(Char, u)
end
# Ranges of any integer type are converted to `UnitRange{Int}` and re-indexed.
function getindex(s::String, r::UnitRange{<:Integer})
    return s[Int(first(r)):Int(last(r))]
end

# Extract the substring covering the characters whose start indices run from
# `first(r)` to `last(r)`. Both endpoints must be valid character indices;
# otherwise `string_index_err` is raised (while bounds checks are on).
@inline function getindex(s::String, r::UnitRange{Int})
    isempty(r) && return ""
    lo, hi = first(r), last(r)
    @boundscheck begin
        checkbounds(s, r)
        @inbounds isvalid(s, lo) || string_index_err(s, lo)
        @inbounds isvalid(s, hi) || string_index_err(s, hi)
    end
    # Extend the end so that every byte of the final character is included.
    stop = nextind(s, hi) - 1
    nbytes = stop - lo + 1
    out = _string_n(nbytes)
    GC.@preserve s out unsafe_copyto!(pointer(out), pointer(s, lo), nbytes)
    return out
end
# Number of characters in `s`: start from the code-unit count and let
# `length_continued` adjust it while scanning from index 1.
function length(s::String)
    nunits = ncodeunits(s)
    return length_continued(s, 1, nunits, nunits)
end
# Number of characters of `s` with start indices between `i` and `j`.
# While bounds checks are on, `i` must lie in `1:ncodeunits(s)+1` and `j` in
# `0:ncodeunits(s)`, otherwise a `BoundsError` is thrown. Returns 0 when `j < i`.
@inline function length(s::String, i::Int, j::Int)
    @boundscheck begin
        0 < i <= ncodeunits(s)+1 || throw(BoundsError(s, i))
        0 <= j < ncodeunits(s)+1 || throw(BoundsError(s, j))
    end
    j < i && return 0
    # Align `i` to the start of its character; `k` remembers the original index.
    @inbounds i, k = thisind(s, i), i
    # Initial count: one per code unit from the original index through `j`
    # (the `(i == k)` term compensates when `i` was already aligned);
    # `length_continued` then subtracts the continuation bytes it finds.
    c = j - i + (i == k)
    return length_continued(s, i, j, c)
end
# Refine a character count for the code units `i:n` of `s`.
# `c` is the starting count supplied by the caller; it is decremented once for
# each continuation byte that follows a lead byte in 0xc0:0xf7, and the
# adjusted count is returned.
@inline function length_continued(s::String, i::Int, n::Int, c::Int)
    i < n || return c
    @inbounds b = codeunit(s, i)
    @inbounds while true
        # Skip ahead until `b` is a lead byte of a multi-byte sequence.
        while true
            (i += 1) <= n || return c
            0xc0 <= b <= 0xf7 && break
            b = codeunit(s, i)
        end
        l = b
        b = codeunit(s, i) # cont byte 1
        c -= (x = b & 0xc0 == 0x80)
        # Only lead bytes >= 0xe0 can take a second continuation byte.
        x & (l >= 0xe0) || continue

        (i += 1) <= n || return c
        b = codeunit(s, i) # cont byte 2
        c -= (x = b & 0xc0 == 0x80)
        # Only lead bytes >= 0xf0 can take a third continuation byte.
        x & (l >= 0xf0) || continue

        (i += 1) <= n || return c
        b = codeunit(s, i) # cont byte 3
        c -= (b & 0xc0 == 0x80)
    end
end
## overload methods for efficiency ##
# `i` is a valid character index when it is in bounds and is the start of the
# character that contains it.
function isvalid(s::String, i::Int)
    checkbounds(Bool, s, i) || return false
    return thisind(s, i) == i
end
# Return `true` when every code unit of `s` is below 0x80.
function isascii(s::String)
    nunits = ncodeunits(s)
    @inbounds for idx = 1:nunits
        if codeunit(s, idx) >= 0x80
            return false
        end
    end
    return true
end
"""
    repeat(c::AbstractChar, r::Integer) -> String

Repeat a character `r` times. This can equivalently be accomplished by calling
[`c^r`](@ref :^(::Union{AbstractString, AbstractChar}, ::Integer)).

# Examples
```jldoctest
julia> repeat('A', 3)
"AAA"
```
"""
function repeat(c::AbstractChar, r::Integer) # fallback
    return repeat(Char(c), r)
end

# `Char` method: writes the character's encoded bytes `r` times straight into a
# freshly allocated string. Throws an `ArgumentError` for negative `r`.
function repeat(c::Char, r::Integer)
    r == 0 && return ""
    r < 0 && throw(ArgumentError("can't repeat a character $r times"))
    # Byte-swap the `Char` bits so the first encoded byte sits in the low 8 bits.
    bits = bswap(reinterpret(UInt32, c))
    # Number of bytes the character occupies (1 to 4).
    width = 4 - (leading_zeros(bits | 0xff) >> 3)
    out = _string_n(width*r)
    dst = pointer(out)
    GC.@preserve out if width == 1
        ccall(:memset, Ptr{Cvoid}, (Ptr{UInt8}, Cint, Csize_t), dst, bits % UInt8, r)
    elseif width == 2
        dst16 = reinterpret(Ptr{UInt16}, dst)
        for k = 1:r
            unsafe_store!(dst16, bits % UInt16, k)
        end
    elseif width == 3
        # Three-byte characters are stored one byte at a time.
        byte1 = (bits >> 0) % UInt8
        byte2 = (bits >> 8) % UInt8
        byte3 = (bits >> 16) % UInt8
        for k = 0:r-1
            unsafe_store!(dst, byte1, 3k + 1)
            unsafe_store!(dst, byte2, 3k + 2)
            unsafe_store!(dst, byte3, 3k + 3)
        end
    elseif width == 4
        dst32 = reinterpret(Ptr{UInt32}, dst)
        for k = 1:r
            unsafe_store!(dst32, bits, k)
        end
    end
    return out
end