Documentation

theorem Char.utf8Size_eq_three_iff {c : Char} :

c.utf8Size = 3 ↔ 2047 < c.val ∧ c.val ≤ 65535

theorem Char.utf8Size_eq_four_iff {c : Char} :

c.utf8Size = 4 ↔ 65535 < c.val

theorem Char.utf8Size_pos (c : Char) :

0 < c.utf8Size

theorem Char.utf8Size_le_four (c : Char) :

c.utf8Size ≤ 4

theorem Char.utf8Size_eq (c : Char) :

c.utf8Size = 1 ∨ c.utf8Size = 2 ∨ c.utf8Size = 3 ∨ c.utf8Size = 4

`utf8EncodeChar` #

`utf8EncodeChar` low-level API #

def String.utf8EncodeCharFast (c : Char) :

List UInt8

Returns the sequence of bytes in a character's UTF-8 encoding.

Instances For

@[csimp]

theorem String.utf8EncodeChar_eq_utf8EncodeCharFast :

utf8EncodeChar = utf8EncodeCharFast

@[simp]

theorem String.length_utf8EncodeChar (c : Char) :

(utf8EncodeChar c).length = c.utf8Size

theorem String.utf8EncodeChar_eq_singleton {c : Char} :

c.utf8Size = 1 → utf8EncodeChar c = [c.val.toUInt8]

theorem String.utf8EncodeChar_eq_cons_cons {c : Char} :

c.utf8Size = 2 → utf8EncodeChar c = [(c.val >>> 6).toUInt8 &&& 31 ||| 192, c.val.toUInt8 &&& 63 ||| 128]

theorem String.utf8EncodeChar_eq_cons_cons_cons {c : Char} :

c.utf8Size = 3 → utf8EncodeChar c = [(c.val >>> 12).toUInt8 &&& 15 ||| 224, (c.val >>> 6).toUInt8 &&& 63 ||| 128, c.val.toUInt8 &&& 63 ||| 128]

theorem String.utf8EncodeChar_eq_cons_cons_cons_cons {c : Char} :

c.utf8Size = 4 → utf8EncodeChar c = [(c.val >>> 18).toUInt8 &&& 7 ||| 240, (c.val >>> 12).toUInt8 &&& 63 ||| 128, (c.val >>> 6).toUInt8 &&& 63 ||| 128, c.val.toUInt8 &&& 63 ||| 128]

`utf8EncodeChar` BitVec API #

Size one #

Size two #

Size three #

Size four #

`parseFirstByte` #

`parseFirstByte` definition #

inductive ByteArray.utf8DecodeChar?.FirstByte :

Type

invalid : FirstByte
done : FirstByte
oneMore : FirstByte
twoMore : FirstByte
threeMore : FirstByte

Instances For

@[inline]

def ByteArray.utf8DecodeChar?.parseFirstByte (b : UInt8) :

FirstByte

Instances For

`parseFirstByte` low-level API #

`parseFirstByte` BitVec API #

Size one #

Size two #

Size three #

Size four #

`isInvalidContinuationByte` definition & API #

@[inline]

def ByteArray.utf8DecodeChar?.isInvalidContinuationByte (b : UInt8) :

Instances For

`parseFirstByte`, `isInvalidContinuationByte` and `utf8EncodeChar` #

`assemble₁` #

@[inline]

def ByteArray.utf8DecodeChar?.assemble₁ (w : UInt8) (h : parseFirstByte w = FirstByte.done) :

Instances For

@[inline]

def ByteArray.utf8DecodeChar?.verify₁ {w : UInt8} (_w : UInt8) (_h : parseFirstByte w = FirstByte.done) :

Instances For

`assemble₂` #

@[inline]

def ByteArray.utf8DecodeChar?.assemble₂Unchecked (w x : UInt8) :

UInt32

Instances For

@[inline]

def ByteArray.utf8DecodeChar?.assemble₂ (w x : UInt8) :

Instances For

@[inline]

def ByteArray.utf8DecodeChar?.verify₂ (w x : UInt8) :

Instances For

`assemble₃` #

@[inline]

def ByteArray.utf8DecodeChar?.assemble₃Unchecked (w x y : UInt8) :

UInt32

Instances For

@[inline]

def ByteArray.utf8DecodeChar?.assemble₃ (w x y : UInt8) :

Instances For

@[inline]

def ByteArray.utf8DecodeChar?.verify₃ (w x y : UInt8) :

Instances For

`assemble₄` #

@[inline]

def ByteArray.utf8DecodeChar?.assemble₄Unchecked (w x y z : UInt8) :

UInt32

Instances For

@[inline]

def ByteArray.utf8DecodeChar?.assemble₄ (w x y z : UInt8) :

Instances For

@[inline]

def ByteArray.utf8DecodeChar?.verify₄ (w x y z : UInt8) :

Instances For

@[inline]

def ByteArray.utf8DecodeChar? (bytes : ByteArray) (i : Nat) :

Option Char

Decodes and returns the Char whose UTF-8 encoding begins at i in bytes.

Returns none if i is not the start of a valid UTF-8 encoding of a character.

Instances For

@[inline]

def ByteArray.validateUTF8At (bytes : ByteArray) (i : Nat) :

Bool

Instances For

`utf8DecodeChar?` low-level API #

Main theorems #

ByteArray.utf8DecodeChar?_utf8EncodeChar_append and String.toByteArray_utf8EncodeChar_of_utf8DecodeChar?_eq_some are the two main results; together they imply that UTF-8 encoding and decoding are inverse to each other.

theorem ByteArray.utf8DecodeChar?_utf8EncodeChar_append {b : ByteArray} {c : Char} :

((String.utf8EncodeChar c).toByteArray ++ b).utf8DecodeChar? 0 = some c

theorem String.toByteArray_utf8EncodeChar_of_utf8DecodeChar?_eq_some {b : ByteArray} {c : Char} (h : b.utf8DecodeChar? 0 = some c) :

(utf8EncodeChar c).toByteArray = b.extract 0 c.utf8Size

theorem ByteArray.validateUTF8At_eq_isSome_utf8DecodeChar? {b : ByteArray} {i : Nat} :

b.validateUTF8At i = (b.utf8DecodeChar? i).isSome

Corollaries #

theorem ByteArray.eq_of_utf8DecodeChar?_eq_some {b : ByteArray} {c : Char} (h : b.utf8DecodeChar? 0 = some c) :

b = (String.utf8EncodeChar c).toByteArray ++ b.extract c.utf8Size b.size

theorem ByteArray.exists_of_utf8DecodeChar?_eq_some {b : ByteArray} {c : Char} (h : b.utf8DecodeChar? 0 = some c) :

∃ (l : ByteArray), b = (String.utf8EncodeChar c).toByteArray ++ l

theorem ByteArray.utf8DecodeChar?_eq_utf8DecodeChar?_extract {b : ByteArray} {i : Nat} :

b.utf8DecodeChar? i = (b.extract i b.size).utf8DecodeChar? 0

theorem ByteArray.le_size_of_utf8DecodeChar?_eq_some {b : ByteArray} {i : Nat} {c : Char} (h : b.utf8DecodeChar? i = some c) :

i + c.utf8Size ≤ b.size

theorem ByteArray.lt_size_of_isSome_utf8DecodeChar? {b : ByteArray} {i : Nat} (h : (b.utf8DecodeChar? i).isSome = true) :

i < b.size

theorem ByteArray.lt_size_of_validateUTF8At {b : ByteArray} {i : Nat} :

b.validateUTF8At i = true → i < b.size

theorem ByteArray.utf8DecodeChar?_append_eq_some {b : ByteArray} {i : Nat} {c : Char} (h : b.utf8DecodeChar? i = some c) (b' : ByteArray) :

(b ++ b').utf8DecodeChar? i = some c

theorem ByteArray.isSome_utf8DecodeChar?_append {b : ByteArray} {i : Nat} (h : (b.utf8DecodeChar? i).isSome = true) (b' : ByteArray) :

((b ++ b').utf8DecodeChar? i).isSome = true

@[inline]

def ByteArray.utf8DecodeChar (bytes : ByteArray) (i : Nat) (h : (bytes.utf8DecodeChar? i).isSome = true) :

Char

Decodes and returns the Char whose UTF-8 encoding begins at i in bytes.

This function requires a proof that there is, in fact, a valid Char at i. utf8DecodeChar? is an alternative function that returns Option Char instead of requiring a proof ahead of time.

Instances For