validate utf-8 in swift

To check whether a string is validly encoded as UTF-8 in Swift, we can iterate over the string's `utf8` view and verify that every lead byte is followed by the expected number of continuation bytes.

Here is an implementation of a function that takes a string and returns a boolean value indicating if it is a valid UTF-8 encoded string:

main.swift
/// Returns whether the UTF-8 view of `input` is a well-formed UTF-8 byte sequence.
///
/// The check is delegated to `isValidUTF8(bytes:)`. A Swift `String` repairs
/// ill-formed input when it is created, so this is expected to return `true`
/// for any `String`; use `isValidUTF8(bytes:)` to validate raw, untrusted bytes.
///
/// - Parameter input: The string whose UTF-8 code units are inspected.
/// - Returns: `true` if every code unit belongs to a well-formed UTF-8 sequence.
func isValidUTF8(_ input: String) -> Bool {
    return isValidUTF8(bytes: input.utf8)
}

/// Returns whether `bytes` is a well-formed UTF-8 byte sequence.
///
/// Beyond checking the lead/continuation bit patterns, this rejects:
/// - overlong encodings (lead bytes `0xC0`/`0xC1`, `E0 80..9F`, `F0 80..8F`),
/// - encoded UTF-16 surrogates (`ED A0..BF`),
/// - values above U+10FFFF (`F4 90..BF` and lead bytes `0xF5...0xFF`),
/// - truncated sequences and stray continuation bytes.
///
/// - Parameter bytes: The raw bytes to validate. An empty sequence is valid.
/// - Returns: `true` if the whole sequence is well-formed UTF-8, otherwise `false`.
func isValidUTF8<Bytes: Sequence>(bytes: Bytes) -> Bool where Bytes.Element == UInt8 {
    let continuation: ClosedRange<UInt8> = 0x80...0xBF
    var iterator = bytes.makeIterator()

    while let lead = iterator.next() {
        // The second byte has a lead-specific range; that restriction is what
        // excludes overlong forms, surrogates, and values above U+10FFFF.
        let secondByteRange: ClosedRange<UInt8>
        // Number of plain continuation bytes that follow the second byte.
        let trailingCount: Int

        switch lead {
        case 0x00...0x7F:
            // Single-byte (ASCII) sequence: nothing more to consume.
            continue
        case 0xC2...0xDF:
            secondByteRange = continuation
            trailingCount = 0
        case 0xE0:
            secondByteRange = 0xA0...0xBF
            trailingCount = 1
        case 0xE1...0xEC, 0xEE...0xEF:
            secondByteRange = continuation
            trailingCount = 1
        case 0xED:
            secondByteRange = 0x80...0x9F
            trailingCount = 1
        case 0xF0:
            secondByteRange = 0x90...0xBF
            trailingCount = 2
        case 0xF1...0xF3:
            secondByteRange = continuation
            trailingCount = 2
        case 0xF4:
            secondByteRange = 0x80...0x8F
            trailingCount = 2
        default:
            // Stray continuation byte (0x80...0xBF), overlong lead (0xC0, 0xC1),
            // or a lead byte that can only encode values above U+10FFFF.
            return false
        }

        // Pulling from the iterator makes truncation a plain `nil`, so no index
        // is ever advanced past the end of the input.
        guard let second = iterator.next(), secondByteRange.contains(second) else {
            return false
        }
        for _ in 0..<trailingCount {
            guard let next = iterator.next(), continuation.contains(next) else {
                return false
            }
        }
    }

    return true
}
1799 chars
48 lines

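The function checks that every byte of the string belongs to a well-formed UTF-8 sequence. If all of them do, it returns `true`; otherwise, it returns `false`. Note that a Swift `String` always holds valid Unicode, so a check like this matters most when the bytes come from an untrusted source rather than from an existing `String`.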

gistlib by LogSnag