1 -- Provides UTF-8 aware string functions implemented in pure Lua:
3 -- * string.utf8sub(s, i, j)
5 -- All functions behave as their non UTF-8 aware counterparts with the exception
6 -- that UTF-8 characters are used instead of bytes for all units.
8 -- Note: all validations have been removed due to awesome usage specifics.
10 Copyright (c) 2006-2007, Kyle Smith
11 Modified by Alexander Yakushev, 2010-2013.
14 Redistribution and use in source and binary forms, with or without
15 modification, are permitted provided that the following conditions are met:
17 * Redistributions of source code must retain the above copyright notice,
18 this list of conditions and the following disclaimer.
19 * Redistributions in binary form must reproduce the above copyright
20 notice, this list of conditions and the following disclaimer in the
21 documentation and/or other materials provided with the distribution.
22 * Neither the name of the author nor the names of its contributors may be
23 used to endorse or promote products derived from this software without
24 specific prior written permission.
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
29 DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
30 FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
32 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
33 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
34 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
35 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
40 -- UTF8-octets = *( UTF8-char )
41 -- UTF8-char = UTF8-1 / UTF8-2 / UTF8-3 / UTF8-4
43 -- UTF8-2 = %xC2-DF UTF8-tail
44 -- UTF8-3 = %xE0 %xA0-BF UTF8-tail / %xE1-EC 2( UTF8-tail ) /
45 -- %xED %x80-9F UTF8-tail / %xEE-EF 2( UTF8-tail )
46 -- UTF8-4 = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) /
47 -- %xF4 %x80-8F 2( UTF8-tail )
48 -- UTF8-tail = %x80-BF
51 -- returns the number of bytes used by the UTF-8 character at byte i in s
52 -- also doubles as a UTF-8 character validator
-- s: string to inspect; i: byte offset of the character's first (lead) byte.
-- The lead byte alone selects one of the four branches below; the numeric
-- ranges are the decimal forms of the lead-byte ranges in the UTF-8 grammar.
-- NOTE(review): the file header states that validations were removed, and no
-- check of the continuation bytes is visible in this excerpt; confirm whether
-- the "validator" remark above still applies.
-- NOTE(review): the return statements and closing `end`s of this function
-- are not among the lines shown here.
56 function utf8.charbytes (s, i)
-- lead byte of the character starting at byte offset i
59 local c = string.byte(s, i)
61 -- determine bytes needed for character, based on RFC 3629
-- 1..127: single-byte (ASCII) character; a zero byte does not match
62 if c > 0 and c <= 127 then
-- 194..223 (0xC2-0xDF): lead byte of a two-byte sequence (UTF8-2)
65 elseif c >= 194 and c <= 223 then
-- byte that follows the lead byte
67 local c2 = string.byte(s, i + 1)
-- 224..239 (0xE0-0xEF): lead byte of a three-byte sequence (UTF8-3)
69 elseif c >= 224 and c <= 239 then
71 local c2 = s:byte(i + 1)
72 local c3 = s:byte(i + 2)
-- 240..244 (0xF0-0xF4): lead byte of a four-byte sequence (UTF8-4)
74 elseif c >= 240 and c <= 244 then
76 local c2 = s:byte(i + 1)
77 local c3 = s:byte(i + 2)
78 local c4 = s:byte(i + 3)
83 -- returns the number of characters in a UTF-8 string
-- NOTE(review): the function header and the declarations of pos, len and
-- chars are not among the lines shown here; utf8.sub calls utf8.len(s),
-- which is presumably this function. Confirm against the full file.
-- size of s in bytes; upper bound for the scan position
86 local bytes = string.len(s)
-- scan until pos passes the last byte of s, or until len equals chars
89 while pos <= bytes and len ~= chars do
-- NOTE(review): c is not read by any other line visible in this excerpt
90 local c = string.byte(s,pos)
-- advance pos by the byte count that utf8.charbytes reports for the
-- character starting at pos
93 pos = pos + utf8.charbytes(s, pos)
103 -- functions identically to string.sub except that i and j are UTF-8 characters
-- s: source string; i, j: start and end positions counted in characters.
-- A negative position p is converted to l + p + 1, where l is the character
-- count of s, so -1 designates the last character.
-- NOTE(review): the declarations of pos and len, and the statements guarded
-- by the conditions below, are not among the lines shown here.
105 function utf8.sub (s, i, j)
-- size of s in bytes; bounds the scan and is the default end offset
113 local bytes = string.len(s)
116 -- only set l if i or j is negative
-- when both positions are non-negative the left operand is true, so `or`
-- never evaluates utf8.len(s) and l holds the boolean true; that is harmless
-- because the two lines below read l only for a negative position
117 local l = (i >= 0 and j >= 0) or utf8.len(s)
118 local startChar = (i >= 0) and i or l + i + 1
119 local endChar = (j >= 0) and j or l + j + 1
121 -- can't have start before end!
122 if startChar > endChar then
126 -- byte offsets to pass to string.sub
-- the defaults span the whole string: first byte through last byte
127 local startByte, endByte = 1, bytes
-- scan s until pos passes its last byte
129 while pos <= bytes do
132 if len == startChar then
-- advance pos by the byte count that utf8.charbytes reports for the
-- character starting at pos
136 pos = pos + utf8.charbytes(s, pos)
138 if len == endChar then
-- the slice itself is taken by byte offsets
144 return string.sub(s, startByte, endByte)
147 -- replace UTF-8 characters based on a mapping table
148 function utf8.replace (s, mapping)
150 local bytes = string.len(s)
154 while pos <= bytes do
155 charbytes = utf8.charbytes(s, pos)
156 local c = string.sub(s, pos, pos + charbytes - 1)
157 newstr = newstr .. (mapping[c] or c)
158 pos = pos + charbytes