awesome/widgets/awesompd/utf8.lua

-- Provides UTF-8 aware string functions implemented in pure Lua, exposed on
-- the table returned by this module:
-- * utf8.charbytes(s, i)
-- * utf8.len(s)
-- * utf8.sub(s, i, j)
-- * utf8.replace(s, mapping)
--
-- utf8.len and utf8.sub are meant to behave as their non UTF-8 aware
-- counterparts (string.len and string.sub), with the exception that UTF-8
-- characters are used instead of bytes for all units.
--
-- Note: all validation has been removed due to awesome usage specifics.
   9 --[[
  10 Copyright (c) 2006-2007, Kyle Smith
  11 Modified by Alexander Yakushev, 2010-2013.
  12 All rights reserved.
  13
  14 Redistribution and use in source and binary forms, with or without
  15 modification, are permitted provided that the following conditions are met:
  16
  17 * Redistributions of source code must retain the above copyright notice,
  18 this list of conditions and the following disclaimer.
  19 * Redistributions in binary form must reproduce the above copyright
  20 notice, this list of conditions and the following disclaimer in the
  21 documentation and/or other materials provided with the distribution.
  22 * Neither the name of the author nor the names of its contributors may be
  23 used to endorse or promote products derived from this software without
  24 specific prior written permission.
  25
  26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  29 DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
  30 FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  31 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  32 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  33 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  34 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  35 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  36 --]]
  37
  38 -- ABNF from RFC 3629
  39 --
  40 -- UTF8-octets = *( UTF8-char )
  41 -- UTF8-char = UTF8-1 / UTF8-2 / UTF8-3 / UTF8-4
  42 -- UTF8-1 = %x00-7F
  43 -- UTF8-2 = %xC2-DF UTF8-tail
  44 -- UTF8-3 = %xE0 %xA0-BF UTF8-tail / %xE1-EC 2( UTF8-tail ) /
  45 -- %xED %x80-9F UTF8-tail / %xEE-EF 2( UTF8-tail )
  46 -- UTF8-4 = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) /
  47 -- %xF4 %x80-8F 2( UTF8-tail )
  48 -- UTF8-tail = %x80-BF
  49 --
  50
  51 -- returns the number of bytes used by the UTF-8 character at byte i in s
  52 -- also doubles as a UTF-8 character validator
  53
  54 local utf8 = {}
  55
  56 function utf8.charbytes (s, i)
  57    -- argument defaults
  58    i = i or 1
  59    local c = string.byte(s, i)
  60
  61    -- determine bytes needed for character, based on RFC 3629
  62    if c > 0 and c <= 127 then
  63       -- UTF8-1
  64       return 1
  65    elseif c >= 194 and c <= 223 then
  66       -- UTF8-2
  67       local c2 = string.byte(s, i + 1)
  68       return 2
  69    elseif c >= 224 and c <= 239 then
  70       -- UTF8-3
  71       local c2 = s:byte(i + 1)
  72       local c3 = s:byte(i + 2)
  73       return 3
  74    elseif c >= 240 and c <= 244 then
  75       -- UTF8-4
  76       local c2 = s:byte(i + 1)
  77       local c3 = s:byte(i + 2)
  78       local c4 = s:byte(i + 3)
  79       return 4
  80    end
  81 end
  82
  83 -- returns the number of characters in a UTF-8 string
  84 function utf8.len (s)
  85    local pos = 1
  86    local bytes = string.len(s)
  87    local len = 0
  88
  89    while pos <= bytes and len ~= chars do
  90       local c = string.byte(s,pos)
  91       len = len + 1
  92
  93       pos = pos + utf8.charbytes(s, pos)
  94    end
  95
  96    if chars ~= nil then
  97       return pos - 1
  98    end
  99
 100    return len
 101 end
 102
 103 -- functions identically to string.sub except that i and j are UTF-8 characters
 104 -- instead of bytes
 105 function utf8.sub (s, i, j)
 106    j = j or -1
 107
 108    if i == nil then
 109       return ""
 110    end
 111
 112    local pos = 1
 113    local bytes = string.len(s)
 114    local len = 0
 115
 116    -- only set l if i or j is negative
 117    local l = (i >= 0 and j >= 0) or utf8.len(s)
 118    local startChar = (i >= 0) and i or l + i + 1
 119    local endChar = (j >= 0) and j or l + j + 1
 120
 121    -- can't have start before end!
 122    if startChar > endChar then
 123       return ""
 124    end
 125
 126    -- byte offsets to pass to string.sub
 127    local startByte, endByte = 1, bytes
 128
 129    while pos <= bytes do
 130       len = len + 1
 131
 132       if len == startChar then
 133          startByte = pos
 134       end
 135
 136       pos = pos + utf8.charbytes(s, pos)
 137
 138       if len == endChar then
 139          endByte = pos - 1
 140          break
 141       end
 142    end
 143
 144    return string.sub(s, startByte, endByte)
 145 end
 146
 147 -- replace UTF-8 characters based on a mapping table
 148 function utf8.replace (s, mapping)
 149    local pos = 1
 150    local bytes = string.len(s)
 151    local charbytes
 152    local newstr = ""
 153
 154    while pos <= bytes do
 155       charbytes = utf8.charbytes(s, pos)
 156       local c = string.sub(s, pos, pos + charbytes - 1)
 157       newstr = newstr .. (mapping[c] or c)
 158       pos = pos + charbytes
 159    end
 160
 161    return newstr
 162 end
 163
 164 return utf8