?login_element?

Subversion Repositories NedoOS

Rev

Blame | Last modification | View Log | Download

  1. /*
  2. ** $Id: lutf8lib.c $
  3. ** Standard library for UTF-8 manipulation
  4. ** See Copyright Notice in lua.h
  5. */
  6.  
  7. #define lutf8lib_c
  8. #define LUA_LIB
  9.  
  10. #include "lprefix.h"
  11.  
  12.  
  13. #include <assert.h>
  14. #include <limits.h>
  15. #include <stdlib.h>
  16. #include <string.h>
  17.  
  18. #include "lua.h"
  19.  
  20. #include "lauxlib.h"
  21. #include "lualib.h"
  22.  
  23.  
/* largest valid Unicode code point; enforced only by strict decoding */
#define MAXUNICODE      0x10FFFFu

/* largest value accepted by the decoder and by 'pushutfchar' (31 bits) */
#define MAXUTF          0x7FFFFFFFu

/*
** Integer type for decoded UTF-8 values; MAXUTF needs 31 bits.
** 'unsigned int' is chosen when it has at least 31 value bits
** (that is what the test on UINT_MAX checks); otherwise the type
** falls back to 'unsigned long'.
*/
#if (UINT_MAX >> 30) >= 1
typedef unsigned int utfint;
#else
typedef unsigned long utfint;
#endif


/*
** True when the byte pointed to by 'p' has the bit pattern 10xxxxxx,
** i.e., it is a UTF-8 continuation byte. Note that 'p' is a pointer,
** not a byte value.
*/
#define iscont(p)       ((*(p) & 0xC0) == 0x80)
  39.  
  40.  
  41. /* from strlib */
  42. /* translate a relative string position: negative means back from end */
  43. static lua_Integer u_posrelat (lua_Integer pos, size_t len) {
  44.   if (pos >= 0) return pos;
  45.   else if (0u - (size_t)pos > len) return 0;
  46.   else return (lua_Integer)len + pos + 1;
  47. }
  48.  
  49.  
  50. /*
  51. ** Decode one UTF-8 sequence, returning NULL if byte sequence is
  52. ** invalid.  The array 'limits' stores the minimum value for each
  53. ** sequence length, to check for overlong representations. Its first
  54. ** entry forces an error for non-ascii bytes with no continuation
  55. ** bytes (count == 0).
  56. */
  57. static const char *utf8_decode (const char *s, utfint *val, int strict) {
  58.   static const utfint limits[] =
  59.         {~(utfint)0, 0x80, 0x800, 0x10000u, 0x200000u, 0x4000000u};
  60.   unsigned int c = (unsigned char)s[0];
  61.   utfint res = 0;  /* final result */
  62.   if (c < 0x80)  /* ascii? */
  63.     res = c;
  64.   else {
  65.     int count = 0;  /* to count number of continuation bytes */
  66.     for (; c & 0x40; c <<= 1) {  /* while it needs continuation bytes... */
  67.       unsigned int cc = (unsigned char)s[++count];  /* read next byte */
  68.       if ((cc & 0xC0) != 0x80)  /* not a continuation byte? */
  69.         return NULL;  /* invalid byte sequence */
  70.       res = (res << 6) | (cc & 0x3F);  /* add lower 6 bits from cont. byte */
  71.     }
  72.     res |= ((utfint)(c & 0x7F) << (count * 5));  /* add first byte */
  73.     if (count > 5 || res > MAXUTF || res < limits[count])
  74.       return NULL;  /* invalid byte sequence */
  75.     s += count;  /* skip continuation bytes read */
  76.   }
  77.   if (strict) {
  78.     /* check for invalid code points; too large or surrogates */
  79.     if (res > MAXUNICODE || (0xD800u <= res && res <= 0xDFFFu))
  80.       return NULL;
  81.   }
  82.   if (val) *val = res;
  83.   return s + 1;  /* +1 to include first byte */
  84. }
  85.  
  86.  
  87. /*
  88. ** utf8len(s [, i [, j [, lax]]]) --> number of characters that
  89. ** start in the range [i,j], or nil + current position if 's' is not
  90. ** well formed in that interval
  91. */
  92. static int utflen (lua_State *L) {
  93.   lua_Integer n = 0;  /* counter for the number of characters */
  94.   size_t len;  /* string length in bytes */
  95.   const char *s = luaL_checklstring(L, 1, &len);
  96.   lua_Integer posi = u_posrelat(luaL_optinteger(L, 2, 1), len);
  97.   lua_Integer posj = u_posrelat(luaL_optinteger(L, 3, -1), len);
  98.   int lax = lua_toboolean(L, 4);
  99.   luaL_argcheck(L, 1 <= posi && --posi <= (lua_Integer)len, 2,
  100.                    "initial position out of bounds");
  101.   luaL_argcheck(L, --posj < (lua_Integer)len, 3,
  102.                    "final position out of bounds");
  103.   while (posi <= posj) {
  104.     const char *s1 = utf8_decode(s + posi, NULL, !lax);
  105.     if (s1 == NULL) {  /* conversion error? */
  106.       luaL_pushfail(L);  /* return fail ... */
  107.       lua_pushinteger(L, posi + 1);  /* ... and current position */
  108.       return 2;
  109.     }
  110.     posi = s1 - s;
  111.     n++;
  112.   }
  113.   lua_pushinteger(L, n);
  114.   return 1;
  115. }
  116.  
  117.  
  118. /*
  119. ** codepoint(s, [i, [j [, lax]]]) -> returns codepoints for all
  120. ** characters that start in the range [i,j]
  121. */
  122. static int codepoint (lua_State *L) {
  123.   size_t len;
  124.   const char *s = luaL_checklstring(L, 1, &len);
  125.   lua_Integer posi = u_posrelat(luaL_optinteger(L, 2, 1), len);
  126.   lua_Integer pose = u_posrelat(luaL_optinteger(L, 3, posi), len);
  127.   int lax = lua_toboolean(L, 4);
  128.   int n;
  129.   const char *se;
  130.   luaL_argcheck(L, posi >= 1, 2, "out of bounds");
  131.   luaL_argcheck(L, pose <= (lua_Integer)len, 3, "out of bounds");
  132.   if (posi > pose) return 0;  /* empty interval; return no values */
  133.   if (pose - posi >= INT_MAX)  /* (lua_Integer -> int) overflow? */
  134.     return luaL_error(L, "string slice too long");
  135.   n = (int)(pose -  posi) + 1;  /* upper bound for number of returns */
  136.   luaL_checkstack(L, n, "string slice too long");
  137.   n = 0;  /* count the number of returns */
  138.   se = s + pose;  /* string end */
  139.   for (s += posi - 1; s < se;) {
  140.     utfint code;
  141.     s = utf8_decode(s, &code, !lax);
  142.     if (s == NULL)
  143.       return luaL_error(L, "invalid UTF-8 code");
  144.     lua_pushinteger(L, code);
  145.     n++;
  146.   }
  147.   return n;
  148. }
  149.  
  150.  
  151. static void pushutfchar (lua_State *L, int arg) {
  152.   lua_Unsigned code = (lua_Unsigned)luaL_checkinteger(L, arg);
  153.   luaL_argcheck(L, code <= MAXUTF, arg, "value out of range");
  154.   lua_pushfstring(L, "%U", (long)code);
  155. }
  156.  
  157.  
  158. /*
  159. ** utfchar(n1, n2, ...)  -> char(n1)..char(n2)...
  160. */
  161. static int utfchar (lua_State *L) {
  162.   int n = lua_gettop(L);  /* number of arguments */
  163.   if (n == 1)  /* optimize common case of single char */
  164.     pushutfchar(L, 1);
  165.   else {
  166.     int i;
  167.     luaL_Buffer b;
  168.     luaL_buffinit(L, &b);
  169.     for (i = 1; i <= n; i++) {
  170.       pushutfchar(L, i);
  171.       luaL_addvalue(&b);
  172.     }
  173.     luaL_pushresult(&b);
  174.   }
  175.   return 1;
  176. }
  177.  
  178.  
  179. /*
  180. ** offset(s, n, [i])  -> index where n-th character counting from
  181. **   position 'i' starts; 0 means character at 'i'.
  182. */
  183. static int byteoffset (lua_State *L) {
  184.   size_t len;
  185.   const char *s = luaL_checklstring(L, 1, &len);
  186.   lua_Integer n  = luaL_checkinteger(L, 2);
  187.   lua_Integer posi = (n >= 0) ? 1 : len + 1;
  188.   posi = u_posrelat(luaL_optinteger(L, 3, posi), len);
  189.   luaL_argcheck(L, 1 <= posi && --posi <= (lua_Integer)len, 3,
  190.                    "position out of bounds");
  191.   if (n == 0) {
  192.     /* find beginning of current byte sequence */
  193.     while (posi > 0 && iscont(s + posi)) posi--;
  194.   }
  195.   else {
  196.     if (iscont(s + posi))
  197.       return luaL_error(L, "initial position is a continuation byte");
  198.     if (n < 0) {
  199.        while (n < 0 && posi > 0) {  /* move back */
  200.          do {  /* find beginning of previous character */
  201.            posi--;
  202.          } while (posi > 0 && iscont(s + posi));
  203.          n++;
  204.        }
  205.      }
  206.      else {
  207.        n--;  /* do not move for 1st character */
  208.        while (n > 0 && posi < (lua_Integer)len) {
  209.          do {  /* find beginning of next character */
  210.            posi++;
  211.          } while (iscont(s + posi));  /* (cannot pass final '\0') */
  212.          n--;
  213.        }
  214.      }
  215.   }
  216.   if (n == 0)  /* did it find given character? */
  217.     lua_pushinteger(L, posi + 1);
  218.   else  /* no such character */
  219.     luaL_pushfail(L);
  220.   return 1;
  221. }
  222.  
  223.  
  224. static int iter_aux (lua_State *L, int strict) {
  225.   size_t len;
  226.   const char *s = luaL_checklstring(L, 1, &len);
  227.   lua_Unsigned n = (lua_Unsigned)lua_tointeger(L, 2);
  228.   if (n < len) {
  229.     while (iscont(s + n)) n++;  /* skip continuation bytes */
  230.   }
  231.   if (n >= len)  /* (also handles original 'n' being negative) */
  232.     return 0;  /* no more codepoints */
  233.   else {
  234.     utfint code;
  235.     const char *next = utf8_decode(s + n, &code, strict);
  236.     if (next == NULL)
  237.       return luaL_error(L, "invalid UTF-8 code");
  238.     lua_pushinteger(L, n + 1);
  239.     lua_pushinteger(L, code);
  240.     return 2;
  241.   }
  242. }
  243.  
  244.  
  245. static int iter_auxstrict (lua_State *L) {
  246.   return iter_aux(L, 1);
  247. }
  248.  
  249. static int iter_auxlax (lua_State *L) {
  250.   return iter_aux(L, 0);
  251. }
  252.  
  253.  
  254. static int iter_codes (lua_State *L) {
  255.   int lax = lua_toboolean(L, 2);
  256.   luaL_checkstring(L, 1);
  257.   lua_pushcfunction(L, lax ? iter_auxlax : iter_auxstrict);
  258.   lua_pushvalue(L, 1);
  259.   lua_pushinteger(L, 0);
  260.   return 3;
  261. }
  262.  
  263.  
/*
** pattern to match a single UTF-8 character: one byte in the ranges
** 0x00-0x7F or 0xC2-0xFD followed by any number of bytes in the range
** 0x80-0xBF. The literal contains an embedded '\0', so its length
** must be computed with 'sizeof', not 'strlen'.
*/
#define UTF8PATT        "[\0-\x7F\xC2-\xFD][\x80-\xBF]*"
  266.  
  267.  
/*
** Names and C functions that make up the library table built by
** 'luaopen_utf8'. The list ends with the {NULL, NULL} sentinel.
*/
static const luaL_Reg funcs[] = {
  {"offset", byteoffset},
  {"codepoint", codepoint},
  {"char", utfchar},
  {"len", utflen},
  {"codes", iter_codes},
  /* placeholders */
  {"charpattern", NULL},  /* not a function; 'luaopen_utf8' sets this field */
  {NULL, NULL}
};
  278.  
  279.  
  280. LUAMOD_API int luaopen_utf8 (lua_State *L) {
  281.   luaL_newlib(L, funcs);
  282.   lua_pushlstring(L, UTF8PATT, sizeof(UTF8PATT)/sizeof(char) - 1);
  283.   lua_setfield(L, -2, "charpattern");
  284.   return 1;
  285. }
  286.  
  287.