Add real implementation.
author drepper <drepper>
Sun, 14 Mar 2004 20:52:47 +0000 (20:52 +0000)
committer drepper <drepper>
Sun, 14 Mar 2004 20:52:47 +0000 (20:52 +0000)
string/strxfrm_l.c

index 264ab9b..44b6051 100644 (file)
@@ -1,4 +1,4 @@
-/* Copyright (C) 1995,96,97,2002 Free Software Foundation, Inc.
+/* Copyright (C) 1995,96,97,2002, 2004 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
    Written by Ulrich Drepper <drepper@gnu.org>, 1995.
 
    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
    02111-1307 USA.  */
 
-#define USE_IN_EXTENDED_LOCALE_MODEL   1
-#include <strxfrm.c>
+#include <assert.h>
+#include <langinfo.h>
+#include <locale.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/param.h>
 
+/* Default configuration, used only when STRING_TYPE has not been
+   defined before this point.  It selects `char' strings and the
+   matching string helper functions.  */
+#ifndef STRING_TYPE
+# define STRING_TYPE char
+# define USTRING_TYPE unsigned char
+# define STRXFRM __strxfrm_l
+# define STRCMP strcmp
+# define STRLEN strlen
+# define STPNCPY __stpncpy
+/* File included inside the body of STRXFRM; it is expected to supply
+   `findidx'.  */
+# define WEIGHT_H "../locale/weight.h"
+/* Pasted onto the _NL_COLLATE_* item names to pick the table set.  */
+# define SUFFIX        MB
+/* Wrapper applied to character literals.  */
+# define L(arg) arg
+#endif
+
+/* Token pasting in two steps so that macro arguments such as SUFFIX are
+   expanded before they are joined.  */
+#define CONCAT(a,b) CONCAT1(a,b)
+#define CONCAT1(a,b) a##b
+
+#include "../locale/localeinfo.h"
+
+
+#ifndef WIDE_CHAR_VERSION
+
+/* We need UTF-8 encoding of numbers.
+
+   Store the UTF-8 style encoding of VAL in BUF and return the number
+   of bytes written (1 to 6).  BUF must have room for six bytes; no
+   terminating NUL is stored.  Every value below 0x80 is stored as a
+   single byte.  */
+static int
+utf8_encode (char *buf, int val)
+{
+  int retval;
+
+  if (val < 0x80)
+    {
+      *buf = (char) val;
+      retval = 1;
+    }
+  else
+    {
+      int step;
+
+      /* Find the shortest sequence which can hold VAL: a sequence of
+         STEP bytes carries 5 * STEP + 1 payload bits.  */
+      for (step = 2; step < 6; ++step)
+        if ((val & (~(uint32_t) 0 << (5 * step + 1))) == 0)
+          break;
+      retval = step;
+
+      /* The lead byte starts with STEP one bits followed by a zero bit.
+         Shift an unsigned constant here: right-shifting the negative
+         value ~0xff would be implementation-defined.  */
+      *buf = (unsigned char) (0xff00u >> step);
+      --step;
+      /* Fill in the continuation bytes from the last one backwards,
+         six payload bits each.  */
+      do
+        {
+          buf[step] = 0x80 | (val & 0x3f);
+          val >>= 6;
+        }
+      while (--step > 0);
+      /* Whatever is left of VAL goes into the lead byte.  */
+      *buf |= val;
+    }
+
+  return retval;
+}
+#endif
+
+
+/* Transform the string SRC according to the LC_COLLATE data of locale L
+   and store the result in DEST.  Nothing is written at or beyond
+   DEST[N].
+
+   The return value is the number of elements the complete result
+   needs, not counting the terminating NUL.  If it is N or larger the
+   result did not fit and DEST does not hold a complete transformation.
+
+   If the locale defines no collation rules SRC is simply copied.
+   Otherwise one run of weights is produced per rule, with the runs
+   separated by \1 elements.  */
+size_t
+STRXFRM (STRING_TYPE *dest, const STRING_TYPE *src, size_t n, __locale_t l)
+{
+  struct locale_data *current = l->__locales[LC_COLLATE];
+  uint_fast32_t nrules = current->values[_NL_ITEM_INDEX (_NL_COLLATE_NRULES)].word;
+  /* We don't assign the following values right away since it might be
+     unnecessary in case there are no rules.  */
+  const unsigned char *rulesets;
+  const int32_t *table;
+  const USTRING_TYPE *weights;
+  const USTRING_TYPE *extra;
+  const int32_t *indirect;
+  uint_fast32_t pass;
+  size_t needed;
+  const USTRING_TYPE *usrc;
+  size_t srclen = STRLEN (src);
+  int32_t *idxarr;
+  unsigned char *rulearr;
+  size_t idxmax;
+  size_t idxcnt;
+  int use_malloc;
+
+#include WEIGHT_H
+
+  if (nrules == 0)
+    {
+      /* No collation rules: the transformation is the identity.  */
+      if (n != 0)
+        STPNCPY (dest, src, MIN (srclen + 1, n));
+
+      return srclen;
+    }
+
+  rulesets = (const unsigned char *)
+    current->values[_NL_ITEM_INDEX (_NL_COLLATE_RULESETS)].string;
+  table = (const int32_t *)
+    current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_TABLE,SUFFIX))].string;
+  weights = (const USTRING_TYPE *)
+    current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_WEIGHT,SUFFIX))].string;
+  extra = (const USTRING_TYPE *)
+    current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_EXTRA,SUFFIX))].string;
+  indirect = (const int32_t *)
+    current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_INDIRECT,SUFFIX))].string;
+  use_malloc = 0;
+
+  assert (((uintptr_t) table) % __alignof__ (table[0]) == 0);
+  assert (((uintptr_t) weights) % __alignof__ (weights[0]) == 0);
+  assert (((uintptr_t) extra) % __alignof__ (extra[0]) == 0);
+  assert (((uintptr_t) indirect) % __alignof__ (indirect[0]) == 0);
+
+  /* Handle an empty string as a special case.  */
+  if (srclen == 0)
+    {
+      if (n != 0)
+        *dest = L('\0');
+      return 0;
+    }
+
+  /* We need the elements of the string as unsigned values since they
+     are used as indices.  */
+  usrc = (const USTRING_TYPE *) src;
+
+  /* Perform the first pass over the string and while doing this find
+     and store the weights for each character.  Since we want this to
+     be as fast as possible we are using `alloca' to store the temporary
+     values.  But since there is no limit on the length of the string
+     we have to use `malloc' if the string is too long.  We should be
+     very conservative here.
+
+     NOTE(review): the size computation below is not checked for
+     overflow; confirm whether SRCLEN can get large enough on the
+     supported targets for this to matter.  */
+  if (! __libc_use_alloca (srclen))
+    {
+      idxarr = (int32_t *) malloc ((srclen + 1) * (sizeof (int32_t) + 1));
+
+      if (idxarr == NULL)
+        /* No memory.  Well, go with the stack then.
+
+           XXX Once this implementation is stable we will handle this
+           differently.  Instead of precomputing the indices we will
+           do this in time.  This means, though, that this happens for
+           every pass again.
+
+           NOTE(review): this path uses `alloca' for a length which
+           was just judged too large for the stack.  */
+        goto try_stack;
+
+      /* The rule bytes live in the same allocation, directly behind
+         the SRCLEN index entries.  Only derive the pointer once we
+         know the allocation succeeded.  */
+      rulearr = (unsigned char *) &idxarr[srclen];
+      use_malloc = 1;
+    }
+  else
+    {
+    try_stack:
+      idxarr = (int32_t *) alloca (srclen * sizeof (int32_t));
+      rulearr = (unsigned char *) alloca (srclen + 1);
+    }
+
+  /* Each value returned by `findidx' carries the rule set number in
+     the top eight bits and the index into WEIGHTS in the lower 24.  */
+  idxmax = 0;
+  do
+    {
+      int32_t tmp = findidx (&usrc);
+      rulearr[idxmax] = tmp >> 24;
+      idxarr[idxmax] = tmp & 0xffffff;
+
+      ++idxmax;
+    }
+  while (*usrc != L('\0'));
+
+  /* This element is only read, the value never used but to determine
+     another value which then is ignored.  */
+  rulearr[idxmax] = '\0';
+
+  /* Now the passes over the weights.  We now use the indices we found
+     before.  Every read of a weight sequence advances the element's
+     entry in IDXARR past that sequence, so the next pass finds the
+     sequence for the next level there.  */
+  needed = 0;
+  for (pass = 0; pass < nrules; ++pass)
+    {
+      /* Index of the first element of a pending run of elements with a
+         backward rule, or ~0ul if there is no such run.  */
+      size_t backw_stop = ~0ul;
+      int rule = rulesets[rulearr[0] * nrules + pass];
+      /* We assume that if a rule has defined `position' in one section
+         this is true for all of them.  */
+      int position = rule & sort_position;
+
+      if (position == 0)
+        {
+          for (idxcnt = 0; idxcnt < idxmax; ++idxcnt)
+            {
+              if ((rule & sort_forward) != 0)
+                {
+                  size_t len;
+
+                  if (backw_stop != ~0ul)
+                    {
+                      /* Handle the pushed elements now, last one
+                         first.  The index is decremented inside the
+                         loop so that the unsigned comparison also
+                         terminates when BACKW_STOP is zero.  */
+                      size_t backw;
+
+                      for (backw = idxcnt; backw > backw_stop; )
+                        {
+                          --backw;
+                          len = weights[idxarr[backw]++];
+
+                          if (needed + len < n)
+                            while (len-- > 0)
+                              dest[needed++] = weights[idxarr[backw]++];
+                          else
+                            {
+                              /* No more characters fit into the buffer.  */
+                              needed += len;
+                              idxarr[backw] += len;
+                            }
+                        }
+
+                      backw_stop = ~0ul;
+                    }
+
+                  /* Now handle the forward element.  */
+                  len = weights[idxarr[idxcnt]++];
+                  if (needed + len < n)
+                    while (len-- > 0)
+                      dest[needed++] = weights[idxarr[idxcnt]++];
+                  else
+                    {
+                      /* No more characters fit into the buffer.  */
+                      needed += len;
+                      idxarr[idxcnt] += len;
+                    }
+                }
+              else
+                {
+                  /* Remember where the backwards series started.  */
+                  if (backw_stop == ~0ul)
+                    backw_stop = idxcnt;
+                }
+
+              rule = rulesets[rulearr[idxcnt + 1] * nrules + pass];
+            }
+
+
+          if (backw_stop != ~0ul)
+            {
+              /* Handle the pushed elements now.  */
+              size_t backw;
+
+              backw = idxcnt;
+              while (backw > backw_stop)
+                {
+                  size_t len = weights[idxarr[--backw]++];
+
+                  if (needed + len < n)
+                    while (len-- > 0)
+                      dest[needed++] = weights[idxarr[backw]++];
+                  else
+                    {
+                      /* No more characters fit into the buffer.  */
+                      needed += len;
+                      idxarr[backw] += len;
+                    }
+                }
+            }
+        }
+      else
+        {
+          /* With the `position' rule every non-empty weight sequence
+             is preceded by VAL, which is one plus the number of
+             elements with an empty weight sequence seen since the
+             previous one was emitted.  */
+          int val = 1;
+#ifndef WIDE_CHAR_VERSION
+          char buf[7];
+          size_t buflen;
+#endif
+          size_t i;
+
+          for (idxcnt = 0; idxcnt < idxmax; ++idxcnt)
+            {
+              if ((rule & sort_forward) != 0)
+                {
+                  size_t len;
+
+                  if (backw_stop != ~0ul)
+                    {
+                      /* Handle the pushed elements now, last one
+                         first.  The index is decremented inside the
+                         loop so that the unsigned comparison also
+                         terminates when BACKW_STOP is zero.  */
+                      size_t backw;
+
+                      for (backw = idxcnt; backw > backw_stop; )
+                        {
+                          --backw;
+                          len = weights[idxarr[backw]++];
+                          if (len != 0)
+                            {
+#ifdef WIDE_CHAR_VERSION
+                              if (needed + 1 + len < n)
+                                {
+                                  dest[needed] = val;
+                                  for (i = 0; i < len; ++i)
+                                    dest[needed + 1 + i] =
+                                      weights[idxarr[backw] + i];
+                                }
+                              needed += 1 + len;
+#else
+                              buflen = utf8_encode (buf, val);
+                              if (needed + buflen + len < n)
+                                {
+                                  for (i = 0; i < buflen; ++i)
+                                    dest[needed + i] = buf[i];
+                                  for (i = 0; i < len; ++i)
+                                    dest[needed + buflen + i] =
+                                      weights[idxarr[backw] + i];
+                                }
+                              needed += buflen + len;
+#endif
+                              idxarr[backw] += len;
+                              val = 1;
+                            }
+                          else
+                            ++val;
+                        }
+
+                      backw_stop = ~0ul;
+                    }
+
+                  /* Now handle the forward element.  */
+                  len = weights[idxarr[idxcnt]++];
+                  if (len != 0)
+                    {
+#ifdef WIDE_CHAR_VERSION
+                      if (needed + 1 + len < n)
+                        {
+                          dest[needed] = val;
+                          for (i = 0; i < len; ++i)
+                            dest[needed + 1 + i] =
+                              weights[idxarr[idxcnt] + i];
+                        }
+                      needed += 1 + len;
+#else
+                      buflen = utf8_encode (buf, val);
+                      if (needed + buflen + len < n)
+                        {
+                          for (i = 0; i < buflen; ++i)
+                            dest[needed + i] = buf[i];
+                          for (i = 0; i < len; ++i)
+                            dest[needed + buflen + i] =
+                              weights[idxarr[idxcnt] + i];
+                        }
+                      needed += buflen + len;
+#endif
+                      idxarr[idxcnt] += len;
+                      val = 1;
+                    }
+                  else
+                    /* Note that we don't have to increment `idxarr[idxcnt]'
+                       since the length is zero.  */
+                    ++val;
+                }
+              else
+                {
+                  /* Remember where the backwards series started.  */
+                  if (backw_stop == ~0ul)
+                    backw_stop = idxcnt;
+                }
+
+              rule = rulesets[rulearr[idxcnt + 1] * nrules + pass];
+            }
+
+          if (backw_stop != ~0ul)
+            {
+              /* Handle the pushed elements now.  Start at IDXCNT, which
+                 equals IDXMAX here, so that the pre-decrement below
+                 reaches the last pushed element as well; otherwise its
+                 weights would be missing and its IDXARR entry would
+                 not be advanced for the following passes.  */
+              size_t backw;
+
+              backw = idxcnt;
+              while (backw > backw_stop)
+                {
+                  size_t len = weights[idxarr[--backw]++];
+                  if (len != 0)
+                    {
+#ifdef WIDE_CHAR_VERSION
+                      if (needed + 1 + len < n)
+                        {
+                          dest[needed] = val;
+                          for (i = 0; i < len; ++i)
+                            dest[needed + 1 + i] =
+                              weights[idxarr[backw] + i];
+                        }
+                      needed += 1 + len;
+#else
+                      buflen = utf8_encode (buf, val);
+                      if (needed + buflen + len < n)
+                        {
+                          for (i = 0; i < buflen; ++i)
+                            dest[needed + i] = buf[i];
+                          for (i = 0; i < len; ++i)
+                            dest[needed + buflen + i] =
+                              weights[idxarr[backw] + i];
+                        }
+                      needed += buflen + len;
+#endif
+                      idxarr[backw] += len;
+                      val = 1;
+                    }
+                  else
+                    ++val;
+                }
+            }
+        }
+
+      /* Finally store the byte to separate the passes or terminate
+         the string.  */
+      if (needed < n)
+        dest[needed] = pass + 1 < nrules ? L('\1') : L('\0');
+      ++needed;
+    }
+
+  /* This is a little optimization: many collation specifications have
+     a `position' rule at the end and if no non-ignored character
+     is found the last \1 byte is immediately followed by a \0 byte
+     signalling this.  We can avoid the \1 byte(s).  */
+  if (needed <= n && needed > 2 && dest[needed - 2] == L('\1'))
+    {
+      /* Remove the \1 byte.  */
+      --needed;
+      dest[needed - 1] = L('\0');
+    }
+
+  /* Free the memory if needed.  */
+  if (use_malloc)
+    free (idxarr);
+
+  /* Return the number of bytes/words we need, but don't count the NUL
+     byte/word at the end.  */
+  return needed - 1;
+}
+libc_hidden_def (STRXFRM)
+
+#ifndef WIDE_CHAR_VERSION
 weak_alias (__strxfrm_l, strxfrm_l)
+#endif