ctype-uca.c
上传用户:romrleung
上传日期:2022-05-23
资源大小:18897k
文件大小:440k
- else scans the next character and returns its first weight.
- Each character can have number weights from 0 to 8.
-
- Some characters do not have weights at all, 0 weights.
- It means they are ignored during comparison.
-
- Examples:
- 1. 0x0001 START OF HEADING, has no weights, ignored, does
- not produce any weights.
- 2. 0x0061 LATIN SMALL LETTER A, has one weight.
- 0x0E33 will be returned
- 3. 0x00DF LATIN SMALL LETTER SHARP S, aka SZ ligature,
- has two weights. It will return 0x0FEA twice for two
- consecutive calls.
- 4. 0x247D PARENTHESIZED NUMBER TEN, has four weights,
- this function will return these numbers in four
- consecutive calls: 0x0288, 0x0E2A, 0x0E29, 0x0289
- 5. A string consisting of the above characters:
- 0x0001 0x0061 0x00DF 0x247D
- will return the following weights, one weight per call:
- 0x0E33 0x0FEA 0x0FEA 0x0288, 0x0E2A, 0x0E29, 0x0289
-
- RETURN
- Next weight, a number between 0x0000 and 0xFFFF
- Or -1 on error (END-OF-STRING or ILLEGAL MULTIBYTE SEQUENCE)
- */
- static int my_uca_scanner_next_ucs2(my_uca_scanner *scanner)
- {
-
- /*
- Check if the weights for the previous character have been
- already fully scanned. If yes, then get the next character and
- initialize wbeg and wlength to its weight string.
- */
-
- if (scanner->wbeg[0])
- return *scanner->wbeg++;
-
- do
- {
- uint16 **ucaw= scanner->uca_weight;
- uchar *ucal= scanner->uca_length;
-
- if (scanner->sbeg > scanner->send)
- return -1;
-
- scanner->page= (unsigned char)scanner->sbeg[0];
- scanner->code= (unsigned char)scanner->sbeg[1];
- scanner->sbeg+= 2;
-
- if (scanner->contractions && (scanner->sbeg <= scanner->send))
- {
- int cweight;
-
- if (!scanner->page && !scanner->sbeg[0] &&
- (scanner->sbeg[1] > 0x40) && (scanner->sbeg[1] < 0x80) &&
- (scanner->code > 0x40) && (scanner->code < 0x80) &&
- (cweight= scanner->contractions[(scanner->code-0x40)*0x40+scanner->sbeg[1]-0x40]))
- {
- scanner->implicit[0]= 0;
- scanner->wbeg= scanner->implicit;
- scanner->sbeg+=2;
- return cweight;
- }
- }
-
- if (!ucaw[scanner->page])
- goto implicit;
- scanner->wbeg= ucaw[scanner->page] + scanner->code * ucal[scanner->page];
- } while (!scanner->wbeg[0]);
-
- return *scanner->wbeg++;
-
- implicit:
-
- scanner->code= (scanner->page << 8) + scanner->code;
- scanner->implicit[0]= (scanner->code & 0x7FFF) | 0x8000;
- scanner->implicit[1]= 0;
- scanner->wbeg= scanner->implicit;
-
- scanner->page= scanner->page >> 7;
-
- if (scanner->code >= 0x3400 && scanner->code <= 0x4DB5)
- scanner->page+= 0xFB80;
- else if (scanner->code >= 0x4E00 && scanner->code <= 0x9FA5)
- scanner->page+= 0xFB40;
- else
- scanner->page+= 0xFBC0;
-
- return scanner->page;
- }
- static my_uca_scanner_handler my_ucs2_uca_scanner_handler= /* scanner callbacks for UCS2 input; passed to the my_*_uca() workers by the *_ucs2_uca wrappers */
- {
- my_uca_scanner_init_ucs2, /* invoked as scanner_handler->init(scanner, cs, str, length) */
- my_uca_scanner_next_ucs2 /* invoked as scanner_handler->next(scanner); returns one weight per call */
- };
- #endif
- /*
- The same two functions for any character set
- */
- static void my_uca_scanner_init_any(my_uca_scanner *scanner,
- CHARSET_INFO *cs __attribute__((unused)),
- const uchar *str, uint length)
- {
- /* Note, no needs to initialize scanner->wbeg */
- scanner->sbeg= str;
- scanner->send= str + length;
- scanner->wbeg= nochar;
- scanner->uca_length= cs->sort_order;
- scanner->uca_weight= cs->sort_order_big;
- scanner->contractions= cs->contractions;
- scanner->cs= cs;
- }
- static int my_uca_scanner_next_any(my_uca_scanner *scanner)
- {
-
- /*
- Check if the weights for the previous character have been
- already fully scanned. If yes, then get the next character and
- initialize wbeg and wlength to its weight string.
- */
-
- if (scanner->wbeg[0])
- return *scanner->wbeg++;
-
- do
- {
- uint16 **ucaw= scanner->uca_weight;
- uchar *ucal= scanner->uca_length;
- my_wc_t wc;
- int mblen;
-
- if (((mblen= scanner->cs->cset->mb_wc(scanner->cs, &wc,
- scanner->sbeg,
- scanner->send)) <= 0))
- return -1;
-
- scanner->page= wc >> 8;
- scanner->code= wc & 0xFF;
- scanner->sbeg+= mblen;
-
- if (scanner->contractions && !scanner->page &&
- (scanner->code > 0x40) && (scanner->code < 0x80))
- {
- uint page1, code1, cweight;
-
- if (((mblen= scanner->cs->cset->mb_wc(scanner->cs, &wc,
- scanner->sbeg,
- scanner->send)) >=0) &&
- (!(page1= (wc >> 8))) &&
- ((code1= (wc & 0xFF)) > 0x40) &&
- (code1 < 0x80) &&
- (cweight= scanner->contractions[(scanner->code-0x40)*0x40 + code1-0x40]))
- {
- scanner->implicit[0]= 0;
- scanner->wbeg= scanner->implicit;
- scanner->sbeg+= mblen;
- return cweight;
- }
- }
-
- if (!ucaw[scanner->page])
- goto implicit;
- scanner->wbeg= ucaw[scanner->page] + scanner->code * ucal[scanner->page];
- } while (!scanner->wbeg[0]);
-
- return *scanner->wbeg++;
-
- implicit:
-
- scanner->code= (scanner->page << 8) + scanner->code;
- scanner->implicit[0]= (scanner->code & 0x7FFF) | 0x8000;
- scanner->implicit[1]= 0;
- scanner->wbeg= scanner->implicit;
-
- scanner->page= scanner->page >> 7;
-
- if (scanner->code >= 0x3400 && scanner->code <= 0x4DB5)
- scanner->page+= 0xFB80;
- else if (scanner->code >= 0x4E00 && scanner->code <= 0x9FA5)
- scanner->page+= 0xFB40;
- else
- scanner->page+= 0xFBC0;
-
- return scanner->page;
- }
- static my_uca_scanner_handler my_any_uca_scanner_handler= /* scanner callbacks for any character set (mb_wc()-based); passed to the my_*_uca() workers by the *_any_uca wrappers */
- {
- my_uca_scanner_init_any, /* invoked as scanner_handler->init(scanner, cs, str, length) */
- my_uca_scanner_next_any /* invoked as scanner_handler->next(scanner); returns one weight per call */
- };
- /*
- Compares two strings according to the collation
- SYNOPSIS:
- my_strnncoll_uca()
- cs Character set information
- s First string
- slen First string length
- t Second string
- tlen Seconf string length
-
- NOTES:
- Initializes two weight scanners and gets weights
- corresponding to two strings in a loop. If weights are not
- the same at some step then returns their difference.
-
- In the while() comparison these situations are possible:
- 1. (s_res>0) and (t_res>0) and (s_res == t_res)
- Weights are the same so far, continue comparison
- 2. (s_res>0) and (t_res>0) and (s_res!=t_res)
- A difference has been found, return.
- 3. (s_res>0) and (t_res<0)
- We have reached the end of the second string, or found
- an illegal multibyte sequence in the second string.
- Return a positive number, i.e. the first string is bigger.
- 4. (s_res<0) and (t_res>0)
- We have reached the end of the first string, or found
- an illegal multibyte sequence in the first string.
- Return a negative number, i.e. the second string is bigger.
- 5. (s_res<0) and (t_res<0)
- Both scanners returned -1. It means we have riched
- the end-of-string of illegal-sequence in both strings
- at the same time. Return 0, strings are equal.
-
- RETURN
- Difference between two strings, according to the collation:
- 0 - means strings are equal
- negative number - means the first string is smaller
- positive number - means the first string is bigger
- */
- static int my_strnncoll_uca(CHARSET_INFO *cs,
- my_uca_scanner_handler *scanner_handler,
- const uchar *s, uint slen,
- const uchar *t, uint tlen,
- my_bool t_is_prefix)
- {
- my_uca_scanner sscanner;
- my_uca_scanner tscanner;
- int s_res;
- int t_res;
-
- scanner_handler->init(&sscanner, cs, s, slen);
- scanner_handler->init(&tscanner, cs, t, tlen);
-
- do
- {
- s_res= scanner_handler->next(&sscanner);
- t_res= scanner_handler->next(&tscanner);
- } while ( s_res == t_res && s_res >0);
-
- return (t_is_prefix && t_res < 0) ? 0 : (s_res - t_res);
- }
- /*
- Compares two strings according to the collation,
- ignoring trailing spaces.
- SYNOPSIS:
- my_strnncollsp_uca()
- cs Character set information
- s First string
- slen First string length
- t Second string
- tlen Seconf string length
-
- NOTES:
- Works exactly the same with my_strnncoll_uca(),
- but ignores trailing spaces.
- In the while() comparison these situations are possible:
- 1. (s_res>0) and (t_res>0) and (s_res == t_res)
- Weights are the same so far, continue comparison
- 2. (s_res>0) and (t_res>0) and (s_res!=t_res)
- A difference has been found, return.
- 3. (s_res>0) and (t_res<0)
- We have reached the end of the second string, or found
- an illegal multibyte sequence in the second string.
- Compare the first string to an infinite array of
- space characters until difference is found, or until
- the end of the first string.
- 4. (s_res<0) and (t_res>0)
- We have reached the end of the first string, or found
- an illegal multibyte sequence in the first string.
- Compare the second string to an infinite array of
- space characters until difference is found or until
- the end of the second steing.
- 5. (s_res<0) and (t_res<0)
- Both scanners returned -1. It means we have riched
- the end-of-string of illegal-sequence in both strings
- at the same time. Return 0, strings are equal.
-
- RETURN
- Difference between two strings, according to the collation:
- 0 - means strings are equal
- negative number - means the first string is smaller
- positive number - means the first string is bigger
- */
- static int my_strnncollsp_uca(CHARSET_INFO *cs,
- my_uca_scanner_handler *scanner_handler,
- const uchar *s, uint slen,
- const uchar *t, uint tlen)
- {
- my_uca_scanner sscanner;
- my_uca_scanner tscanner;
- int s_res;
- int t_res;
-
- scanner_handler->init(&sscanner, cs, s, slen);
- scanner_handler->init(&tscanner, cs, t, tlen);
-
- do
- {
- s_res= scanner_handler->next(&sscanner);
- t_res= scanner_handler->next(&tscanner);
- } while ( s_res == t_res && s_res >0);
- if (s_res > 0 && t_res < 0)
- {
- /* Calculate weight for SPACE character */
- t_res= cs->sort_order_big[0][0x20 * cs->sort_order[0]];
-
- /* compare the first string to spaces */
- do
- {
- if (s_res != t_res)
- return (s_res - t_res);
- s_res= scanner_handler->next(&sscanner);
- } while (s_res > 0);
- return 0;
- }
-
- if (s_res < 0 && t_res > 0)
- {
- /* Calculate weight for SPACE character */
- s_res= cs->sort_order_big[0][0x20 * cs->sort_order[0]];
-
- /* compare the second string to spaces */
- do
- {
- if (s_res != t_res)
- return (s_res - t_res);
- t_res= scanner_handler->next(&tscanner);
- } while (t_res > 0);
- return 0;
- }
-
- return ( s_res - t_res );
- }
- /*
- Calculates hash value for the given string,
- according to the collation, and ignoring trailing spaces.
-
- SYNOPSIS:
- my_hash_sort_uca()
- cs Character set information
- s String
- slen String's length
- n1 First hash parameter
- n2 Second hash parameter
-
- NOTES:
- Scans consequently weights and updates
- hash parameters n1 and n2. In a case insensitive collation,
- upper and lower case of the same letter will return the same
- weight sequence, and thus will produce the same hash values
- in n1 and n2.
-
- RETURN
- N/A
- */
- static void my_hash_sort_uca(CHARSET_INFO *cs,
- my_uca_scanner_handler *scanner_handler,
- const uchar *s, uint slen,
- ulong *n1, ulong *n2)
- {
- int s_res;
- my_uca_scanner scanner;
-
- slen= cs->cset->lengthsp(cs, (char*) s, slen);
- scanner_handler->init(&scanner, cs, s, slen);
-
- while ((s_res= scanner_handler->next(&scanner)) >0)
- {
- n1[0]^= (((n1[0] & 63)+n2[0])*(s_res >> 8))+ (n1[0] << 8);
- n2[0]+=3;
- n1[0]^= (((n1[0] & 63)+n2[0])*(s_res & 0xFF))+ (n1[0] << 8);
- n2[0]+=3;
- }
- }
- /*
- For the given string creates its "binary image", suitable
- to be used in binary comparison, i.e. in memcmp().
-
- SYNOPSIS:
- my_strnxfrm_uca()
- cs Character set information
- dst Where to write the image
- dstlen Space available for the image, in bytes
- src The source string
- srclen Length of the source string, in bytes
-
- NOTES:
- In a loop, scans weights from the source string and writes
- them into the binary image. In a case insensitive collation,
- upper and lower cases of the same letter will produce the
- same image subsequences. When we have reached the end-of-string
- or found an illegal multibyte sequence, the loop stops.
- It is impossible to restore the original string using its
- binary image.
-
- Binary images are used for bulk comparison purposes,
- e.g. in ORDER BY, when it is more efficient to create
- a binary image and use it instead of weight scanner
- for the original strings for every comparison.
-
- RETURN
- Number of bytes that have been written into the binary image.
- */
- static int my_strnxfrm_uca(CHARSET_INFO *cs,
- my_uca_scanner_handler *scanner_handler,
- uchar *dst, uint dstlen,
- const uchar *src, uint srclen)
- {
- uchar *de = dst + (dstlen & (uint) ~1); /* add even length for easier code */
- int s_res;
- my_uca_scanner scanner;
- scanner_handler->init(&scanner, cs, src, srclen);
-
- while (dst < de && (s_res= scanner_handler->next(&scanner)) >0)
- {
- dst[0]= s_res >> 8;
- dst[1]= s_res & 0xFF;
- dst+= 2;
- }
- s_res= cs->sort_order_big[0][0x20 * cs->sort_order[0]];
- while (dst < de)
- {
- dst[0]= s_res >> 8;
- dst[1]= s_res & 0xFF;
- dst+= 2;
- }
- if (dstlen & 1) /* if odd number then fill the last char */
- *dst= ' ';
-
- return dstlen;
- }
- /*
- This function compares if two characters are the same.
- The sign +1 or -1 does not matter. The only
- important thing is that the result is 0 or not 0.
- This fact allows us to use memcmp() safely, on both
- little-endian and big-endian machines.
- */
- static int my_uca_charcmp(CHARSET_INFO *cs, my_wc_t wc1, my_wc_t wc2)
- {
- size_t page1= wc1 >> MY_UCA_PSHIFT;
- size_t page2= wc2 >> MY_UCA_PSHIFT;
- uchar *ucal= cs->sort_order;
- uint16 **ucaw= cs->sort_order_big;
- size_t length1= ucal[page1];
- size_t length2= ucal[page2];
- uint16 *weight1= ucaw[page1] + (wc1 & MY_UCA_CMASK) * ucal[page1];
- uint16 *weight2= ucaw[page2] + (wc2 & MY_UCA_CMASK) * ucal[page2];
-
- if (!weight1 || !weight2)
- return wc1 != wc2;
-
- if (length1 > length2)
- return memcmp((const void*)weight1, (const void*)weight2, length2*2) ?
- 1: weight1[length2];
-
- if (length1 < length2)
- return memcmp((const void*)weight1, (const void*)weight2, length1*2) ?
- 1 : weight2[length1];
-
- return memcmp((const void*)weight1, (const void*)weight2, length1*2);
- }
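- /*
- ** Compare string against string with wildcard
- ** 0 if matched
- ** -1 if not matched because the string ended while pattern
- ** characters remained after a w_many wildcard
- ** 1 if not matched (a character differed, a character could not
- ** be decoded, or the string has characters left over)
- */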
- static
- int my_wildcmp_uca(CHARSET_INFO *cs,
- const char *str,const char *str_end,
- const char *wildstr,const char *wildend,
- int escape, int w_one, int w_many)
- { /*
-   Match [str, str_end) against the pattern [wildstr, wildend).
-   escape, w_one and w_many are the code points of the escape character,
-   the "any single character" wildcard and the "any run of characters"
-   wildcard. Characters are decoded with cs->cset->mb_wc() and compared
-   with my_uca_charcmp(), i.e. by collation weights.
-   Returns 0 on a match, 1 or -1 otherwise (-1 only when the string is
-   exhausted in the w_many part below).
- */
- int result= -1; /* Not found, using wildcards */
- my_wc_t s_wc, w_wc;
- int scan; /* byte length of the character decoded last */
- int (*mb_wc)(struct charset_info_st *cs, my_wc_t *wc,
- const unsigned char *s,const unsigned char *e);
- mb_wc= cs->cset->mb_wc;
-
- while (wildstr != wildend)
- {
- /* Part 1: match pattern characters one by one up to the next w_many */
- while (1)
- {
- my_bool escaped= 0;
- if ((scan= mb_wc(cs, &w_wc, (const uchar*)wildstr,
- (const uchar*)wildend)) <= 0)
- return 1;
- if (w_wc == (my_wc_t)w_many)
- {
- result= 1; /* Found an anchor char */
- break; /* wildstr still points at the w_many character */
- }
- wildstr+= scan;
- if (w_wc == (my_wc_t)escape)
- {
- /* The character after the escape is taken literally */
- if ((scan= mb_wc(cs, &w_wc, (const uchar*)wildstr,
- (const uchar*)wildend)) <= 0)
- return 1;
- wildstr+= scan;
- escaped= 1;
- }
-
- /* Take the next string character; no match if none can be decoded */
- if ((scan= mb_wc(cs, &s_wc, (const uchar*)str,
- (const uchar*)str_end)) <= 0)
- return 1;
- str+= scan;
-
- if (!escaped && w_wc == (my_wc_t)w_one)
- {
- result= 1; /* Found an anchor char */
- }
- else
- {
- /* Literal pattern character: weights must be equal */
- if (my_uca_charcmp(cs,s_wc,w_wc))
- return 1;
- }
- if (wildstr == wildend)
- return (str != str_end); /* Match if both are at end */
- }
-
-
- /* Part 2: handle a w_many wildcard */
- if (w_wc == (my_wc_t)w_many)
- { /* Found w_many */
-
- /*
-   Skip every following w_many; for every following w_one consume
-   one character of the string.
- */
- for ( ; wildstr != wildend ; )
- {
- if ((scan= mb_wc(cs, &w_wc, (const uchar*)wildstr,
- (const uchar*)wildend)) <= 0)
- return 1;
-
- if (w_wc == (my_wc_t)w_many)
- {
- wildstr+= scan;
- continue;
- }
-
- if (w_wc == (my_wc_t)w_one)
- {
- wildstr+= scan;
- if ((scan= mb_wc(cs, &s_wc, (const uchar*)str,
- (const uchar*)str_end)) <= 0)
- return 1;
- str+= scan;
- continue;
- }
- break; /* Not a wild character */
- }
-
- if (wildstr == wildend)
- return 0; /* Ok if w_many is last */
-
- if (str == str_end)
- return -1; /* pattern characters remain but the string is used up */
-
- /*
-   Decode, without consuming it, the first pattern character after the
-   wildcards; only an escape character in front of it is consumed.
- */
- if ((scan= mb_wc(cs, &w_wc, (const uchar*)wildstr,
- (const uchar*)wildend)) <= 0)
- return 1;
-
- if (w_wc == (my_wc_t)escape)
- {
- wildstr+= scan;
- if ((scan= mb_wc(cs, &w_wc, (const uchar*)wildstr,
- (const uchar*)wildend)) <= 0)
- return 1;
- }
-
- while (1)
- {
- /* Skip until the first character from wildstr is found */
- while (str != str_end)
- {
- if ((scan= mb_wc(cs, &s_wc, (const uchar*)str,
- (const uchar*)str_end)) <= 0)
- return 1;
-
- if (!my_uca_charcmp(cs,s_wc,w_wc))
- break;
- str+= scan;
- }
- if (str == str_end)
- return -1;
-
- /* Try to match the rest of the pattern from this candidate position */
- result= my_wildcmp_uca(cs, str, str_end, wildstr, wildend,
- escape, w_one, w_many);
-
- /* 0: matched; -1: string exhausted, later positions cannot match */
- if (result <= 0)
- return result;
-
- /* No match here: step over the candidate character and search on */
- str+= scan;
- }
- }
- }
- return (str != str_end ? 1 : 0);
- }
- /*
- Collation language is implemented according to
- subset of ICU Collation Customization (tailorings):
- http://oss.software.ibm.com/icu/userguide/Collate_Customization.html
-
- Collation language elements:
- Delimiters:
- space - skipped
-
- <char> := A-Z | a-z | \uXXXX
-
- Shift command:
- <shift> := & - reset at this letter.
-
- Diff command:
- <d1> := < - Identifies a primary difference.
- <d2> := << - Identifies a secondary difference.
- <d3> := <<< - Identifies a tertiary difference.
-
-
- Collation rules:
- <ruleset> := <rule> { <ruleset> }
-
- <rule> := <d1> <string>
- | <d2> <string>
- | <d3> <string>
- | <shift> <char>
-
- <string> := <char> [ <string> ]
- An example, Polish collation:
-
- &A < \u0105 <<< \u0104
- &C < \u0107 <<< \u0106
- &E < \u0119 <<< \u0118
- &L < \u0142 <<< \u0141
- &N < \u0144 <<< \u0143
- &O < \u00F3 <<< \u00D3
- &S < \u015B <<< \u015A
- &Z < \u017A <<< \u017B
- */
- typedef enum my_coll_lexem_num_en /* lexem kinds returned by my_coll_lexem_next() */
- {
- MY_COLL_LEXEM_EOF = 0, /* end of input; must stay 0, the parser loop stops on a zero lexem */
- MY_COLL_LEXEM_DIFF = 1, /* a run of one to three '<' characters */
- MY_COLL_LEXEM_SHIFT = 4, /* the '&' reset command */
- MY_COLL_LEXEM_CHAR = 5, /* a letter or a backslash-u hexadecimal code */
- MY_COLL_LEXEM_ERROR = 6 /* anything the lexer does not recognize */
- } my_coll_lexem_num;
- typedef struct my_coll_lexem_st /* state of the collation rule lexer */
- {
- const char *beg; /* where the next scan starts */
- const char *end; /* end of the rule text */
- const char *prev; /* where the previous scan started; used as context in error messages */
- int diff; /* number of '<' (1..3) in the DIFF lexem scanned last */
- int code; /* character code of the CHAR lexem scanned last */
- } MY_COLL_LEXEM;
- /*
- Initialize collation rule lexical anilizer
-
- SYNOPSIS
- my_coll_lexem_init
- lexem Lex analizer to init
- str Const string to parse
- strend End of the string
- USAGE
-
- RETURN VALUES
- N/A
- */
- static void my_coll_lexem_init(MY_COLL_LEXEM *lexem,
- const char *str, const char *strend)
- {
- lexem->beg= str;
- lexem->prev= str;
- lexem->end= strend;
- lexem->diff= 0;
- lexem->code= 0;
- }
- /*
- Print collation customization expression parse error, with context.
-
- SYNOPSIS
- my_coll_lexem_print_error
- lexem Lex analizer to take context from
- errstr sting to write error to
- errsize errstr size
- txt error message
- USAGE
-
- RETURN VALUES
- N/A
- */
- static void my_coll_lexem_print_error(MY_COLL_LEXEM *lexem,
- char *errstr, size_t errsize,
- const char *txt)
- {
- char tail[30];
- size_t len= lexem->end - lexem->prev;
- strmake (tail, lexem->prev, min(len, sizeof(tail)-1));
- errstr[errsize-1]= ' ';
- my_snprintf(errstr,errsize-1,"%s at '%s'", txt, tail);
- }
/*
  Convert a hexadecimal digit into its numeric value

  SYNOPSIS
    ch2x
    ch   hex digit to convert ('0'-'9', 'a'-'f' or 'A'-'F')

  RETURN VALUES
    an integer value in the range 0..15
    -1 if ch is not a hex digit
*/
static int ch2x(int ch)
{
  int value= -1;

  if (ch >= '0' && ch <= '9')
    value= ch - '0';
  else if (ch >= 'a' && ch <= 'f')
    value= ch - 'a' + 10;
  else if (ch >= 'A' && ch <= 'F')
    value= ch - 'A' + 10;

  return value;
}
- /*
- Collation language lexical parser:
- Scans the next lexem.
-
- SYNOPSIS
- my_coll_lexem_next
- lexem Lex analizer, previously initialized by
- my_coll_lexem_init.
- USAGE
- Call this function in a loop
-
- RETURN VALUES
- Lexem number: eof, diff, shift, char or error.
- */
- static my_coll_lexem_num my_coll_lexem_next(MY_COLL_LEXEM *lexem)
- {
- const char *beg;
- my_coll_lexem_num rc;
-
- for (beg= lexem->beg ; beg < lexem->end ; beg++)
- {
- if (*beg == ' ' || *beg == 't' || *beg == 'r' || *beg == 'n')
- continue;
-
- if (*beg == '&')
- {
- beg++;
- rc= MY_COLL_LEXEM_SHIFT;
- goto ex;
- }
-
- if (beg[0] == '<')
- {
- for (beg++, lexem->diff= 1;
- (beg < lexem->end) &&
- (*beg == '<') && (lexem->diff<3);
- beg++, lexem->diff++);
- rc= MY_COLL_LEXEM_DIFF;
- goto ex;
- }
-
- if ((*beg >= 'a' && *beg <= 'z') || (*beg >= 'A' && *beg <= 'Z'))
- {
- lexem->code= *beg++;
- rc= MY_COLL_LEXEM_CHAR;
- goto ex;
- }
-
- if ((*beg == '\') && (beg+2 < lexem->end) && (beg[1] == 'u'))
- {
- int ch;
-
- beg+= 2;
- lexem->code= 0;
- while ((beg < lexem->end) && ((ch= ch2x(beg[0])) >= 0))
- {
- lexem->code= (lexem->code << 4) + ch;
- beg++;
- }
- rc= MY_COLL_LEXEM_CHAR;
- goto ex;
- }
-
- rc= MY_COLL_LEXEM_ERROR;
- goto ex;
- }
- rc= MY_COLL_LEXEM_EOF;
-
- ex:
- lexem->prev= lexem->beg;
- lexem->beg= beg;
- return rc;
- }
- /*
- Collation rule item: one tailored character (or two-character
- contraction) together with the character it is positioned after.
- Filled in by my_coll_rule_parse().
- */
- typedef struct my_coll_rule_item_st
- {
- uint base; /* Base character: the one given after the last '&' */
- uint curr[2]; /* Current character; curr[1] is the second character of a contraction, or 0 */
- int diff[3]; /* Primary, Secondary and Tertiary difference counters relative to base */
- } MY_COLL_RULE;
- /*
- Collation language syntax parser.
- Uses lexical parser.
-
- SYNOPSIS
- my_coll_rule_parse
- rule Collation rule list to load to.
- str A string containin collation language expression.
- strend End of the string.
- USAGE
-
- RETURN VALUES
- A positive number means the number of rules loaded.
- -1 means ERROR, e.g. too many items, syntax error, etc.
- */
- static int my_coll_rule_parse(MY_COLL_RULE *rule, size_t mitems,
- const char *str, const char *strend,
- char *errstr, size_t errsize)
- {
- MY_COLL_LEXEM lexem;
- my_coll_lexem_num lexnum;
- my_coll_lexem_num prevlexnum= MY_COLL_LEXEM_ERROR;
- MY_COLL_RULE item;
- int state= 0;
- size_t nitems= 0;
-
- /* Init all variables */
- errstr[0]= ' ';
- bzero(&item, sizeof(item));
- my_coll_lexem_init(&lexem, str, strend);
-
- while ((lexnum= my_coll_lexem_next(&lexem)))
- {
- if (lexnum == MY_COLL_LEXEM_ERROR)
- {
- my_coll_lexem_print_error(&lexem,errstr,errsize-1,"Unknown character");
- return -1;
- }
-
- switch (state) {
- case 0:
- if (lexnum != MY_COLL_LEXEM_SHIFT)
- {
- my_coll_lexem_print_error(&lexem,errstr,errsize-1,"& expected");
- return -1;
- }
- prevlexnum= lexnum;
- state= 2;
- continue;
-
- case 1:
- if (lexnum != MY_COLL_LEXEM_SHIFT && lexnum != MY_COLL_LEXEM_DIFF)
- {
- my_coll_lexem_print_error(&lexem,errstr,errsize-1,"& or < expected");
- return -1;
- }
- prevlexnum= lexnum;
- state= 2;
- continue;
-
- case 2:
- if (lexnum != MY_COLL_LEXEM_CHAR)
- {
- my_coll_lexem_print_error(&lexem,errstr,errsize-1,"character expected");
- return -1;
- }
-
- if (prevlexnum == MY_COLL_LEXEM_SHIFT)
- {
- item.base= lexem.code;
- item.diff[0]= 0;
- item.diff[1]= 0;
- item.diff[2]= 0;
- }
- else if (prevlexnum == MY_COLL_LEXEM_DIFF)
- {
- MY_COLL_LEXEM savlex;
- savlex= lexem;
- item.curr[0]= lexem.code;
- if ((lexnum= my_coll_lexem_next(&lexem)) == MY_COLL_LEXEM_CHAR)
- {
- item.curr[1]= lexem.code;
- }
- else
- {
- item.curr[1]= 0;
- lexem=savlex; /* Restore previous parser state */
- }
- if (lexem.diff == 3)
- {
- item.diff[2]++;
- }
- else if (lexem.diff == 2)
- {
- item.diff[1]++;
- item.diff[2]= 0;
- }
- else if (lexem.diff == 1)
- {
- item.diff[0]++;
- item.diff[1]= 0;
- item.diff[2]= 0;
- }
- if (nitems >= mitems)
- {
- my_coll_lexem_print_error(&lexem,errstr,errsize-1,"Too many rules");
- return -1;
- }
- rule[nitems++]= item;
- }
- else
- {
- my_coll_lexem_print_error(&lexem,errstr,errsize-1,"Should never happen");
- return -1;
- }
- state= 1;
- continue;
- }
- }
- return (size_t) nitems;
- }
- #define MY_MAX_COLL_RULE 128 /* capacity of the rule array that create_tailoring() passes to my_coll_rule_parse() */
- /*
- This function copies an UCS2 collation from
- the default Unicode Collation Algorithm (UCA)
- weights applying tailorings, i.e. a set of
- alternative weights for some characters.
-
- The default UCA weights are stored in uca_weight/uca_length.
- They consist of 256 pages, 256 character each.
-
- If a page is not overwritten by tailoring rules,
- it is copied as is from the default UCA weights.
-
- If a page contains some overwritten characters, it is
- allocated. Untouched characters are copied from the
- default weights.
- */
- static my_bool create_tailoring(CHARSET_INFO *cs, void *(*alloc)(uint))
- { /*
-   Build the weight tables of a tailored collation.
-   cs     Character set; cs->tailoring holds the rule text. On success
-          cs->sort_order, cs->sort_order_big and cs->contractions are set.
-   alloc  Allocator used for all new tables.
-   Returns 0 on success. Returns 1 when cs->tailoring is NULL, the rules
-   cannot be parsed, an allocation fails, or a contraction is not
-   supported (see the contraction loop at the end).
- */
- MY_COLL_RULE rule[MY_MAX_COLL_RULE];
- char errstr[128];
- uchar *newlengths;
- uint16 **newweights;
- const uchar *deflengths= uca_length;
- uint16 **defweights= uca_weight;
- int rc, i; /* rc: number of parsed rules, or a negative value on parse error */
- int ncontractions= 0;
-
- if (!cs->tailoring)
- return 1;
-
- /* Parse ICU Collation Customization expression */
- if ((rc= my_coll_rule_parse(rule, MY_MAX_COLL_RULE,
- cs->tailoring,
- cs->tailoring + strlen(cs->tailoring),
- errstr, sizeof(errstr))) < 0)
- {
- /*
- TODO: add error message reporting.
- printf("Error: %d '%s'\n", rc, errstr);
- */
- return 1;
- }
-
- /* 256 page pointers, all NULL at first: NULL means "page not overwritten" */
- if (!(newweights= (uint16**) (*alloc)(256*sizeof(uint16*))))
- return 1;
- bzero(newweights, 256*sizeof(uint16*));
-
- if (!(newlengths= (uchar*) (*alloc)(256)))
- return 1;
-
- memcpy(newlengths, deflengths, 256);
-
- /*
- Calculate maximum lengths for the pages
- which will be overwritten: the page of a tailored character must be
- able to hold the weight string of its base character.
- Contractions are only counted here.
- */
- for (i=0; i < rc; i++)
- {
- if (!rule[i].curr[1]) /* If not a contraction */
- {
- uint pageb= (rule[i].base >> 8) & 0xFF;
- uint pagec= (rule[i].curr[0] >> 8) & 0xFF;
-
- if (newlengths[pagec] < deflengths[pageb])
- newlengths[pagec]= deflengths[pageb];
- }
- else
- ncontractions++;
- }
-
- /* Apply the single-character rules */
- for (i=0; i < rc; i++)
- {
- uint pageb= (rule[i].base >> 8) & 0xFF;
- uint pagec= (rule[i].curr[0] >> 8) & 0xFF;
- uint chb, chc;
-
- if (rule[i].curr[1]) /* Skip contraction */
- continue;
-
- if (!newweights[pagec])
- {
- /* Alloc new page and copy the default UCA weights */
- uint size= 256*newlengths[pagec]*sizeof(uint16);
-
- if (!(newweights[pagec]= (uint16*) (*alloc)(size)))
- return 1;
- bzero((void*) newweights[pagec], size);
-
- /* The slot size may have grown, so copy character by character */
- for (chc=0 ; chc < 256; chc++)
- {
- memcpy(newweights[pagec] + chc*newlengths[pagec],
- defweights[pagec] + chc*deflengths[pagec],
- deflengths[pagec]*sizeof(uint16));
- }
- }
-
- /*
- Apply the alternative rule:
- shift to the base character and primary difference.
- */
- chc= rule[i].curr[0] & 0xFF;
- chb= rule[i].base & 0xFF;
- memcpy(newweights[pagec] + chc*newlengths[pagec],
- defweights[pageb] + chb*deflengths[pageb],
- deflengths[pageb]*sizeof(uint16));
- /* Apply primary difference (only diff[0] is used; it is added to the first weight) */
- newweights[pagec][chc*newlengths[pagec]]+= rule[i].diff[0];
- }
-
- /* Copy non-overwritten pages from the default UCA weights (pointers are shared, not duplicated) */
- for (i= 0; i < 256 ; i++)
- {
- if (!newweights[i])
- newweights[i]= defweights[i];
- }
-
- cs->sort_order= newlengths;
- cs->sort_order_big= newweights;
- cs->contractions= NULL;
-
- /* Now process contractions */
- if (ncontractions)
- {
- uint size= 0x40*0x40*sizeof(uint16); /* 8K, for basic latin letter only */
- if (!(cs->contractions= (uint16*) (*alloc)(size)))
- return 1;
- bzero((void*)cs->contractions, size);
- for (i=0; i < rc; i++)
- {
- if (rule[i].curr[1])
- {
- uint pageb= (rule[i].base >> 8) & 0xFF;
- uint chb= rule[i].base & 0xFF;
- uint16 *offsb= defweights[pageb] + chb*deflengths[pageb];
- uint offsc;
-
- /* Reject a base with a second weight, or characters outside 0x40..0x7F */
- if (offsb[1] ||
- rule[i].curr[0] < 0x40 || rule[i].curr[0] > 0x7f ||
- rule[i].curr[1] < 0x40 || rule[i].curr[1] > 0x7f)
- {
- /*
- TODO: add error reporting;
- We support only basic latin letters contractions at this point.
- Also, We don't support contractions with weight longer than one.
- Otherwise, we'd need much more memory.
- */
- return 1;
- }
- /* Table index: (first character - 0x40) * 0x40 + (second character - 0x40) */
- offsc= (rule[i].curr[0]-0x40)*0x40+(rule[i].curr[1]-0x40);
-
- /* Copy base weight applying primary difference */
- cs->contractions[offsc]= offsb[0] + rule[i].diff[0];
- }
- }
- }
- return 0;
- }
- /*
- Universal CHARSET_INFO compatible wrappers
- for the above internal functions.
- Should work for any character set.
- */
- static my_bool my_coll_init_uca(CHARSET_INFO *cs, void *(*alloc)(uint))
- {
- return create_tailoring(cs, alloc);
- }
- static int my_strnncoll_any_uca(CHARSET_INFO *cs,
- const uchar *s, uint slen,
- const uchar *t, uint tlen,
- my_bool t_is_prefix)
- {
- return my_strnncoll_uca(cs, &my_any_uca_scanner_handler,
- s, slen, t, tlen, t_is_prefix);
- }
- static int my_strnncollsp_any_uca(CHARSET_INFO *cs,
- const uchar *s, uint slen,
- const uchar *t, uint tlen)
- {
- return my_strnncollsp_uca(cs, &my_any_uca_scanner_handler,
- s, slen, t, tlen);
- }
- static void my_hash_sort_any_uca(CHARSET_INFO *cs,
- const uchar *s, uint slen,
- ulong *n1, ulong *n2)
- {
- my_hash_sort_uca(cs, &my_any_uca_scanner_handler, s, slen, n1, n2);
- }
- static int my_strnxfrm_any_uca(CHARSET_INFO *cs,
- uchar *dst, uint dstlen,
- const uchar *src, uint srclen)
- {
- return my_strnxfrm_uca(cs, &my_any_uca_scanner_handler,
- dst, dstlen, src, srclen);
- }
- #ifdef HAVE_CHARSET_ucs2
- /*
- UCS2 optimized CHARSET_INFO compatible wrappers.
- */
- static int my_strnncoll_ucs2_uca(CHARSET_INFO *cs,
- const uchar *s, uint slen,
- const uchar *t, uint tlen,
- my_bool t_is_prefix)
- {
- return my_strnncoll_uca(cs, &my_ucs2_uca_scanner_handler,
- s, slen, t, tlen, t_is_prefix);
- }
- static int my_strnncollsp_ucs2_uca(CHARSET_INFO *cs,
- const uchar *s, uint slen,
- const uchar *t, uint tlen)
- {
- return my_strnncollsp_uca(cs, &my_ucs2_uca_scanner_handler,
- s, slen, t, tlen);
- }
- static void my_hash_sort_ucs2_uca(CHARSET_INFO *cs,
- const uchar *s, uint slen,
- ulong *n1, ulong *n2)
- {
- my_hash_sort_uca(cs, &my_ucs2_uca_scanner_handler, s, slen, n1, n2);
- }
- static int my_strnxfrm_ucs2_uca(CHARSET_INFO *cs,
- uchar *dst, uint dstlen,
- const uchar *src, uint srclen)
- {
- return my_strnxfrm_uca(cs, &my_ucs2_uca_scanner_handler,
- dst, dstlen, src, srclen);
- }
- MY_COLLATION_HANDLER my_collation_ucs2_uca_handler = /* collation callbacks referenced by every UCS2 UCA CHARSET_INFO below */
- {
- my_coll_init_uca, /* init: builds the tailored weight tables */
- my_strnncoll_ucs2_uca, /* string comparison */
- my_strnncollsp_ucs2_uca, /* string comparison ignoring trailing spaces */
- my_strnxfrm_ucs2_uca, /* binary sort image */
- my_like_range_ucs2, /* defined outside this section */
- my_wildcmp_uca, /* wildcard matching by collation weights */
- NULL, /* NOTE(review): the member this slot belongs to is not visible in this section */
- my_instr_mb, /* defined outside this section */
- my_hash_sort_ucs2_uca /* collation-aware hash */
- };
- CHARSET_INFO my_charset_ucs2_general_uca= /* "ucs2_unicode_ci": UCS2 with the default UCA weights and an empty tailoring */
- {
- 128,0,0, /* number */
- MY_CS_COMPILED|MY_CS_STRNXFRM|MY_CS_UNICODE, /* flags */
- "ucs2", /* cs name */
- "ucs2_unicode_ci", /* name */
- "", /* comment */
- "", /* tailoring: empty rule text, no rules */
- NULL, /* ctype */
- NULL, /* to_lower */
- NULL, /* to_upper */
- uca_length, /* sort_order: per-page weight slot lengths of the default UCA */
- NULL, /* contractions */
- uca_weight, /* sort_order_big: per-page weight tables of the default UCA */
- NULL, /* tab_to_uni */
- NULL, /* tab_from_uni */
- NULL, /* state_map */
- NULL, /* ident_map */
- 8, /* strxfrm_multiply */
- 2, /* mbminlen */
- 2, /* mbmaxlen */
- 9, /* min_sort_char */
- 0xFFFF, /* max_sort_char */
- 0, /* escape_with_backslash_is_dangerous */
- &my_charset_ucs2_handler, /* character set handler, defined outside this section */
- &my_collation_ucs2_uca_handler /* collation handler */
- };
- CHARSET_INFO my_charset_ucs2_icelandic_uca_ci= /* "ucs2_icelandic_ci": UCS2 with the icelandic tailoring; weight tables are built by create_tailoring() */
- {
- 129,0,0, /* number */
- MY_CS_COMPILED|MY_CS_STRNXFRM|MY_CS_UNICODE, /* flags */
- "ucs2", /* cs name */
- "ucs2_icelandic_ci",/* name */
- "", /* comment */
- icelandic, /* tailoring: rule text, defined outside this section */
- NULL, /* ctype */
- NULL, /* to_lower */
- NULL, /* to_upper */
- NULL, /* sort_order: set by create_tailoring() */
- NULL, /* contractions: set by create_tailoring() */
- NULL, /* sort_order_big: set by create_tailoring() */
- NULL, /* tab_to_uni */
- NULL, /* tab_from_uni */
- NULL, /* state_map */
- NULL, /* ident_map */
- 8, /* strxfrm_multiply */
- 2, /* mbminlen */
- 2, /* mbmaxlen */
- 9, /* min_sort_char */
- 0xFFFF, /* max_sort_char */
- 0, /* escape_with_backslash_is_dangerous */
- &my_charset_ucs2_handler, /* character set handler, defined outside this section */
- &my_collation_ucs2_uca_handler /* collation handler */
- };
- CHARSET_INFO my_charset_ucs2_latvian_uca_ci= /* "ucs2_latvian_ci": UCS2 with the latvian tailoring; weight tables are built by create_tailoring() */
- {
- 130,0,0, /* number */
- MY_CS_COMPILED|MY_CS_STRNXFRM|MY_CS_UNICODE, /* flags */
- "ucs2", /* cs name */
- "ucs2_latvian_ci", /* name */
- "", /* comment */
- latvian, /* tailoring: rule text, defined outside this section */
- NULL, /* ctype */
- NULL, /* to_lower */
- NULL, /* to_upper */
- NULL, /* sort_order: set by create_tailoring() */
- NULL, /* contractions: set by create_tailoring() */
- NULL, /* sort_order_big: set by create_tailoring() */
- NULL, /* tab_to_uni */
- NULL, /* tab_from_uni */
- NULL, /* state_map */
- NULL, /* ident_map */
- 8, /* strxfrm_multiply */
- 2, /* mbminlen */
- 2, /* mbmaxlen */
- 9, /* min_sort_char */
- 0xFFFF, /* max_sort_char */
- 0, /* escape_with_backslash_is_dangerous */
- &my_charset_ucs2_handler, /* character set handler, defined outside this section */
- &my_collation_ucs2_uca_handler /* collation handler */
- };
- CHARSET_INFO my_charset_ucs2_romanian_uca_ci= /* "ucs2_romanian_ci": UCS2 with the romanian tailoring; weight tables are built by create_tailoring() */
- {
- 131,0,0, /* number */
- MY_CS_COMPILED|MY_CS_STRNXFRM|MY_CS_UNICODE, /* flags */
- "ucs2", /* cs name */
- "ucs2_romanian_ci", /* name */
- "", /* comment */
- romanian, /* tailoring: rule text, defined outside this section */
- NULL, /* ctype */
- NULL, /* to_lower */
- NULL, /* to_upper */
- NULL, /* sort_order: set by create_tailoring() */
- NULL, /* contractions: set by create_tailoring() */
- NULL, /* sort_order_big: set by create_tailoring() */
- NULL, /* tab_to_uni */
- NULL, /* tab_from_uni */
- NULL, /* state_map */
- NULL, /* ident_map */
- 8, /* strxfrm_multiply */
- 2, /* mbminlen */
- 2, /* mbmaxlen */
- 9, /* min_sort_char */
- 0xFFFF, /* max_sort_char */
- 0, /* escape_with_backslash_is_dangerous */
- &my_charset_ucs2_handler, /* character set handler, defined outside this section */
- &my_collation_ucs2_uca_handler /* collation handler */
- };
- CHARSET_INFO my_charset_ucs2_slovenian_uca_ci= /* ucs2 collation "ucs2_slovenian_ci" (number 132), tailored by the `slovenian` rules */
- {
- 132,0,0, /* number */
- MY_CS_COMPILED|MY_CS_STRNXFRM|MY_CS_UNICODE, /* flags */
- "ucs2", /* cs name */
- "ucs2_slovenian_ci",/* name */
- "", /* comment */
- slovenian, /* tailoring: rule set defined outside this section */
- NULL, /* ctype */
- NULL, /* to_lower */
- NULL, /* to_upper */
- NULL, /* sort_order: NULL here; NOTE(review): presumably set up at init time -- confirm */
- NULL, /* contractions: NULL here, same caveat as sort_order */
- NULL, /* sort_order_big: NULL here, same caveat as sort_order */
- NULL, /* tab_to_uni */
- NULL, /* tab_from_uni */
- NULL, /* state_map */
- NULL, /* ident_map */
- 8, /* strxfrm_multiply */
- 2, /* mbminlen: equals mbmaxlen, i.e. fixed two-byte characters */
- 2, /* mbmaxlen */
- 9, /* min_sort_char */
- 0xFFFF, /* max_sort_char */
- 0, /* escape_with_backslash_is_dangerous */
- &my_charset_ucs2_handler, /* character set handler */
- &my_collation_ucs2_uca_handler /* collation handler */
- };
- CHARSET_INFO my_charset_ucs2_polish_uca_ci= /* ucs2 collation "ucs2_polish_ci" (number 133), tailored by the `polish` rules */
- {
- 133,0,0, /* number */
- MY_CS_COMPILED|MY_CS_STRNXFRM|MY_CS_UNICODE, /* flags */
- "ucs2", /* cs name */
- "ucs2_polish_ci", /* name */
- "", /* comment */
- polish, /* tailoring: rule set defined outside this section */
- NULL, /* ctype */
- NULL, /* to_lower */
- NULL, /* to_upper */
- NULL, /* sort_order: NULL here; NOTE(review): presumably set up at init time -- confirm */
- NULL, /* contractions: NULL here, same caveat as sort_order */
- NULL, /* sort_order_big: NULL here, same caveat as sort_order */
- NULL, /* tab_to_uni */
- NULL, /* tab_from_uni */
- NULL, /* state_map */
- NULL, /* ident_map */
- 8, /* strxfrm_multiply */
- 2, /* mbminlen: equals mbmaxlen, i.e. fixed two-byte characters */
- 2, /* mbmaxlen */
- 9, /* min_sort_char */
- 0xFFFF, /* max_sort_char */
- 0, /* escape_with_backslash_is_dangerous */
- &my_charset_ucs2_handler, /* character set handler */
- &my_collation_ucs2_uca_handler /* collation handler */
- };
- CHARSET_INFO my_charset_ucs2_estonian_uca_ci= /* ucs2 collation "ucs2_estonian_ci" (number 134), tailored by the `estonian` rules */
- {
- 134,0,0, /* number */
- MY_CS_COMPILED|MY_CS_STRNXFRM|MY_CS_UNICODE, /* flags */
- "ucs2", /* cs name */
- "ucs2_estonian_ci", /* name */
- "", /* comment */
- estonian, /* tailoring: rule set defined outside this section */
- NULL, /* ctype */
- NULL, /* to_lower */
- NULL, /* to_upper */
- NULL, /* sort_order: NULL here; NOTE(review): presumably set up at init time -- confirm */
- NULL, /* contractions: NULL here, same caveat as sort_order */
- NULL, /* sort_order_big: NULL here, same caveat as sort_order */
- NULL, /* tab_to_uni */
- NULL, /* tab_from_uni */
- NULL, /* state_map */
- NULL, /* ident_map */
- 8, /* strxfrm_multiply */
- 2, /* mbminlen: equals mbmaxlen, i.e. fixed two-byte characters */
- 2, /* mbmaxlen */
- 9, /* min_sort_char */
- 0xFFFF, /* max_sort_char */
- 0, /* escape_with_backslash_is_dangerous */
- &my_charset_ucs2_handler, /* character set handler */
- &my_collation_ucs2_uca_handler /* collation handler */
- };
- CHARSET_INFO my_charset_ucs2_spanish_uca_ci= /* ucs2 collation "ucs2_spanish_ci" (number 135), tailored by the `spanish` rules */
- {
- 135,0,0, /* number */
- MY_CS_COMPILED|MY_CS_STRNXFRM|MY_CS_UNICODE, /* flags */
- "ucs2", /* cs name */
- "ucs2_spanish_ci", /* name */
- "", /* comment */
- spanish, /* tailoring: rule set defined outside this section */
- NULL, /* ctype */
- NULL, /* to_lower */
- NULL, /* to_upper */
- NULL, /* sort_order: NULL here; NOTE(review): presumably set up at init time -- confirm */
- NULL, /* contractions: NULL here, same caveat as sort_order */
- NULL, /* sort_order_big: NULL here, same caveat as sort_order */
- NULL, /* tab_to_uni */
- NULL, /* tab_from_uni */
- NULL, /* state_map */
- NULL, /* ident_map */
- 8, /* strxfrm_multiply */
- 2, /* mbminlen: equals mbmaxlen, i.e. fixed two-byte characters */
- 2, /* mbmaxlen */
- 9, /* min_sort_char */
- 0xFFFF, /* max_sort_char */
- 0, /* escape_with_backslash_is_dangerous */
- &my_charset_ucs2_handler, /* character set handler */
- &my_collation_ucs2_uca_handler /* collation handler */
- };
- CHARSET_INFO my_charset_ucs2_swedish_uca_ci= /* ucs2 collation "ucs2_swedish_ci" (number 136), tailored by the `swedish` rules */
- {
- 136,0,0, /* number */
- MY_CS_COMPILED|MY_CS_STRNXFRM|MY_CS_UNICODE, /* flags */
- "ucs2", /* cs name */
- "ucs2_swedish_ci", /* name */
- "", /* comment */
- swedish, /* tailoring: rule set defined outside this section */
- NULL, /* ctype */
- NULL, /* to_lower */
- NULL, /* to_upper */
- NULL, /* sort_order: NULL here; NOTE(review): presumably set up at init time -- confirm */
- NULL, /* contractions: NULL here, same caveat as sort_order */
- NULL, /* sort_order_big: NULL here, same caveat as sort_order */
- NULL, /* tab_to_uni */
- NULL, /* tab_from_uni */
- NULL, /* state_map */
- NULL, /* ident_map */
- 8, /* strxfrm_multiply */
- 2, /* mbminlen: equals mbmaxlen, i.e. fixed two-byte characters */
- 2, /* mbmaxlen */
- 9, /* min_sort_char */
- 0xFFFF, /* max_sort_char */
- 0, /* escape_with_backslash_is_dangerous */
- &my_charset_ucs2_handler, /* character set handler */
- &my_collation_ucs2_uca_handler /* collation handler */
- };
- CHARSET_INFO my_charset_ucs2_turkish_uca_ci= /* ucs2 collation "ucs2_turkish_ci" (number 137), tailored by the `turkish` rules */
- {
- 137,0,0, /* number */
- MY_CS_COMPILED|MY_CS_STRNXFRM|MY_CS_UNICODE, /* flags */
- "ucs2", /* cs name */
- "ucs2_turkish_ci", /* name */
- "", /* comment */
- turkish, /* tailoring: rule set defined outside this section */
- NULL, /* ctype */
- NULL, /* to_lower */
- NULL, /* to_upper */
- NULL, /* sort_order: NULL here; NOTE(review): presumably set up at init time -- confirm */
- NULL, /* contractions: NULL here, same caveat as sort_order */
- NULL, /* sort_order_big: NULL here, same caveat as sort_order */
- NULL, /* tab_to_uni */
- NULL, /* tab_from_uni */
- NULL, /* state_map */
- NULL, /* ident_map */
- 8, /* strxfrm_multiply */
- 2, /* mbminlen: equals mbmaxlen, i.e. fixed two-byte characters */
- 2, /* mbmaxlen */
- 9, /* min_sort_char */
- 0xFFFF, /* max_sort_char */
- 0, /* escape_with_backslash_is_dangerous */
- &my_charset_ucs2_handler, /* character set handler */
- &my_collation_ucs2_uca_handler /* collation handler */
- };
- CHARSET_INFO my_charset_ucs2_czech_uca_ci= /* ucs2 collation "ucs2_czech_ci" (number 138), tailored by the `czech` rules */
- {
- 138,0,0, /* number */
- MY_CS_COMPILED|MY_CS_STRNXFRM|MY_CS_UNICODE, /* flags */
- "ucs2", /* cs name */
- "ucs2_czech_ci", /* name */
- "", /* comment */
- czech, /* tailoring: rule set defined outside this section */
- NULL, /* ctype */
- NULL, /* to_lower */
- NULL, /* to_upper */
- NULL, /* sort_order: NULL here; NOTE(review): presumably set up at init time -- confirm */
- NULL, /* contractions: NULL here, same caveat as sort_order */
- NULL, /* sort_order_big: NULL here, same caveat as sort_order */
- NULL, /* tab_to_uni */
- NULL, /* tab_from_uni */
- NULL, /* state_map */
- NULL, /* ident_map */
- 8, /* strxfrm_multiply */
- 2, /* mbminlen: equals mbmaxlen, i.e. fixed two-byte characters */
- 2, /* mbmaxlen */
- 9, /* min_sort_char */
- 0xFFFF, /* max_sort_char */
- 0, /* escape_with_backslash_is_dangerous */
- &my_charset_ucs2_handler, /* character set handler */
- &my_collation_ucs2_uca_handler /* collation handler */
- };
- CHARSET_INFO my_charset_ucs2_danish_uca_ci= /* ucs2 collation "ucs2_danish_ci" (number 139), tailored by the `danish` rules */
- {
- 139,0,0, /* number */
- MY_CS_COMPILED|MY_CS_STRNXFRM|MY_CS_UNICODE, /* flags */
- "ucs2", /* cs name */
- "ucs2_danish_ci", /* name */
- "", /* comment */
- danish, /* tailoring: rule set defined outside this section */
- NULL, /* ctype */
- NULL, /* to_lower */
- NULL, /* to_upper */
- NULL, /* sort_order: NULL here; NOTE(review): presumably set up at init time -- confirm */
- NULL, /* contractions: NULL here, same caveat as sort_order */
- NULL, /* sort_order_big: NULL here, same caveat as sort_order */
- NULL, /* tab_to_uni */
- NULL, /* tab_from_uni */
- NULL, /* state_map */
- NULL, /* ident_map */
- 8, /* strxfrm_multiply */
- 2, /* mbminlen: equals mbmaxlen, i.e. fixed two-byte characters */
- 2, /* mbmaxlen */
- 9, /* min_sort_char */
- 0xFFFF, /* max_sort_char */
- 0, /* escape_with_backslash_is_dangerous */
- &my_charset_ucs2_handler, /* character set handler */
- &my_collation_ucs2_uca_handler /* collation handler */
- };
- CHARSET_INFO my_charset_ucs2_lithuanian_uca_ci= /* ucs2 collation "ucs2_lithuanian_ci" (number 140), tailored by the `lithuanian` rules */
- {
- 140,0,0, /* number */
- MY_CS_COMPILED|MY_CS_STRNXFRM|MY_CS_UNICODE, /* flags */
- "ucs2", /* cs name */
- "ucs2_lithuanian_ci",/* name */
- "", /* comment */
- lithuanian, /* tailoring: rule set defined outside this section */
- NULL, /* ctype */
- NULL, /* to_lower */
- NULL, /* to_upper */
- NULL, /* sort_order: NULL here; NOTE(review): presumably set up at init time -- confirm */
- NULL, /* contractions: NULL here, same caveat as sort_order */
- NULL, /* sort_order_big: NULL here, same caveat as sort_order */
- NULL, /* tab_to_uni */
- NULL, /* tab_from_uni */
- NULL, /* state_map */
- NULL, /* ident_map */
- 8, /* strxfrm_multiply */
- 2, /* mbminlen: equals mbmaxlen, i.e. fixed two-byte characters */
- 2, /* mbmaxlen */
- 9, /* min_sort_char */
- 0xFFFF, /* max_sort_char */
- 0, /* escape_with_backslash_is_dangerous */
- &my_charset_ucs2_handler, /* character set handler */
- &my_collation_ucs2_uca_handler /* collation handler */
- };
- CHARSET_INFO my_charset_ucs2_slovak_uca_ci= /* ucs2 collation "ucs2_slovak_ci" (number 141), tailored by the `slovak` rules */
- {
- 141,0,0, /* number */
- MY_CS_COMPILED|MY_CS_STRNXFRM|MY_CS_UNICODE, /* flags */
- "ucs2", /* cs name */
- "ucs2_slovak_ci", /* name */
- "", /* comment */
- slovak, /* tailoring: rule set defined outside this section */
- NULL, /* ctype */
- NULL, /* to_lower */
- NULL, /* to_upper */
- NULL, /* sort_order: NULL here; NOTE(review): presumably set up at init time -- confirm */
- NULL, /* contractions: NULL here, same caveat as sort_order */
- NULL, /* sort_order_big: NULL here, same caveat as sort_order */
- NULL, /* tab_to_uni */
- NULL, /* tab_from_uni */
- NULL, /* state_map */
- NULL, /* ident_map */
- 8, /* strxfrm_multiply */
- 2, /* mbminlen: equals mbmaxlen, i.e. fixed two-byte characters */
- 2, /* mbmaxlen */
- 9, /* min_sort_char */
- 0xFFFF, /* max_sort_char */
- 0, /* escape_with_backslash_is_dangerous */
- &my_charset_ucs2_handler, /* character set handler */
- &my_collation_ucs2_uca_handler /* collation handler */
- };
- CHARSET_INFO my_charset_ucs2_spanish2_uca_ci= /* ucs2 collation "ucs2_spanish2_ci" (number 142), tailored by the `spanish2` rules */
- {
- 142,0,0, /* number */
- MY_CS_COMPILED|MY_CS_STRNXFRM|MY_CS_UNICODE, /* flags */
- "ucs2", /* cs name */
- "ucs2_spanish2_ci", /* name */
- "", /* comment */
- spanish2, /* tailoring: rule set defined outside this section */
- NULL, /* ctype */
- NULL, /* to_lower */
- NULL, /* to_upper */
- NULL, /* sort_order: NULL here; NOTE(review): presumably set up at init time -- confirm */
- NULL, /* contractions: NULL here, same caveat as sort_order */
- NULL, /* sort_order_big: NULL here, same caveat as sort_order */
- NULL, /* tab_to_uni */
- NULL, /* tab_from_uni */
- NULL, /* state_map */
- NULL, /* ident_map */
- 8, /* strxfrm_multiply */
- 2, /* mbminlen: equals mbmaxlen, i.e. fixed two-byte characters */
- 2, /* mbmaxlen */
- 9, /* min_sort_char */
- 0xFFFF, /* max_sort_char */
- 0, /* escape_with_backslash_is_dangerous */
- &my_charset_ucs2_handler, /* character set handler */
- &my_collation_ucs2_uca_handler /* collation handler */
- };
- CHARSET_INFO my_charset_ucs2_roman_uca_ci= /* ucs2 collation "ucs2_roman_ci" (number 143), tailored by the `roman` rules */
- {
- 143,0,0, /* number */
- MY_CS_COMPILED|MY_CS_STRNXFRM|MY_CS_UNICODE, /* flags */
- "ucs2", /* cs name */
- "ucs2_roman_ci", /* name */
- "", /* comment */
- roman, /* tailoring: rule set defined outside this section */
- NULL, /* ctype */
- NULL, /* to_lower */
- NULL, /* to_upper */
- NULL, /* sort_order: NULL here; NOTE(review): presumably set up at init time -- confirm */
- NULL, /* contractions: NULL here, same caveat as sort_order */
- NULL, /* sort_order_big: NULL here, same caveat as sort_order */
- NULL, /* tab_to_uni */
- NULL, /* tab_from_uni */
- NULL, /* state_map */
- NULL, /* ident_map */
- 8, /* strxfrm_multiply */
- 2, /* mbminlen: equals mbmaxlen, i.e. fixed two-byte characters */
- 2, /* mbmaxlen */
- 9, /* min_sort_char */
- 0xFFFF, /* max_sort_char */
- 0, /* escape_with_backslash_is_dangerous */
- &my_charset_ucs2_handler, /* character set handler */
- &my_collation_ucs2_uca_handler /* collation handler */
- };
- CHARSET_INFO my_charset_ucs2_persian_uca_ci= /* ucs2 collation "ucs2_persian_ci" (number 144), tailored by the `persian` rules */
- {
- 144,0,0, /* number */
- MY_CS_COMPILED|MY_CS_STRNXFRM|MY_CS_UNICODE, /* flags */
- "ucs2", /* cs name */
- "ucs2_persian_ci", /* name */
- "", /* comment */
- persian, /* tailoring: rule set defined outside this section */
- NULL, /* ctype */
- NULL, /* to_lower */
- NULL, /* to_upper */
- NULL, /* sort_order: NULL here; NOTE(review): presumably set up at init time -- confirm */
- NULL, /* contractions: NULL here, same caveat as sort_order */
- NULL, /* sort_order_big: NULL here, same caveat as sort_order */
- NULL, /* tab_to_uni */
- NULL, /* tab_from_uni */
- NULL, /* state_map */
- NULL, /* ident_map */
- 8, /* strxfrm_multiply */
- 2, /* mbminlen: equals mbmaxlen, i.e. fixed two-byte characters */
- 2, /* mbmaxlen */
- 9, /* min_sort_char */
- 0xFFFF, /* max_sort_char */
- 0, /* escape_with_backslash_is_dangerous */
- &my_charset_ucs2_handler, /* character set handler */
- &my_collation_ucs2_uca_handler /* collation handler */
- };
- #endif
- #ifdef HAVE_CHARSET_utf8
- MY_COLLATION_HANDLER my_collation_any_uca_handler = /* collation handler referenced by the utf8 UCA collations in this file; slot notes other than "init" are inferred from the function names -- confirm against MY_COLLATION_HANDLER */
- {
- my_coll_init_uca, /* init */
- my_strnncoll_any_uca, /* string comparison (strnncoll) */
- my_strnncollsp_any_uca, /* string comparison, "sp" variant (strnncollsp) */
- my_strnxfrm_any_uca, /* sort key transformation (strnxfrm) */
- my_like_range_mb, /* LIKE range helper, generic multi-byte version */
- my_wildcmp_uca, /* wildcard comparison */
- NULL, /* slot left unset */
- my_instr_mb, /* substring search, generic multi-byte version */
- my_hash_sort_any_uca /* hashing */
- };
- /*
- We consider bytes with code more than 127 as a letter.
- This guarantees that word boundaries work fine with regular
- expressions. Note, there is no need to mark byte 255 as a
- letter, it is an illegal byte in UTF8.
- Layout: one leading 0 entry followed by 256 entries (16 rows of 16).
- The entries for bytes 128..254 are 3 and the entry for byte 255 is 0.
- NOTE(review): the per-row byte ranges noted below assume the leading
- entry is an offset slot and that the values are character class bit
- flags from the project's ctype header -- confirm.
- */
- static uchar ctype_utf8[] = {
- 0, /* leading entry, precedes the per-byte entries */
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 40, 40, 40, 40, 40, 32, 32, /* 0x00-0x0F */
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, /* 0x10-0x1F */
- 72, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, /* 0x20-0x2F */
- 132,132,132,132,132,132,132,132,132,132, 16, 16, 16, 16, 16, 16, /* 0x30-0x3F */
- 16,129,129,129,129,129,129, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x40-0x4F */
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 16, 16, 16, 16, 16, /* 0x50-0x5F */
- 16,130,130,130,130,130,130, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 0x60-0x6F */
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 16, 16, 16, 16, 32, /* 0x70-0x7F */
- 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* 0x80-0x8F */
- 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* 0x90-0x9F */
- 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* 0xA0-0xAF */
- 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* 0xB0-0xBF */
- 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* 0xC0-0xCF */
- 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* 0xD0-0xDF */
- 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* 0xE0-0xEF */
- 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0 /* 0xF0-0xFF; 0xFF is not marked as a letter */
- };
- extern MY_CHARSET_HANDLER my_charset_utf8_handler;
- CHARSET_INFO my_charset_utf8_general_uca_ci= /* utf8 collation "utf8_unicode_ci" (number 192): untailored, wired directly to the uca_length / uca_weight tables */
- {
- 192,0,0, /* number */
- MY_CS_COMPILED|MY_CS_STRNXFRM|MY_CS_UNICODE, /* flags */
- "utf8", /* cs name */
- "utf8_unicode_ci", /* name */
- "", /* comment */
- "", /* tailoring: empty string, no tailoring rules */
- ctype_utf8, /* ctype: byte classification table defined earlier in this file */
- NULL, /* to_lower */
- NULL, /* to_upper */
- uca_length, /* sort_order: the uca_length table */
- NULL, /* contractions */
- uca_weight, /* sort_order_big: the uca_weight table */
- NULL, /* tab_to_uni */
- NULL, /* tab_from_uni */
- NULL, /* state_map */
- NULL, /* ident_map */
- 8, /* strxfrm_multiply */
- 1, /* mbminlen: shortest encoded character is one byte */
- 3, /* mbmaxlen: longest encoded character is three bytes */
- 9, /* min_sort_char */
- 0xFFFF, /* max_sort_char */
- 0, /* escape_with_backslash_is_dangerous */
- &my_charset_utf8_handler, /* character set handler */
- &my_collation_any_uca_handler /* collation handler; its init slot is my_coll_init_uca */
- };
- CHARSET_INFO my_charset_utf8_icelandic_uca_ci= /* utf8 collation "utf8_icelandic_ci" (number 193), tailored by the `icelandic` rules */
- {
- 193,0,0, /* number */
- MY_CS_COMPILED|MY_CS_STRNXFRM|MY_CS_UNICODE, /* flags */
- "utf8", /* cs name */
- "utf8_icelandic_ci",/* name */
- "", /* comment */
- icelandic, /* tailoring: rule set defined outside this section */
- ctype_utf8, /* ctype: byte classification table defined earlier in this file */
- NULL, /* to_lower */
- NULL, /* to_upper */
- NULL, /* sort_order: NULL here; NOTE(review): presumably built by the handler's init hook -- confirm */
- NULL, /* contractions: NULL here, same caveat as sort_order */
- NULL, /* sort_order_big: NULL here, same caveat as sort_order */
- NULL, /* tab_to_uni */
- NULL, /* tab_from_uni */
- NULL, /* state_map */
- NULL, /* ident_map */
- 8, /* strxfrm_multiply */
- 1, /* mbminlen: shortest encoded character is one byte */
- 3, /* mbmaxlen: longest encoded character is three bytes */
- 9, /* min_sort_char */
- 0xFFFF, /* max_sort_char */
- 0, /* escape_with_backslash_is_dangerous */
- &my_charset_utf8_handler, /* character set handler */
- &my_collation_any_uca_handler /* collation handler; its init slot is my_coll_init_uca */
- };
- CHARSET_INFO my_charset_utf8_latvian_uca_ci= /* utf8 collation "utf8_latvian_ci" (number 194), tailored by the `latvian` rules */
- {
- 194,0,0, /* number */
- MY_CS_COMPILED|MY_CS_STRNXFRM|MY_CS_UNICODE, /* flags */
- "utf8", /* cs name */
- "utf8_latvian_ci", /* name */
- "", /* comment */
- latvian, /* tailoring: rule set defined outside this section */
- ctype_utf8, /* ctype: byte classification table defined earlier in this file */
- NULL, /* to_lower */
- NULL, /* to_upper */
- NULL, /* sort_order: NULL here; NOTE(review): presumably built by the handler's init hook -- confirm */
- NULL, /* contractions: NULL here, same caveat as sort_order */
- NULL, /* sort_order_big: NULL here, same caveat as sort_order */
- NULL, /* tab_to_uni */
- NULL, /* tab_from_uni */
- NULL, /* state_map */
- NULL, /* ident_map */
- 8, /* strxfrm_multiply */
- 1, /* mbminlen: shortest encoded character is one byte */
- 3, /* mbmaxlen: longest encoded character is three bytes */
- 9, /* min_sort_char */
- 0xFFFF, /* max_sort_char */
- 0, /* escape_with_backslash_is_dangerous */
- &my_charset_utf8_handler, /* character set handler */
- &my_collation_any_uca_handler /* collation handler; its init slot is my_coll_init_uca */
- };
- CHARSET_INFO my_charset_utf8_romanian_uca_ci= /* utf8 collation "utf8_romanian_ci" (number 195), tailored by the `romanian` rules */
- {
- 195,0,0, /* number */
- MY_CS_COMPILED|MY_CS_STRNXFRM|MY_CS_UNICODE, /* flags */
- "utf8", /* cs name */
- "utf8_romanian_ci", /* name */
- "", /* comment */
- romanian, /* tailoring: rule set defined outside this section */
- ctype_utf8, /* ctype: byte classification table defined earlier in this file */
- NULL, /* to_lower */
- NULL, /* to_upper */
- NULL, /* sort_order: NULL here; NOTE(review): presumably built by the handler's init hook -- confirm */
- NULL, /* contractions: NULL here, same caveat as sort_order */
- NULL, /* sort_order_big: NULL here, same caveat as sort_order */
- NULL, /* tab_to_uni */
- NULL, /* tab_from_uni */
- NULL, /* state_map */
- NULL, /* ident_map */
- 8, /* strxfrm_multiply */
- 1, /* mbminlen: shortest encoded character is one byte */
- 3, /* mbmaxlen: longest encoded character is three bytes */
- 9, /* min_sort_char */
- 0xFFFF, /* max_sort_char */
- 0, /* escape_with_backslash_is_dangerous */
- &my_charset_utf8_handler, /* character set handler */
- &my_collation_any_uca_handler /* collation handler; its init slot is my_coll_init_uca */
- };
- CHARSET_INFO my_charset_utf8_slovenian_uca_ci= /* utf8 collation "utf8_slovenian_ci" (number 196), tailored by the `slovenian` rules */
- {
- 196,0,0, /* number */
- MY_CS_COMPILED|MY_CS_STRNXFRM|MY_CS_UNICODE, /* flags */
- "utf8", /* cs name */
- "utf8_slovenian_ci",/* name */
- "", /* comment */
- slovenian, /* tailoring: rule set defined outside this section */
- ctype_utf8, /* ctype: byte classification table defined earlier in this file */
- NULL, /* to_lower */
- NULL, /* to_upper */
- NULL, /* sort_order: NULL here; NOTE(review): presumably built by the handler's init hook -- confirm */
- NULL, /* contractions: NULL here, same caveat as sort_order */
- NULL, /* sort_order_big: NULL here, same caveat as sort_order */
- NULL, /* tab_to_uni */
- NULL, /* tab_from_uni */
- NULL, /* state_map */
- NULL, /* ident_map */
- 8, /* strxfrm_multiply */
- 1, /* mbminlen: shortest encoded character is one byte */
- 3, /* mbmaxlen: longest encoded character is three bytes */
- 9, /* min_sort_char */
- 0xFFFF, /* max_sort_char */
- 0, /* escape_with_backslash_is_dangerous */
- &my_charset_utf8_handler, /* character set handler */
- &my_collation_any_uca_handler /* collation handler; its init slot is my_coll_init_uca */
- };
- CHARSET_INFO my_charset_utf8_polish_uca_ci= /* utf8 collation "utf8_polish_ci" (number 197), tailored by the `polish` rules */
- {
- 197,0,0, /* number */
- MY_CS_COMPILED|MY_CS_STRNXFRM|MY_CS_UNICODE, /* flags */
- "utf8", /* cs name */
- "utf8_polish_ci", /* name */
- "", /* comment */
- polish, /* tailoring: rule set defined outside this section */
- ctype_utf8, /* ctype: byte classification table defined earlier in this file */
- NULL, /* to_lower */
- NULL, /* to_upper */
- NULL, /* sort_order: NULL here; NOTE(review): presumably built by the handler's init hook -- confirm */
- NULL, /* contractions: NULL here, same caveat as sort_order */
- NULL, /* sort_order_big: NULL here, same caveat as sort_order */
- NULL, /* tab_to_uni */
- NULL, /* tab_from_uni */
- NULL, /* state_map */
- NULL, /* ident_map */
- 8, /* strxfrm_multiply */
- 1, /* mbminlen: shortest encoded character is one byte */
- 3, /* mbmaxlen: longest encoded character is three bytes */
- 9, /* min_sort_char */
- 0xFFFF, /* max_sort_char */
- 0, /* escape_with_backslash_is_dangerous */
- &my_charset_utf8_handler, /* character set handler */
- &my_collation_any_uca_handler /* collation handler; its init slot is my_coll_init_uca */
- };
- CHARSET_INFO my_charset_utf8_estonian_uca_ci= /* utf8 collation "utf8_estonian_ci" (number 198), tailored by the `estonian` rules */
- {
- 198,0,0, /* number */
- MY_CS_COMPILED|MY_CS_STRNXFRM|MY_CS_UNICODE, /* flags */
- "utf8", /* cs name */
- "utf8_estonian_ci", /* name */
- "", /* comment */
- estonian, /* tailoring: rule set defined outside this section */
- ctype_utf8, /* ctype: byte classification table defined earlier in this file */
- NULL, /* to_lower */
- NULL, /* to_upper */
- NULL, /* sort_order: NULL here; NOTE(review): presumably built by the handler's init hook -- confirm */
- NULL, /* contractions: NULL here, same caveat as sort_order */
- NULL, /* sort_order_big: NULL here, same caveat as sort_order */
- NULL, /* tab_to_uni */
- NULL, /* tab_from_uni */
- NULL, /* state_map */
- NULL, /* ident_map */
- 8, /* strxfrm_multiply */
- 1, /* mbminlen: shortest encoded character is one byte */
- 3, /* mbmaxlen: longest encoded character is three bytes */
- 9, /* min_sort_char */
- 0xFFFF, /* max_sort_char */
- 0, /* escape_with_backslash_is_dangerous */
- &my_charset_utf8_handler, /* character set handler */
- &my_collation_any_uca_handler /* collation handler; its init slot is my_coll_init_uca */
- };
- CHARSET_INFO my_charset_utf8_spanish_uca_ci= /* utf8 collation "utf8_spanish_ci" (number 199), tailored by the `spanish` rules */
- {
- 199,0,0, /* number */
- MY_CS_COMPILED|MY_CS_STRNXFRM|MY_CS_UNICODE, /* flags */
- "utf8", /* cs name */
- "utf8_spanish_ci", /* name */
- "", /* comment */
- spanish, /* tailoring: rule set defined outside this section */
- ctype_utf8, /* ctype: byte classification table defined earlier in this file */
- NULL, /* to_lower */
- NULL, /* to_upper */
- NULL, /* sort_order: NULL here; NOTE(review): presumably built by the handler's init hook -- confirm */
- NULL, /* contractions: NULL here, same caveat as sort_order */
- NULL, /* sort_order_big: NULL here, same caveat as sort_order */
- NULL, /* tab_to_uni */
- NULL, /* tab_from_uni */
- NULL, /* state_map */
- NULL, /* ident_map */
- 8, /* strxfrm_multiply */
- 1, /* mbminlen: shortest encoded character is one byte */
- 3, /* mbmaxlen: longest encoded character is three bytes */
- 9, /* min_sort_char */
- 0xFFFF, /* max_sort_char */
- 0, /* escape_with_backslash_is_dangerous */
- &my_charset_utf8_handler, /* character set handler */
- &my_collation_any_uca_handler /* collation handler; its init slot is my_coll_init_uca */
- };
- CHARSET_INFO my_charset_utf8_swedish_uca_ci= /* utf8 collation "utf8_swedish_ci" (number 200), tailored by the `swedish` rules */
- {
- 200,0,0, /* number */
- MY_CS_COMPILED|MY_CS_STRNXFRM|MY_CS_UNICODE, /* flags */
- "utf8", /* cs name */
- "utf8_swedish_ci", /* name */
- "", /* comment */
- swedish, /* tailoring: rule set defined outside this section */
- ctype_utf8, /* ctype: byte classification table defined earlier in this file */
- NULL, /* to_lower */
- NULL, /* to_upper */
- NULL, /* sort_order: NULL here; NOTE(review): presumably built by the handler's init hook -- confirm */
- NULL, /* contractions: NULL here, same caveat as sort_order */
- NULL, /* sort_order_big: NULL here, same caveat as sort_order */
- NULL, /* tab_to_uni */
- NULL, /* tab_from_uni */
- NULL, /* state_map */
- NULL, /* ident_map */
- 8, /* strxfrm_multiply */
- 1, /* mbminlen: shortest encoded character is one byte */
- 3, /* mbmaxlen: longest encoded character is three bytes */
- 9, /* min_sort_char */
- 0xFFFF, /* max_sort_char */
- 0, /* escape_with_backslash_is_dangerous */
- &my_charset_utf8_handler, /* character set handler */
- &my_collation_any_uca_handler /* collation handler; its init slot is my_coll_init_uca */
- };
- CHARSET_INFO my_charset_utf8_turkish_uca_ci= /* utf8 collation "utf8_turkish_ci" (number 201), tailored by the `turkish` rules */
- {
- 201,0,0, /* number */
- MY_CS_COMPILED|MY_CS_STRNXFRM|MY_CS_UNICODE, /* flags */
- "utf8", /* cs name */
- "utf8_turkish_ci", /* name */
- "", /* comment */
- turkish, /* tailoring: rule set defined outside this section */
- ctype_utf8, /* ctype: byte classification table defined earlier in this file */
- NULL, /* to_lower */
- NULL, /* to_upper */
- NULL, /* sort_order: NULL here; NOTE(review): presumably built by the handler's init hook -- confirm */
- NULL, /* contractions: NULL here, same caveat as sort_order */
- NULL, /* sort_order_big: NULL here, same caveat as sort_order */
- NULL, /* tab_to_uni */
- NULL, /* tab_from_uni */
- NULL, /* state_map */
- NULL, /* ident_map */
- 8, /* strxfrm_multiply */
- 1, /* mbminlen: shortest encoded character is one byte */
- 3, /* mbmaxlen: longest encoded character is three bytes */
- 9, /* min_sort_char */
- 0xFFFF, /* max_sort_char */
- 0, /* escape_with_backslash_is_dangerous */
- &my_charset_utf8_handler, /* character set handler */
- &my_collation_any_uca_handler /* collation handler; its init slot is my_coll_init_uca */
- };
- CHARSET_INFO my_charset_utf8_czech_uca_ci= /* utf8 collation "utf8_czech_ci" (number 202), tailored by the `czech` rules */
- {
- 202,0,0, /* number */
- MY_CS_COMPILED|MY_CS_STRNXFRM|MY_CS_UNICODE, /* flags */
- "utf8", /* cs name */
- "utf8_czech_ci", /* name */
- "", /* comment */
- czech, /* tailoring: rule set defined outside this section */
- ctype_utf8, /* ctype: byte classification table defined earlier in this file */
- NULL, /* to_lower */
- NULL, /* to_upper */
- NULL, /* sort_order: NULL here; NOTE(review): presumably built by the handler's init hook -- confirm */
- NULL, /* contractions: NULL here, same caveat as sort_order */
- NULL, /* sort_order_big: NULL here, same caveat as sort_order */
- NULL, /* tab_to_uni */
- NULL, /* tab_from_uni */
- NULL, /* state_map */
- NULL, /* ident_map */
- 8, /* strxfrm_multiply */
- 1, /* mbminlen: shortest encoded character is one byte */
- 3, /* mbmaxlen: longest encoded character is three bytes */
- 9, /* min_sort_char */
- 0xFFFF, /* max_sort_char */
- 0, /* escape_with_backslash_is_dangerous */
- &my_charset_utf8_handler, /* character set handler */
- &my_collation_any_uca_handler /* collation handler; its init slot is my_coll_init_uca */
- };
- CHARSET_INFO my_charset_utf8_danish_uca_ci= /* utf8 collation "utf8_danish_ci" (number 203), tailored by the `danish` rules */
- {
- 203,0,0, /* number */
- MY_CS_COMPILED|MY_CS_STRNXFRM|MY_CS_UNICODE, /* flags */
- "utf8", /* cs name */
- "utf8_danish_ci", /* name */
- "", /* comment */
- danish, /* tailoring: rule set defined outside this section */
- ctype_utf8, /* ctype: byte classification table defined earlier in this file */
- NULL, /* to_lower */
- NULL, /* to_upper */
- NULL, /* sort_order: NULL here; NOTE(review): presumably built by the handler's init hook -- confirm */
- NULL, /* contractions: NULL here, same caveat as sort_order */
- NULL, /* sort_order_big: NULL here, same caveat as sort_order */
- NULL, /* tab_to_uni */
- NULL, /* tab_from_uni */
- NULL, /* state_map */
- NULL, /* ident_map */
- 8, /* strxfrm_multiply */
- 1, /* mbminlen: shortest encoded character is one byte */
- 3, /* mbmaxlen: longest encoded character is three bytes */
- 9, /* min_sort_char */
- 0xFFFF, /* max_sort_char */
- 0, /* escape_with_backslash_is_dangerous */
- &my_charset_utf8_handler, /* character set handler */
- &my_collation_any_uca_handler /* collation handler; its init slot is my_coll_init_uca */
- };
- CHARSET_INFO my_charset_utf8_lithuanian_uca_ci= /* utf8 collation "utf8_lithuanian_ci" (number 204), tailored by the `lithuanian` rules */
- {
- 204,0,0, /* number */
- MY_CS_COMPILED|MY_CS_STRNXFRM|MY_CS_UNICODE, /* flags */
- "utf8", /* cs name */
- "utf8_lithuanian_ci",/* name */
- "", /* comment */
- lithuanian, /* tailoring: rule set defined outside this section */
- ctype_utf8, /* ctype: byte classification table defined earlier in this file */
- NULL, /* to_lower */
- NULL, /* to_upper */
- NULL, /* sort_order: NULL here; NOTE(review): presumably built by the handler's init hook -- confirm */
- NULL, /* contractions: NULL here, same caveat as sort_order */
- NULL, /* sort_order_big: NULL here, same caveat as sort_order */
- NULL, /* tab_to_uni */
- NULL, /* tab_from_uni */
- NULL, /* state_map */
- NULL, /* ident_map */
- 8, /* strxfrm_multiply */
- 1, /* mbminlen: shortest encoded character is one byte */
- 3, /* mbmaxlen: longest encoded character is three bytes */
- 9, /* min_sort_char */
- 0xFFFF, /* max_sort_char */
- 0, /* escape_with_backslash_is_dangerous */
- &my_charset_utf8_handler, /* character set handler */
- &my_collation_any_uca_handler /* collation handler; its init slot is my_coll_init_uca */
- };
- CHARSET_INFO my_charset_utf8_slovak_uca_ci= /* utf8 collation "utf8_slovak_ci" (number 205), tailored by the `slovak` rules */
- {
- 205,0,0, /* number */
- MY_CS_COMPILED|MY_CS_STRNXFRM|MY_CS_UNICODE, /* flags */
- "utf8", /* cs name */
- "utf8_slovak_ci", /* name */
- "", /* comment */
- slovak, /* tailoring: rule set defined outside this section */
- ctype_utf8, /* ctype: byte classification table defined earlier in this file */
- NULL, /* to_lower */
- NULL, /* to_upper */
- NULL, /* sort_order: NULL here; NOTE(review): presumably built by the handler's init hook -- confirm */
- NULL, /* contractions: NULL here, same caveat as sort_order */
- NULL, /* sort_order_big: NULL here, same caveat as sort_order */
- NULL, /* tab_to_uni */
- NULL, /* tab_from_uni */
- NULL, /* state_map */
- NULL, /* ident_map */
- 8, /* strxfrm_multiply */
- 1, /* mbminlen: shortest encoded character is one byte */
- 3, /* mbmaxlen: longest encoded character is three bytes */
- 9, /* min_sort_char */
- 0xFFFF, /* max_sort_char */
- 0, /* escape_with_backslash_is_dangerous */
- &my_charset_utf8_handler, /* character set handler */
- &my_collation_any_uca_handler /* collation handler; its init slot is my_coll_init_uca */
- };
- CHARSET_INFO my_charset_utf8_spanish2_uca_ci= /* utf8 collation "utf8_spanish2_ci" (number 206), tailored by the `spanish2` rules */
- {
- 206,0,0, /* number */
- MY_CS_COMPILED|MY_CS_STRNXFRM|MY_CS_UNICODE, /* flags */
- "utf8", /* cs name */
- "utf8_spanish2_ci", /* name */
- "", /* comment */
- spanish2, /* tailoring: rule set defined outside this section */
- ctype_utf8, /* ctype: byte classification table defined earlier in this file */
- NULL, /* to_lower */
- NULL, /* to_upper */
- NULL, /* sort_order: NULL here; NOTE(review): presumably built by the handler's init hook -- confirm */
- NULL, /* contractions: NULL here, same caveat as sort_order */
- NULL, /* sort_order_big: NULL here, same caveat as sort_order */
- NULL, /* tab_to_uni */
- NULL, /* tab_from_uni */
- NULL, /* state_map */
- NULL, /* ident_map */
- 8, /* strxfrm_multiply */
- 1, /* mbminlen: shortest encoded character is one byte */
- 3, /* mbmaxlen: longest encoded character is three bytes */
- 9, /* min_sort_char */
- 0xFFFF, /* max_sort_char */
- 0, /* escape_with_backslash_is_dangerous */
- &my_charset_utf8_handler, /* character set handler */
- &my_collation_any_uca_handler /* collation handler; its init slot is my_coll_init_uca */
- };
- CHARSET_INFO my_charset_utf8_roman_uca_ci= /* utf8 collation "utf8_roman_ci" (number 207), tailored by the `roman` rules */
- {
- 207,0,0, /* number */
- MY_CS_COMPILED|MY_CS_STRNXFRM|MY_CS_UNICODE, /* flags */
- "utf8", /* cs name */
- "utf8_roman_ci", /* name */
- "", /* comment */
- roman, /* tailoring: rule set defined outside this section */
- ctype_utf8, /* ctype: byte classification table defined earlier in this file */
- NULL, /* to_lower */
- NULL, /* to_upper */
- NULL, /* sort_order: NULL here; NOTE(review): presumably built by the handler's init hook -- confirm */
- NULL, /* contractions: NULL here, same caveat as sort_order */
- NULL, /* sort_order_big: NULL here, same caveat as sort_order */
- NULL, /* tab_to_uni */
- NULL, /* tab_from_uni */
- NULL, /* state_map */
- NULL, /* ident_map */
- 8, /* strxfrm_multiply */
- 1, /* mbminlen: shortest encoded character is one byte */
- 3, /* mbmaxlen: longest encoded character is three bytes */
- 9, /* min_sort_char */
- 0xFFFF, /* max_sort_char */
- 0, /* escape_with_backslash_is_dangerous */
- &my_charset_utf8_handler, /* character set handler */
- &my_collation_any_uca_handler /* collation handler; its init slot is my_coll_init_uca */
- };
- CHARSET_INFO my_charset_utf8_persian_uca_ci= /* utf8 collation "utf8_persian_ci" (number 208), tailored by the `persian` rules */
- {
- 208,0,0, /* number */
- MY_CS_COMPILED|MY_CS_STRNXFRM|MY_CS_UNICODE, /* flags */
- "utf8", /* cs name */
- "utf8_persian_ci", /* name */
- "", /* comment */
- persian, /* tailoring: rule set defined outside this section */
- ctype_utf8, /* ctype: byte classification table defined earlier in this file */
- NULL, /* to_lower */
- NULL, /* to_upper */
- NULL, /* sort_order: NULL here; NOTE(review): presumably built by the handler's init hook -- confirm */
- NULL, /* contractions: NULL here, same caveat as sort_order */
- NULL, /* sort_order_big: NULL here, same caveat as sort_order */
- NULL, /* tab_to_uni */
- NULL, /* tab_from_uni */
- NULL, /* state_map */
- NULL, /* ident_map */
- 8, /* strxfrm_multiply */
- 1, /* mbminlen: shortest encoded character is one byte */
- 3, /* mbmaxlen: longest encoded character is three bytes */
- 9, /* min_sort_char */
- 0xFFFF, /* max_sort_char */
- 0, /* escape_with_backslash_is_dangerous */
- &my_charset_utf8_handler, /* character set handler */
- &my_collation_any_uca_handler /* collation handler; its init slot is my_coll_init_uca */
- };
- #endif /* HAVE_CHARSET_utf8 */
- #endif /* HAVE_UCA_COLLATIONS */