Rename and slightly redefine the default text search parser's "word"
categories, as per discussion. asciiword (formerly lword) is still ASCII-letters-only, and numword (formerly word) is still the most general mixed-alpha-and-digits case. But word (formerly nlword) is now any-group-of-letters-with-at-least-one-non-ASCII, rather than all-non-ASCII as before. This is no worse than before for parsing mixed Russian/English text, which seems to have been the design center for the original coding; and it should simplify matters for parsing most European languages. In particular it will not be necessary for any language to accept strings containing digits as being regular "words". The hyphenated-word categories are adjusted similarly.
Parent: 344d0cae64
Commit: dbaec70c15
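For illustration only (not part of this commit's diff): under the renamed categories, a ts_debug check on a mixed string would be expected to come out roughly as below, assuming an english configuration and a UTF-8, non-C locale; the sample string and exact formatting are hypothetical.

SELECT alias, token FROM ts_debug('english', 'naïve beta1 foo-bar');
      alias      | token
-----------------+---------
 word            | naïve
 blank           |
 numword         | beta1
 blank           |
 asciihword      | foo-bar
 hword_asciipart | foo
 blank           | -
 hword_asciipart | bar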
@@ -1,4 +1,4 @@
<!-- $PostgreSQL: pgsql/doc/src/sgml/func.sgml,v 1.403 2007/10/22 20:13:37 tgl Exp $ -->
<!-- $PostgreSQL: pgsql/doc/src/sgml/func.sgml,v 1.404 2007/10/23 20:46:11 tgl Exp $ -->

<chapter id="functions">
<title>Functions and Operators</title>
@@ -7861,7 +7861,7 @@ CREATE TYPE rainbow AS ENUM ('red', 'orange', 'yellow', 'green', 'blue', 'purple
<entry><type>setof record</type></entry>
<entry>test a configuration</entry>
<entry><literal>ts_debug('english', 'The Brightest supernovaes')</literal></entry>
<entry><literal>(lword,"Latin word",The,{english_stem},english_stem,{}) ...</literal></entry>
<entry><literal>(asciiword,"Word, all ASCII",The,{english_stem},english_stem,{}) ...</literal></entry>
</row>
<row>
<entry><literal><function>ts_lexize</function>(<replaceable class="PARAMETER">dict</replaceable> <type>regdictionary</>, <replaceable class="PARAMETER">token</replaceable> <type>text</>)</literal></entry>
@@ -7889,14 +7889,14 @@ CREATE TYPE rainbow AS ENUM ('red', 'orange', 'yellow', 'green', 'blue', 'purple
<entry><type>setof record</type></entry>
<entry>get token types defined by parser</entry>
<entry><literal>ts_token_type('default')</literal></entry>
<entry><literal>(1,lword,"Latin word") ...</literal></entry>
<entry><literal>(1,asciiword,"Word, all ASCII") ...</literal></entry>
</row>
<row>
<entry><literal><function>ts_token_type</function>(<replaceable class="PARAMETER">parser_oid</> <type>oid</>, OUT <replaceable class="PARAMETER">tokid</> <type>integer</>, OUT <replaceable class="PARAMETER">alias</> <type>text</>, OUT <replaceable class="PARAMETER">description</> <type>text</>)</literal></entry>
<entry><type>setof record</type></entry>
<entry>get token types defined by parser</entry>
<entry><literal>ts_token_type(3722)</literal></entry>
<entry><literal>(1,lword,"Latin word") ...</literal></entry>
<entry><literal>(1,asciiword,"Word, all ASCII") ...</literal></entry>
</row>
<row>
<entry><literal><function>ts_stat</function>(<replaceable class="PARAMETER">sqlquery</replaceable> <type>text</>, <optional> <replaceable class="PARAMETER">weights</replaceable> <type>text</>, </optional> OUT <replaceable class="PARAMETER">word</replaceable> <type>text</>, OUT <replaceable class="PARAMETER">ndoc</replaceable> <type>integer</>, OUT <replaceable class="PARAMETER">nentry</replaceable> <type>integer</>)</literal></entry>
@@ -1,4 +1,4 @@
<!-- $PostgreSQL: pgsql/doc/src/sgml/textsearch.sgml,v 1.23 2007/10/22 20:13:37 tgl Exp $ -->
<!-- $PostgreSQL: pgsql/doc/src/sgml/textsearch.sgml,v 1.24 2007/10/23 20:46:12 tgl Exp $ -->

<chapter id="textsearch">
<title id="textsearch-title">Full Text Search</title>
@@ -1775,119 +1775,120 @@ LIMIT 10;
</thead>
<tbody>
<row>
<entry>lword</entry>
<entry>Latin word (only ASCII letters)</entry>
<entry><literal>asciiword</></entry>
<entry>Word, all ASCII letters</entry>
<entry><literal>foo</literal></entry>
</row>
<row>
<entry>nlword</entry>
<entry>Non-latin word (only non-ASCII letters)</entry>
<entry><literal></literal></entry>
<entry><literal>word</></entry>
<entry>Word, all letters</entry>
<entry><literal>føø</literal></entry>
</row>
<row>
<entry>word</entry>
<entry>Word (other cases)</entry>
<entry><literal>numword</></entry>
<entry>Word, letters and digits</entry>
<entry><literal>beta1</literal></entry>
</row>
<row>
<entry>lhword</entry>
<entry>Latin hyphenated word</entry>
<entry><literal>asciihword</></entry>
<entry>Hyphenated word, all ASCII</entry>
<entry><literal>foo-bar</literal></entry>
</row>
<row>
<entry>nlhword</entry>
<entry>Non-latin hyphenated word</entry>
<entry><literal></literal></entry>
<entry><literal>hword</></entry>
<entry>Hyphenated word, all letters</entry>
<entry><literal>føø-bar</literal></entry>
</row>
<row>
<entry>hword</entry>
<entry>Hyphenated word</entry>
<entry><literal>numhword</></entry>
<entry>Hyphenated word, letters and digits</entry>
<entry><literal>foo-beta1</literal></entry>
</row>
<row>
<entry>lpart_hword</entry>
<entry>Latin part of hyphenated word</entry>
<entry><literal>hword_asciipart</></entry>
<entry>Hyphenated word part, all ASCII</entry>
<entry><literal>foo</literal> or <literal>bar</literal> in the context
<literal>foo-bar</></entry>
<literal>foo-bar</literal></entry>
</row>
<row>
<entry>nlpart_hword</entry>
<entry>Non-latin part of hyphenated word</entry>
<entry><literal></literal></entry>
<entry><literal>hword_part</></entry>
<entry>Hyphenated word part, all letters</entry>
<entry><literal>føø</literal> in the context
<literal>føø-bar</literal></entry>
</row>
<row>
<entry>part_hword</entry>
<entry>Part of hyphenated word</entry>
<entry><literal>hword_numpart</></entry>
<entry>Hyphenated word part, letters and digits</entry>
<entry><literal>beta1</literal> in the context
<literal>foo-beta1</></entry>
<literal>foo-beta1</literal></entry>
</row>
<row>
<entry>email</entry>
<entry><literal>email</></entry>
<entry>Email address</entry>
<entry><literal>foo@bar.com</literal></entry>
</row>
<row>
<entry>protocol</entry>
<entry><literal>protocol</></entry>
<entry>Protocol head</entry>
<entry><literal>http://</literal></entry>
</row>
<row>
<entry>url</entry>
<entry><literal>url</></entry>
<entry>URL</entry>
<entry><literal>foo.com/stuff/index.html</literal></entry>
</row>
<row>
<entry>host</entry>
<entry><literal>host</></entry>
<entry>Host</entry>
<entry><literal>foo.com</literal></entry>
</row>
<row>
<entry>uri</entry>
<entry><literal>uri</></entry>
<entry>URI</entry>
<entry><literal>/stuff/index.html</literal>, in the context of a URL</entry>
</row>
<row>
<entry>file</entry>
<entry><literal>file</></entry>
<entry>File or path name</entry>
<entry><literal>/usr/local/foo.txt</literal>, if not within a URL</entry>
</row>
<row>
<entry>sfloat</entry>
<entry><literal>sfloat</></entry>
<entry>Scientific notation</entry>
<entry><literal>-1.234e56</literal></entry>
</row>
<row>
<entry>float</entry>
<entry><literal>float</></entry>
<entry>Decimal notation</entry>
<entry><literal>-1.234</literal></entry>
</row>
<row>
<entry>int</entry>
<entry><literal>int</></entry>
<entry>Signed integer</entry>
<entry><literal>-1234</literal></entry>
</row>
<row>
<entry>uint</entry>
<entry><literal>uint</></entry>
<entry>Unsigned integer</entry>
<entry><literal>1234</literal></entry>
</row>
<row>
<entry>version</entry>
<entry><literal>version</></entry>
<entry>Version number</entry>
<entry><literal>8.3.0</literal></entry>
</row>
<row>
<entry>tag</entry>
<entry>HTML Tag</entry>
<entry><literal>tag</></entry>
<entry>HTML tag</entry>
<entry><literal><A HREF="dictionaries.html"></literal></entry>
</row>
<row>
<entry>entity</entry>
<entry>HTML Entity</entry>
<entry><literal>entity</></entry>
<entry>HTML entity</entry>
<entry><literal>&amp;</literal></entry>
</row>
<row>
<entry>blank</entry>
<entry><literal>blank</></entry>
<entry>Space symbols</entry>
<entry>(any whitespace or punctuation not otherwise recognized)</entry>
</row>
@@ -1895,6 +1896,17 @@ LIMIT 10;
</tgroup>
</table>

<note>
<para>
The parser's notion of a <quote>letter</> is determined by the server's
locale setting, specifically <varname>lc_ctype</>. Words containing
only the basic ASCII letters are reported as a separate token type,
since it is sometimes useful to distinguish them. In most European
languages, token types <literal>word</> and <literal>asciiword</>
should always be treated alike.
</para>
</note>

<para>
It is possible for the parser to produce overlapping tokens from the same
piece of text. As an example, a hyphenated word will be reported both
@@ -1903,13 +1915,13 @@ LIMIT 10;
<programlisting>
SELECT alias, description, token FROM ts_debug('foo-bar-beta1');
alias | description | token
-------------+-------------------------------+---------------
hword | Hyphenated word | foo-bar-beta1
lpart_hword | Latin part of hyphenated word | foo
-----------------+------------------------------------------+---------------
numhword | Hyphenated word, letters and digits | foo-bar-beta1
hword_asciipart | Hyphenated word part, all ASCII | foo
blank | Space symbols | -
lpart_hword | Latin part of hyphenated word | bar
hword_asciipart | Hyphenated word part, all ASCII | bar
blank | Space symbols | -
part_hword | Part of hyphenated word | beta1
hword_numpart | Hyphenated word part, letters and digits | beta1
</programlisting>

This behavior is desirable since it allows searches to work for both
@@ -2045,13 +2057,13 @@ SELECT alias, description, token FROM ts_debug('http://foo.com/stuff/index.html'
a <application>Snowball</> stemmer or <literal>simple</>, which
recognizes everything. For example, for an astronomy-specific search
(<literal>astro_en</literal> configuration) one could bind token type
<type>lword</type> (Latin word) to a synonym dictionary of astronomical
<type>asciiword</type> (ASCII word) to a synonym dictionary of astronomical
terms, a general English dictionary and a <application>Snowball</> English
stemmer:

<programlisting>
ALTER TEXT SEARCH CONFIGURATION astro_en
ADD MAPPING FOR lword WITH astrosyn, english_ispell, english_stem;
ADD MAPPING FOR asciiword WITH astrosyn, english_ispell, english_stem;
</programlisting>
</para>

@@ -2188,8 +2200,8 @@ SELECT ts_lexize('public.simple_dict','The');
<programlisting>
SELECT * FROM ts_debug('english', 'Paris');
alias | description | token | dictionaries | dictionary | lexemes
-------+-------------+-------+----------------+--------------+---------
lword | Latin word | Paris | {english_stem} | english_stem | {pari}
-----------+-----------------+-------+----------------+--------------+---------
asciiword | Word, all ASCII | Paris | {english_stem} | english_stem | {pari}

CREATE TEXT SEARCH DICTIONARY my_synonym (
TEMPLATE = synonym,
@@ -2197,12 +2209,12 @@ CREATE TEXT SEARCH DICTIONARY my_synonym (
);

ALTER TEXT SEARCH CONFIGURATION english
ALTER MAPPING FOR lword WITH my_synonym, english_stem;
ALTER MAPPING FOR asciiword WITH my_synonym, english_stem;

SELECT * FROM ts_debug('english', 'Paris');
alias | description | token | dictionaries | dictionary | lexemes
-------+-------------+-------+---------------------------+------------+---------
lword | Latin word | Paris | {my_synonym,english_stem} | my_synonym | {paris}
-----------+-----------------+-------+---------------------------+------------+---------
asciiword | Word, all ASCII | Paris | {my_synonym,english_stem} | my_synonym | {paris}
</programlisting>
</para>

@@ -2293,7 +2305,7 @@ the one a two : swsw2
uses these assignments to check if it should handle the next word or stop
accumulation. The thesaurus dictionary must be configured
carefully. For example, if the thesaurus dictionary is assigned to handle
only the <literal>lword</literal> token, then a thesaurus dictionary
only the <literal>asciiword</literal> token, then a thesaurus dictionary
definition like <literal>one 7</> will not work since token type
<literal>uint</literal> is not assigned to the thesaurus dictionary.
</para>
@@ -2353,7 +2365,7 @@ CREATE TEXT SEARCH DICTIONARY thesaurus_simple (

<programlisting>
ALTER TEXT SEARCH CONFIGURATION russian
ADD MAPPING FOR lword, lhword, lpart_hword WITH thesaurus_simple;
ADD MAPPING FOR asciiword, asciihword, hword_asciipart WITH thesaurus_simple;
</programlisting>
</para>

@@ -2382,7 +2394,7 @@ CREATE TEXT SEARCH DICTIONARY thesaurus_astro (
);

ALTER TEXT SEARCH CONFIGURATION russian
ADD MAPPING FOR lword, lhword, lpart_hword WITH thesaurus_astro, english_stem;
ADD MAPPING FOR asciiword, asciihword, hword_asciipart WITH thesaurus_astro, english_stem;
</programlisting>

Now we can see how it works.
@@ -2633,12 +2645,13 @@ CREATE TEXT SEARCH DICTIONARY english_ispell (
);
</programlisting>

Now we can set up the mappings for Latin words for configuration
Now we can set up the mappings for words in configuration
<literal>pg</>:

<programlisting>
ALTER TEXT SEARCH CONFIGURATION pg
ALTER MAPPING FOR lword, lhword, lpart_hword
ALTER MAPPING FOR asciiword, asciihword, hword_asciipart,
word, hword, hword_part
WITH pg_dict, english_ispell, english_stem;
</programlisting>

@@ -2779,31 +2792,31 @@ SHOW default_text_search_config;
<programlisting>
SELECT * FROM ts_debug('english','a fat cat sat on a mat - it ate a fat rats');
alias | description | token | dictionaries | dictionary | lexemes
-------+---------------+-------+----------------+--------------+---------
lword | Latin word | a | {english_stem} | english_stem | {}
-----------+-----------------+-------+----------------+--------------+---------
asciiword | Word, all ASCII | a | {english_stem} | english_stem | {}
blank | Space symbols | | {} | |
lword | Latin word | fat | {english_stem} | english_stem | {fat}
asciiword | Word, all ASCII | fat | {english_stem} | english_stem | {fat}
blank | Space symbols | | {} | |
lword | Latin word | cat | {english_stem} | english_stem | {cat}
asciiword | Word, all ASCII | cat | {english_stem} | english_stem | {cat}
blank | Space symbols | | {} | |
lword | Latin word | sat | {english_stem} | english_stem | {sat}
asciiword | Word, all ASCII | sat | {english_stem} | english_stem | {sat}
blank | Space symbols | | {} | |
lword | Latin word | on | {english_stem} | english_stem | {}
asciiword | Word, all ASCII | on | {english_stem} | english_stem | {}
blank | Space symbols | | {} | |
lword | Latin word | a | {english_stem} | english_stem | {}
asciiword | Word, all ASCII | a | {english_stem} | english_stem | {}
blank | Space symbols | | {} | |
lword | Latin word | mat | {english_stem} | english_stem | {mat}
asciiword | Word, all ASCII | mat | {english_stem} | english_stem | {mat}
blank | Space symbols | | {} | |
blank | Space symbols | - | {} | |
lword | Latin word | it | {english_stem} | english_stem | {}
asciiword | Word, all ASCII | it | {english_stem} | english_stem | {}
blank | Space symbols | | {} | |
lword | Latin word | ate | {english_stem} | english_stem | {ate}
asciiword | Word, all ASCII | ate | {english_stem} | english_stem | {ate}
blank | Space symbols | | {} | |
lword | Latin word | a | {english_stem} | english_stem | {}
asciiword | Word, all ASCII | a | {english_stem} | english_stem | {}
blank | Space symbols | | {} | |
lword | Latin word | fat | {english_stem} | english_stem | {fat}
asciiword | Word, all ASCII | fat | {english_stem} | english_stem | {fat}
blank | Space symbols | | {} | |
lword | Latin word | rats | {english_stem} | english_stem | {rat}
asciiword | Word, all ASCII | rats | {english_stem} | english_stem | {rat}
</programlisting>
</para>

@@ -2824,23 +2837,23 @@ CREATE TEXT SEARCH DICTIONARY english_ispell (
);

ALTER TEXT SEARCH CONFIGURATION public.english
ALTER MAPPING FOR lword WITH english_ispell, english_stem;
ALTER MAPPING FOR asciiword WITH english_ispell, english_stem;
</programlisting>

<programlisting>
SELECT * FROM ts_debug('public.english','The Brightest supernovaes');
alias | description | token | dictionaries | dictionary | lexemes
-------+---------------+-------------+-------------------------------+----------------+-------------
lword | Latin word | The | {english_ispell,english_stem} | english_ispell | {}
-----------+-----------------+-------------+-------------------------------+----------------+-------------
asciiword | Word, all ASCII | The | {english_ispell,english_stem} | english_ispell | {}
blank | Space symbols | | {} | |
lword | Latin word | Brightest | {english_ispell,english_stem} | english_ispell | {bright}
asciiword | Word, all ASCII | Brightest | {english_ispell,english_stem} | english_ispell | {bright}
blank | Space symbols | | {} | |
lword | Latin word | supernovaes | {english_ispell,english_stem} | english_stem | {supernova}
asciiword | Word, all ASCII | supernovaes | {english_ispell,english_stem} | english_stem | {supernova}
</programlisting>

<para>
In this example, the word <literal>Brightest</> was recognized by the
parser as a <literal>Latin word</literal> (alias <literal>lword</literal>).
parser as an <literal>ASCII word</literal> (alias <literal>asciiword</literal>).
For this token type the dictionary list is
<literal>english_ispell</> and
<literal>english_stem</literal>. The word was recognized by
@@ -2869,12 +2882,12 @@ SELECT * FROM ts_debug('public.english','The Brightest supernovaes');
SELECT alias, token, dictionary, lexemes
FROM ts_debug('public.english','The Brightest supernovaes');
alias | token | dictionary | lexemes
-------+-------------+----------------+-------------
lword | The | english_ispell | {}
-----------+-------------+----------------+-------------
asciiword | The | english_ispell | {}
blank | | |
lword | Brightest | english_ispell | {bright}
asciiword | Brightest | english_ispell | {bright}
blank | | |
lword | supernovaes | english_stem | {supernova}
asciiword | supernovaes | english_stem | {supernova}
</programlisting>
</para>

@@ -2936,30 +2949,30 @@ SELECT * FROM ts_parse('default', '123 - a number');
<programlisting>
SELECT * FROM ts_token_type('default');
tokid | alias | description
-------+--------------+-----------------------------------
1 | lword | Latin word
2 | nlword | Non-latin word
3 | word | Word
4 | email | Email
-------+-----------------+------------------------------------------
1 | asciiword | Word, all ASCII
2 | word | Word, all letters
3 | numword | Word, letters and digits
4 | email | Email address
5 | url | URL
6 | host | Host
7 | sfloat | Scientific notation
8 | version | VERSION
9 | part_hword | Part of hyphenated word
10 | nlpart_hword | Non-latin part of hyphenated word
11 | lpart_hword | Latin part of hyphenated word
8 | version | Version number
9 | hword_numpart | Hyphenated word part, letters and digits
10 | hword_part | Hyphenated word part, all letters
11 | hword_asciipart | Hyphenated word part, all ASCII
12 | blank | Space symbols
13 | tag | HTML Tag
13 | tag | HTML tag
14 | protocol | Protocol head
15 | hword | Hyphenated word
16 | lhword | Latin hyphenated word
17 | nlhword | Non-latin hyphenated word
15 | numhword | Hyphenated word, letters and digits
16 | asciihword | Hyphenated word, all ASCII
17 | hword | Hyphenated word, all letters
18 | uri | URI
19 | file | File or path name
20 | float | Decimal notation
21 | int | Signed integer
22 | uint | Unsigned integer
23 | entity | HTML Entity
23 | entity | HTML entity
</programlisting>
</para>

@@ -3305,20 +3318,20 @@ EXPLAIN SELECT * FROM apod WHERE textsearch @@ to_tsquery('supernovae');
Text search configuration "pg_catalog.russian"
Parser: "pg_catalog.default"
Token | Dictionaries
--------------+--------------
-----------------+--------------
asciihword | english_stem
asciiword | english_stem
email | simple
file | simple
float | simple
host | simple
hword | russian_stem
hword_asciipart | english_stem
hword_numpart | simple
hword_part | russian_stem
int | simple
lhword | english_stem
lpart_hword | english_stem
lword | english_stem
nlhword | russian_stem
nlpart_hword | russian_stem
nlword | russian_stem
part_hword | russian_stem
numhword | simple
numword | simple
sfloat | simple
uint | simple
uri | simple
@@ -3391,30 +3404,30 @@ Parser: "pg_catalog.default"

Token types for parser "pg_catalog.default"
Token name | Description
--------------+-----------------------------------
-----------------+------------------------------------------
asciihword | Hyphenated word, all ASCII
asciiword | Word, all ASCII
blank | Space symbols
email | Email
entity | HTML Entity
email | Email address
entity | HTML entity
file | File or path name
float | Decimal notation
host | Host
hword | Hyphenated word
hword | Hyphenated word, all letters
hword_asciipart | Hyphenated word part, all ASCII
hword_numpart | Hyphenated word part, letters and digits
hword_part | Hyphenated word part, all letters
int | Signed integer
lhword | Latin hyphenated word
lpart_hword | Latin part of hyphenated word
lword | Latin word
nlhword | Non-latin hyphenated word
nlpart_hword | Non-latin part of hyphenated word
nlword | Non-latin word
part_hword | Part of hyphenated word
numhword | Hyphenated word, letters and digits
numword | Word, letters and digits
protocol | Protocol head
sfloat | Scientific notation
tag | HTML Tag
tag | HTML tag
uint | Unsigned integer
uri | URI
url | URL
version | VERSION
word | Word
version | Version number
word | Word, all letters
(23 rows)
</programlisting>
</para>

@@ -2,7 +2,7 @@
#
# Makefile for src/backend/snowball
#
# $PostgreSQL: pgsql/src/backend/snowball/Makefile,v 1.3 2007/08/27 10:29:49 mha Exp $
# $PostgreSQL: pgsql/src/backend/snowball/Makefile,v 1.4 2007/10/23 20:46:12 tgl Exp $
#
#-------------------------------------------------------------------------

@@ -46,8 +46,9 @@ OBJS= dict_snowball.o api.o utilities.o \
stem_UTF_8_swedish.o \
stem_UTF_8_turkish.o

# second column is name of latin dictionary, if different
# Note order dependency: use of some other language as latin dictionary
# first column is language name and also name of dictionary for not-all-ASCII
# words, second is name of dictionary for all-ASCII words
# Note order dependency: use of some other language as ASCII dictionary
# must come after creation of that language
LANGUAGES= \
danish danish \
@@ -95,8 +96,8 @@ ifeq ($(enable_shared), yes)
while [ "$$#" -gt 0 ] ; \
do \
lang=$$1; shift; \
nonlatdictname=$$lang; \
latdictname=$$1; shift; \
nonascdictname=$$lang; \
ascdictname=$$1; shift; \
if [ -s $(srcdir)/stopwords/$${lang}.stop ] ; then \
stop=", StopWords=$${lang}" ; \
else \
@@ -106,8 +107,8 @@ ifeq ($(enable_shared), yes)
sed -e "s#_LANGNAME_#$$lang#g" | \
sed -e "s#_DICTNAME_#$${lang}_stem#g" | \
sed -e "s#_CFGNAME_#$$lang#g" | \
sed -e "s#_LATDICTNAME_#$${latdictname}_stem#g" | \
sed -e "s#_NONLATDICTNAME_#$${nonlatdictname}_stem#g" | \
sed -e "s#_ASCDICTNAME_#$${ascdictname}_stem#g" | \
sed -e "s#_NONASCDICTNAME_#$${nonascdictname}_stem#g" | \
sed -e "s#_STOPWORDS_#$$stop#g" ; \
done >> $@
else
@@ -1,4 +1,4 @@
-- $PostgreSQL: pgsql/src/backend/snowball/snowball.sql.in,v 1.4 2007/09/03 02:30:43 tgl Exp $$
-- $PostgreSQL: pgsql/src/backend/snowball/snowball.sql.in,v 1.5 2007/10/23 20:46:12 tgl Exp $$

-- text search configuration for _LANGNAME_ language
CREATE TEXT SEARCH DICTIONARY _DICTNAME_
@@ -12,14 +12,15 @@ CREATE TEXT SEARCH CONFIGURATION _CFGNAME_
COMMENT ON TEXT SEARCH CONFIGURATION _CFGNAME_ IS 'configuration for _LANGNAME_ language';

ALTER TEXT SEARCH CONFIGURATION _CFGNAME_ ADD MAPPING
FOR email, url, host, sfloat, version, uri, file, float, int, uint
FOR email, url, host, sfloat, version, uri, file, float, int, uint,
numword, hword_numpart, numhword
WITH simple;

ALTER TEXT SEARCH CONFIGURATION _CFGNAME_ ADD MAPPING
FOR lhword, lpart_hword, lword
WITH _LATDICTNAME_;
FOR asciiword, hword_asciipart, asciihword
WITH _ASCDICTNAME_;

ALTER TEXT SEARCH CONFIGURATION _CFGNAME_ ADD MAPPING
FOR hword, nlhword, nlpart_hword, nlword, word, part_hword
WITH _NONLATDICTNAME_;
FOR word, hword_part, hword
WITH _NONASCDICTNAME_;

@@ -1,13 +1,13 @@
/*-------------------------------------------------------------------------
 *
 * wparser_def.c
 *		Standard word parser
 *		Default text search parser
 *
 * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
 *
 *
 * IDENTIFICATION
 *	$PostgreSQL: pgsql/src/backend/tsearch/wparser_def.c,v 1.3 2007/09/07 15:09:55 teodor Exp $
 *	$PostgreSQL: pgsql/src/backend/tsearch/wparser_def.c,v 1.4 2007/10/23 20:46:12 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@@ -22,26 +22,25 @@
#include "utils/builtins.h"


/* rememder !!!! */
#define LASTNUM 23
/* Output token categories */

#define LATWORD 1
#define CYRWORD 2
#define UWORD 3
#define ASCIIWORD 1
#define WORD_T 2
#define NUMWORD 3
#define EMAIL 4
#define FURL 5
#define URL_T 5
#define HOST 6
#define SCIENTIFIC 7
#define VERSIONNUMBER 8
#define PARTHYPHENWORD 9
#define CYRPARTHYPHENWORD 10
#define LATPARTHYPHENWORD 11
#define NUMPARTHWORD 9
#define PARTHWORD 10
#define ASCIIPARTHWORD 11
#define SPACE 12
#define TAG 13
#define TAG_T 13
#define PROTOCOL 14
#define HYPHENWORD 15
#define LATHYPHENWORD 16
#define CYRHYPHENWORD 17
#define NUMHWORD 15
#define ASCIIHWORD 16
#define HWORD 17
#define URI 18
#define FILEPATH 19
#define DECIMAL 20
@@ -49,52 +48,27 @@
#define UNSIGNEDINT 22
#define HTMLENTITY 23

static const char *lex_descr[] = {
"",
"Latin word",
"Non-latin word",
"Word",
"Email",
"URL",
"Host",
"Scientific notation",
"VERSION",
"Part of hyphenated word",
"Non-latin part of hyphenated word",
"Latin part of hyphenated word",
"Space symbols",
"HTML Tag",
"Protocol head",
"Hyphenated word",
"Latin hyphenated word",
"Non-latin hyphenated word",
"URI",
"File or path name",
"Decimal notation",
"Signed integer",
"Unsigned integer",
"HTML Entity"
};
#define LASTNUM 23

static const char *tok_alias[] = {
static const char * const tok_alias[] = {
"",
"lword",
"nlword",
"asciiword",
"word",
"numword",
"email",
"url",
"host",
"sfloat",
"version",
"part_hword",
"nlpart_hword",
"lpart_hword",
"hword_numpart",
"hword_part",
"hword_asciipart",
"blank",
"tag",
"protocol",
"numhword",
"asciihword",
"hword",
"lhword",
"nlhword",
"uri",
"file",
"float",
@@ -103,12 +77,42 @@ static const char *tok_alias[] = {
"entity"
};

static const char * const lex_descr[] = {
"",
"Word, all ASCII",
"Word, all letters",
"Word, letters and digits",
"Email address",
"URL",
"Host",
"Scientific notation",
"Version number",
"Hyphenated word part, letters and digits",
"Hyphenated word part, all letters",
"Hyphenated word part, all ASCII",
"Space symbols",
"HTML tag",
"Protocol head",
"Hyphenated word, letters and digits",
"Hyphenated word, all ASCII",
"Hyphenated word, all letters",
"URI",
"File or path name",
"Decimal notation",
"Signed integer",
"Unsigned integer",
"HTML entity"
};
|
||||
|
||||
|
||||
/* Parser states */
|
||||
|
||||
typedef enum
|
||||
{
|
||||
TPS_Base = 0,
|
||||
TPS_InUWord,
|
||||
TPS_InLatWord,
|
||||
TPS_InCyrWord,
|
||||
TPS_InNumWord,
|
||||
TPS_InAsciiWord,
|
||||
TPS_InWord,
|
||||
TPS_InUnsignedInt,
|
||||
TPS_InSignedIntFirst,
|
||||
TPS_InSignedInt,
|
||||
@ -167,20 +171,20 @@ typedef enum
|
||||
TPS_InProtocolFirst,
|
||||
TPS_InProtocolSecond,
|
||||
TPS_InProtocolEnd,
|
||||
TPS_InHyphenLatWordFirst,
|
||||
TPS_InHyphenLatWord,
|
||||
TPS_InHyphenCyrWordFirst,
|
||||
TPS_InHyphenCyrWord,
|
||||
TPS_InHyphenUWordFirst,
|
||||
TPS_InHyphenUWord,
|
||||
TPS_InHyphenAsciiWordFirst,
|
||||
TPS_InHyphenAsciiWord,
|
||||
TPS_InHyphenWordFirst,
|
||||
TPS_InHyphenWord,
|
||||
TPS_InHyphenNumWordFirst,
|
||||
TPS_InHyphenNumWord,
|
||||
TPS_InHyphenValueFirst,
|
||||
TPS_InHyphenValue,
|
||||
TPS_InHyphenValueExact,
|
||||
TPS_InParseHyphen,
|
||||
TPS_InParseHyphenHyphen,
|
||||
TPS_InHyphenCyrWordPart,
|
||||
TPS_InHyphenLatWordPart,
|
||||
TPS_InHyphenUWordPart,
|
||||
TPS_InHyphenWordPart,
|
||||
TPS_InHyphenAsciiWordPart,
|
||||
TPS_InHyphenNumWordPart,
|
||||
TPS_InHyphenUnsignedInt,
|
||||
TPS_InHDecimalPartFirst,
|
||||
TPS_InHDecimalPart,
|
||||
@ -192,7 +196,6 @@ typedef enum
|
||||
/* forward declaration */
|
||||
struct TParser;
|
||||
|
||||
|
||||
typedef int (*TParserCharTest) (struct TParser *); /* any p_is* functions
|
||||
* except p_iseq */
|
||||
typedef void (*TParserSpecial) (struct TParser *); /* special handler for
|
||||
@ -208,6 +211,16 @@ typedef struct
|
||||
TParserSpecial special;
|
||||
} TParserStateActionItem;
|
||||
|
||||
/* Flag bits in TParserStateActionItem.flags */
|
||||
#define A_NEXT 0x0000
|
||||
#define A_BINGO 0x0001
|
||||
#define A_POP 0x0002
|
||||
#define A_PUSH 0x0004
|
||||
#define A_RERUN 0x0008
|
||||
#define A_CLEAR 0x0010
|
||||
#define A_MERGE 0x0020
|
||||
#define A_CLRALL 0x0040
|
||||
|
||||
typedef struct
|
||||
{
|
||||
TParserState state;
|
||||
@ -255,6 +268,11 @@ typedef struct TParser
|
||||
|
||||
} TParser;
|
||||
|
||||
|
||||
/* forward decls here */
|
||||
static bool TParserGet(TParser * prs);
|
||||
|
||||
|
||||
static TParserPosition *
|
||||
newTParserPosition(TParserPosition * prev)
|
||||
{
|
||||
@ -303,8 +321,6 @@ TParserInit(char *str, int len)
|
||||
return prs;
|
||||
}
|
||||
|
||||
static bool TParserGet(TParser * prs);
|
||||
|
||||
static void
|
||||
TParserClose(TParser * prs)
|
||||
{
|
||||
@ -325,10 +341,10 @@ TParserClose(TParser * prs)
|
||||
}
|
||||
|
||||
/*
|
||||
* defining support function, equvalent is* macroses, but
|
||||
* Character-type support functions, equivalent to is* macros, but
|
||||
* working with any possible encodings and locales. Note,
|
||||
* that with multibyte encoding and C-locale isw* function may fail
|
||||
* or give wrong result. Note 2: multibyte encoding and C-local
|
||||
* or give wrong result. Note 2: multibyte encoding and C-locale
|
||||
* often are used for Asian languages
|
||||
*/
|
||||
|
||||
@ -487,17 +503,13 @@ p_isascii(TParser * prs)
|
||||
}
|
||||
|
||||
static int
|
||||
p_islatin(TParser * prs)
|
||||
p_isasclet(TParser * prs)
|
||||
{
|
||||
return (p_isalpha(prs) && p_isascii(prs)) ? 1 : 0;
|
||||
return (p_isascii(prs) && p_isalpha(prs)) ? 1 : 0;
|
||||
}
|
||||
|
||||
static int
|
||||
p_isnonlatin(TParser * prs)
|
||||
{
|
||||
return (p_isalpha(prs) && !p_isascii(prs)) ? 1 : 0;
|
||||
}
|
||||
|
||||
/* deliberately suppress unused-function complaints for the above */
|
||||
void _make_compiler_happy(void);
|
||||
void
|
||||
_make_compiler_happy(void)
|
||||
@ -638,21 +650,12 @@ p_isURI(TParser * prs)
|
||||
* Table of state/action of parser
|
||||
*/
|
||||
|
||||
#define A_NEXT 0x0000
|
||||
#define A_BINGO 0x0001
|
||||
#define A_POP 0x0002
|
||||
#define A_PUSH 0x0004
|
||||
#define A_RERUN 0x0008
|
||||
#define A_CLEAR 0x0010
|
||||
#define A_MERGE 0x0020
|
||||
#define A_CLRALL 0x0040
|
||||
|
||||
static TParserStateActionItem actionTPS_Base[] = {
|
||||
{p_isEOF, 0, A_NEXT, TPS_Null, 0, NULL},
|
||||
{p_iseqC, '<', A_PUSH, TPS_InTagFirst, 0, NULL},
|
||||
{p_isignore, 0, A_NEXT, TPS_InSpace, 0, NULL},
|
||||
{p_islatin, 0, A_NEXT, TPS_InLatWord, 0, NULL},
|
||||
{p_isnonlatin, 0, A_NEXT, TPS_InCyrWord, 0, NULL},
|
||||
{p_isasclet, 0, A_NEXT, TPS_InAsciiWord, 0, NULL},
|
||||
{p_isalpha, 0, A_NEXT, TPS_InWord, 0, NULL},
|
||||
{p_isdigit, 0, A_NEXT, TPS_InUnsignedInt, 0, NULL},
|
||||
{p_iseqC, '-', A_PUSH, TPS_InSignedIntFirst, 0, NULL},
|
||||
{p_iseqC, '+', A_PUSH, TPS_InSignedIntFirst, 0, NULL},
|
||||
@ -664,37 +667,38 @@ static TParserStateActionItem actionTPS_Base[] = {
|
||||
};
|
||||
|
||||
|
||||
static TParserStateActionItem actionTPS_InUWord[] = {
|
||||
{p_isEOF, 0, A_BINGO, TPS_Base, UWORD, NULL},
|
||||
{p_isalnum, 0, A_NEXT, TPS_InUWord, 0, NULL},
|
||||
static TParserStateActionItem actionTPS_InNumWord[] = {
|
||||
{p_isEOF, 0, A_BINGO, TPS_Base, NUMWORD, NULL},
|
||||
{p_isalnum, 0, A_NEXT, TPS_InNumWord, 0, NULL},
|
||||
{p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
|
||||
{p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
|
||||
{p_iseqC, '.', A_PUSH, TPS_InFileNext, 0, NULL},
|
||||
{p_iseqC, '-', A_PUSH, TPS_InHyphenUWordFirst, 0, NULL},
|
||||
{NULL, 0, A_BINGO, TPS_Base, UWORD, NULL}
|
||||
{p_iseqC, '-', A_PUSH, TPS_InHyphenNumWordFirst, 0, NULL},
|
||||
{NULL, 0, A_BINGO, TPS_Base, NUMWORD, NULL}
|
||||
};
|
||||
|
||||
static TParserStateActionItem actionTPS_InLatWord[] = {
|
||||
{p_isEOF, 0, A_BINGO, TPS_Base, LATWORD, NULL},
|
||||
{p_islatin, 0, A_NEXT, TPS_Null, 0, NULL},
|
||||
static TParserStateActionItem actionTPS_InAsciiWord[] = {
|
||||
{p_isEOF, 0, A_BINGO, TPS_Base, ASCIIWORD, NULL},
|
||||
{p_isasclet, 0, A_NEXT, TPS_Null, 0, NULL},
|
||||
{p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
|
||||
{p_iseqC, '.', A_PUSH, TPS_InFileNext, 0, NULL},
|
||||
{p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
|
||||
{p_iseqC, '-', A_PUSH, TPS_InHyphenLatWordFirst, 0, NULL},
|
||||
{p_iseqC, '-', A_PUSH, TPS_InHyphenAsciiWordFirst, 0, NULL},
|
||||
{p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
|
||||
{p_iseqC, ':', A_PUSH, TPS_InProtocolFirst, 0, NULL},
|
||||
{p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
|
||||
{p_isdigit, 0, A_PUSH, TPS_InHost, 0, NULL},
|
||||
{p_isalnum, 0, A_NEXT, TPS_InUWord, 0, NULL},
|
||||
{NULL, 0, A_BINGO, TPS_Base, LATWORD, NULL}
|
||||
{p_isdigit, 0, A_NEXT, TPS_InNumWord, 0, NULL},
|
||||
{p_isalpha, 0, A_NEXT, TPS_InWord, 0, NULL},
|
||||
{NULL, 0, A_BINGO, TPS_Base, ASCIIWORD, NULL}
|
||||
};
|
||||
|
||||
static TParserStateActionItem actionTPS_InCyrWord[] = {
|
||||
{p_isEOF, 0, A_BINGO, TPS_Base, CYRWORD, NULL},
|
||||
{p_isnonlatin, 0, A_NEXT, TPS_Null, 0, NULL},
|
||||
{p_isalnum, 0, A_NEXT, TPS_InUWord, 0, NULL},
|
||||
{p_iseqC, '-', A_PUSH, TPS_InHyphenCyrWordFirst, 0, NULL},
|
||||
{NULL, 0, A_BINGO, TPS_Base, CYRWORD, NULL}
|
||||
static TParserStateActionItem actionTPS_InWord[] = {
|
||||
{p_isEOF, 0, A_BINGO, TPS_Base, WORD_T, NULL},
|
||||
{p_isalpha, 0, A_NEXT, TPS_Null, 0, NULL},
|
||||
{p_isdigit, 0, A_NEXT, TPS_InNumWord, 0, NULL},
|
||||
{p_iseqC, '-', A_PUSH, TPS_InHyphenWordFirst, 0, NULL},
|
||||
{NULL, 0, A_BINGO, TPS_Base, WORD_T, NULL}
|
||||
};
|
||||
|
||||
static TParserStateActionItem actionTPS_InUnsignedInt[] = {
|
||||
@ -704,8 +708,8 @@ static TParserStateActionItem actionTPS_InUnsignedInt[] = {
|
||||
{p_iseqC, '.', A_PUSH, TPS_InUDecimalFirst, 0, NULL},
|
||||
{p_iseqC, 'e', A_PUSH, TPS_InMantissaFirst, 0, NULL},
|
||||
{p_iseqC, 'E', A_PUSH, TPS_InMantissaFirst, 0, NULL},
|
||||
{p_islatin, 0, A_PUSH, TPS_InHost, 0, NULL},
|
||||
{p_isalpha, 0, A_NEXT, TPS_InUWord, 0, NULL},
|
||||
{p_isasclet, 0, A_PUSH, TPS_InHost, 0, NULL},
|
||||
{p_isalpha, 0, A_NEXT, TPS_InNumWord, 0, NULL},
|
||||
{p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
|
||||
{NULL, 0, A_BINGO, TPS_Base, UNSIGNEDINT, NULL}
|
||||
};
|
||||
@ -816,13 +820,13 @@ static TParserStateActionItem actionTPS_InMantissa[] = {
|
||||
static TParserStateActionItem actionTPS_InHTMLEntityFirst[] = {
|
||||
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
|
||||
{p_iseqC, '#', A_NEXT, TPS_InHTMLEntityNumFirst, 0, NULL},
|
||||
{p_islatin, 0, A_NEXT, TPS_InHTMLEntity, 0, NULL},
|
||||
{p_isasclet, 0, A_NEXT, TPS_InHTMLEntity, 0, NULL},
|
||||
{NULL, 0, A_POP, TPS_Null, 0, NULL}
|
||||
};
|
||||
|
||||
static TParserStateActionItem actionTPS_InHTMLEntity[] = {
|
||||
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
|
||||
{p_islatin, 0, A_NEXT, TPS_InHTMLEntity, 0, NULL},
|
||||
{p_isasclet, 0, A_NEXT, TPS_InHTMLEntity, 0, NULL},
|
||||
{p_iseqC, ';', A_NEXT, TPS_InHTMLEntityEnd, 0, NULL},
|
||||
{NULL, 0, A_POP, TPS_Null, 0, NULL}
|
||||
};
|
||||
@ -849,7 +853,7 @@ static TParserStateActionItem actionTPS_InTagFirst[] = {
|
||||
{p_iseqC, '/', A_PUSH, TPS_InTagCloseFirst, 0, NULL},
|
||||
{p_iseqC, '!', A_PUSH, TPS_InCommentFirst, 0, NULL},
|
||||
{p_iseqC, '?', A_PUSH, TPS_InXMLBegin, 0, NULL},
|
||||
{p_islatin, 0, A_PUSH, TPS_InTagName, 0, NULL},
|
||||
{p_isasclet, 0, A_PUSH, TPS_InTagName, 0, NULL},
|
||||
{NULL, 0, A_POP, TPS_Null, 0, NULL}
|
||||
};
|
||||
|
||||
@ -863,7 +867,7 @@ static TParserStateActionItem actionTPS_InXMLBegin[] = {
|
||||
|
||||
static TParserStateActionItem actionTPS_InTagCloseFirst[] = {
|
||||
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
|
||||
{p_islatin, 0, A_NEXT, TPS_InTagName, 0, NULL},
|
||||
{p_isasclet, 0, A_NEXT, TPS_InTagName, 0, NULL},
|
||||
{NULL, 0, A_POP, TPS_Null, 0, NULL}
|
||||
};
|
||||
|
||||
@ -873,7 +877,7 @@ static TParserStateActionItem actionTPS_InTagName[] = {
|
||||
{p_iseqC, '/', A_NEXT, TPS_InTagBeginEnd, 0, NULL},
|
||||
{p_iseqC, '>', A_NEXT, TPS_InTagEnd, 0, SpecialTags},
|
||||
{p_isspace, 0, A_NEXT, TPS_InTag, 0, SpecialTags},
|
||||
{p_islatin, 0, A_NEXT, TPS_Null, 0, NULL},
|
||||
{p_isasclet, 0, A_NEXT, TPS_Null, 0, NULL},
|
||||
{NULL, 0, A_POP, TPS_Null, 0, NULL}
|
||||
};
|
||||
|
||||
@ -888,7 +892,7 @@ static TParserStateActionItem actionTPS_InTag[] = {
|
||||
{p_iseqC, '>', A_NEXT, TPS_InTagEnd, 0, SpecialTags},
|
||||
{p_iseqC, '\'', A_NEXT, TPS_InTagEscapeK, 0, NULL},
|
||||
{p_iseqC, '"', A_NEXT, TPS_InTagEscapeKK, 0, NULL},
|
||||
{p_islatin, 0, A_NEXT, TPS_Null, 0, NULL},
|
||||
{p_isasclet, 0, A_NEXT, TPS_Null, 0, NULL},
|
||||
{p_isdigit, 0, A_NEXT, TPS_Null, 0, NULL},
|
||||
{p_iseqC, '=', A_NEXT, TPS_Null, 0, NULL},
|
||||
{p_iseqC, '-', A_NEXT, TPS_Null, 0, NULL},
|
||||
@ -924,7 +928,7 @@ static TParserStateActionItem actionTPS_InTagBackSleshed[] = {
|
||||
};
|
||||
|
||||
static TParserStateActionItem actionTPS_InTagEnd[] = {
|
||||
{NULL, 0, A_BINGO | A_CLRALL, TPS_Base, TAG, NULL}
|
||||
{NULL, 0, A_BINGO | A_CLRALL, TPS_Base, TAG_T, NULL}
|
||||
};
|
||||
|
||||
static TParserStateActionItem actionTPS_InCommentFirst[] = {
|
||||
@ -962,19 +966,19 @@ static TParserStateActionItem actionTPS_InCloseCommentLast[] = {
|
||||
};
|
||||
|
||||
static TParserStateActionItem actionTPS_InCommentEnd[] = {
|
||||
{NULL, 0, A_BINGO | A_CLRALL, TPS_Base, TAG, NULL}
|
||||
{NULL, 0, A_BINGO | A_CLRALL, TPS_Base, TAG_T, NULL}
|
||||
};
|
||||
|
||||
static TParserStateActionItem actionTPS_InHostFirstDomain[] = {
|
||||
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
|
||||
{p_islatin, 0, A_NEXT, TPS_InHostDomainSecond, 0, NULL},
|
||||
{p_isasclet, 0, A_NEXT, TPS_InHostDomainSecond, 0, NULL},
|
||||
{p_isdigit, 0, A_NEXT, TPS_InHost, 0, NULL},
|
||||
{NULL, 0, A_POP, TPS_Null, 0, NULL}
|
||||
};
|
||||
|
||||
static TParserStateActionItem actionTPS_InHostDomainSecond[] = {
|
||||
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
|
||||
{p_islatin, 0, A_NEXT, TPS_InHostDomain, 0, NULL},
|
||||
{p_isasclet, 0, A_NEXT, TPS_InHostDomain, 0, NULL},
|
||||
{p_isdigit, 0, A_PUSH, TPS_InHost, 0, NULL},
|
||||
{p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
|
||||
{p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
|
||||
@ -984,7 +988,7 @@ static TParserStateActionItem actionTPS_InHostDomainSecond[] = {
|
||||
|
||||
static TParserStateActionItem actionTPS_InHostDomain[] = {
|
||||
{p_isEOF, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, NULL},
|
||||
{p_islatin, 0, A_NEXT, TPS_InHostDomain, 0, NULL},
|
||||
{p_isasclet, 0, A_NEXT, TPS_InHostDomain, 0, NULL},
|
||||
{p_isdigit, 0, A_PUSH, TPS_InHost, 0, NULL},
|
||||
{p_iseqC, ':', A_PUSH, TPS_InPortFirst, 0, NULL},
|
||||
{p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
|
||||
@ -1013,14 +1017,14 @@ static TParserStateActionItem actionTPS_InPort[] = {
|
||||
static TParserStateActionItem actionTPS_InHostFirstAN[] = {
|
||||
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
|
||||
{p_isdigit, 0, A_NEXT, TPS_InHost, 0, NULL},
|
||||
{p_islatin, 0, A_NEXT, TPS_InHost, 0, NULL},
|
||||
{p_isasclet, 0, A_NEXT, TPS_InHost, 0, NULL},
|
||||
{NULL, 0, A_POP, TPS_Null, 0, NULL}
|
||||
};
|
||||
|
||||
static TParserStateActionItem actionTPS_InHost[] = {
|
||||
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
|
||||
{p_isdigit, 0, A_NEXT, TPS_InHost, 0, NULL},
|
||||
{p_islatin, 0, A_NEXT, TPS_InHost, 0, NULL},
|
||||
{p_isasclet, 0, A_NEXT, TPS_InHost, 0, NULL},
|
||||
{p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
|
||||
{p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
|
||||
{p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
|
||||
@ -1034,7 +1038,7 @@ static TParserStateActionItem actionTPS_InEmail[] = {
|
||||
|
||||
static TParserStateActionItem actionTPS_InFileFirst[] = {
|
||||
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
|
||||
{p_islatin, 0, A_NEXT, TPS_InFile, 0, NULL},
|
||||
{p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},
|
||||
{p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
|
||||
{p_iseqC, '.', A_NEXT, TPS_InPathFirst, 0, NULL},
|
||||
{p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
|
||||
@ -1045,7 +1049,7 @@ static TParserStateActionItem actionTPS_InFileFirst[] = {
|
||||
|
||||
static TParserStateActionItem actionTPS_InFileTwiddle[] = {
|
||||
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
|
||||
{p_islatin, 0, A_NEXT, TPS_InFile, 0, NULL},
|
||||
{p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},
|
||||
{p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
|
||||
{p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
|
||||
{p_iseqC, '/', A_NEXT, TPS_InFileFirst, 0, NULL},
|
||||
@ -1054,7 +1058,7 @@ static TParserStateActionItem actionTPS_InFileTwiddle[] = {
|
||||
|
||||
static TParserStateActionItem actionTPS_InPathFirst[] = {
|
||||
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
|
||||
{p_islatin, 0, A_NEXT, TPS_InFile, 0, NULL},
|
||||
{p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},
|
||||
{p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
|
||||
{p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
|
||||
{p_iseqC, '.', A_NEXT, TPS_InPathSecond, 0, NULL},
|
||||
@ -1079,7 +1083,7 @@ static TParserStateActionItem actionTPS_InPathSecond[] = {
|
||||
|
||||
static TParserStateActionItem actionTPS_InFile[] = {
|
||||
{p_isEOF, 0, A_BINGO, TPS_Base, FILEPATH, NULL},
|
||||
{p_islatin, 0, A_NEXT, TPS_InFile, 0, NULL},
|
||||
{p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},
|
||||
{p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
|
||||
{p_iseqC, '.', A_PUSH, TPS_InFileNext, 0, NULL},
|
||||
{p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
|
||||
@ -1091,7 +1095,7 @@ static TParserStateActionItem actionTPS_InFile[] = {
|
||||
|
||||
static TParserStateActionItem actionTPS_InFileNext[] = {
|
||||
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
|
||||
{p_islatin, 0, A_CLEAR, TPS_InFile, 0, NULL},
|
||||
{p_isasclet, 0, A_CLEAR, TPS_InFile, 0, NULL},
|
||||
{p_isdigit, 0, A_CLEAR, TPS_InFile, 0, NULL},
|
||||
{p_iseqC, '_', A_CLEAR, TPS_InFile, 0, NULL},
|
||||
{NULL, 0, A_POP, TPS_Null, 0, NULL}
|
||||
@ -1119,7 +1123,7 @@ static TParserStateActionItem actionTPS_InURI[] = {
|
||||
|
||||
static TParserStateActionItem actionTPS_InFURL[] = {
|
||||
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
|
||||
{p_isURI, 0, A_BINGO | A_CLRALL, TPS_Base, FURL, SpecialFURL},
|
||||
{p_isURI, 0, A_BINGO | A_CLRALL, TPS_Base, URL_T, SpecialFURL},
|
||||
{NULL, 0, A_POP, TPS_Null, 0, NULL}
|
||||
};
|
||||
|
||||
@ -1139,54 +1143,52 @@ static TParserStateActionItem actionTPS_InProtocolEnd[] = {
|
||||
{NULL, 0, A_BINGO | A_CLRALL, TPS_Base, PROTOCOL, NULL}
|
||||
};
|
||||
|
||||
static TParserStateActionItem actionTPS_InHyphenLatWordFirst[] = {
|
||||
static TParserStateActionItem actionTPS_InHyphenAsciiWordFirst[] = {
|
||||
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
|
||||
{p_islatin, 0, A_NEXT, TPS_InHyphenLatWord, 0, NULL},
|
||||
{p_isnonlatin, 0, A_NEXT, TPS_InHyphenUWord, 0, NULL},
|
||||
{p_isasclet, 0, A_NEXT, TPS_InHyphenAsciiWord, 0, NULL},
|
||||
{p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
|
||||
{p_isdigit, 0, A_NEXT, TPS_InHyphenValue, 0, NULL},
|
||||
{p_isdigit, 0, A_NEXT, TPS_InHyphenUWord, 0, NULL},
|
||||
{p_isdigit, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
|
||||
{NULL, 0, A_POP, TPS_Null, 0, NULL}
|
||||
};
|
||||
|
||||
static TParserStateActionItem actionTPS_InHyphenLatWord[] = {
|
||||
{p_isEOF, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, LATHYPHENWORD, SpecialHyphen},
|
||||
{p_islatin, 0, A_NEXT, TPS_InHyphenLatWord, 0, NULL},
|
||||
{p_isnonlatin, 0, A_NEXT, TPS_InHyphenUWord, 0, NULL},
|
||||
{p_isdigit, 0, A_NEXT, TPS_InHyphenUWord, 0, NULL},
|
||||
{p_iseqC, '-', A_PUSH, TPS_InHyphenLatWordFirst, 0, NULL},
|
||||
{NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, LATHYPHENWORD, SpecialHyphen}
|
||||
static TParserStateActionItem actionTPS_InHyphenAsciiWord[] = {
|
||||
{p_isEOF, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, ASCIIHWORD, SpecialHyphen},
|
||||
{p_isasclet, 0, A_NEXT, TPS_InHyphenAsciiWord, 0, NULL},
|
||||
{p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
|
||||
{p_isdigit, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
|
||||
{p_iseqC, '-', A_PUSH, TPS_InHyphenAsciiWordFirst, 0, NULL},
|
||||
{NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, ASCIIHWORD, SpecialHyphen}
|
||||
};
|
||||
|
||||
static TParserStateActionItem actionTPS_InHyphenCyrWordFirst[] = {
|
||||
static TParserStateActionItem actionTPS_InHyphenWordFirst[] = {
|
||||
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
|
||||
{p_isnonlatin, 0, A_NEXT, TPS_InHyphenCyrWord, 0, NULL},
|
||||
{p_islatin, 0, A_NEXT, TPS_InHyphenUWord, 0, NULL},
|
||||
{p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
|
||||
{p_isdigit, 0, A_NEXT, TPS_InHyphenValue, 0, NULL},
|
||||
{p_isdigit, 0, A_NEXT, TPS_InHyphenUWord, 0, NULL},
|
||||
{p_isdigit, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
|
||||
{NULL, 0, A_POP, TPS_Null, 0, NULL}
|
||||
};
|
||||
|
||||
static TParserStateActionItem actionTPS_InHyphenCyrWord[] = {
|
||||
{p_isEOF, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, CYRHYPHENWORD, SpecialHyphen},
|
||||
{p_isnonlatin, 0, A_NEXT, TPS_InHyphenCyrWord, 0, NULL},
|
||||
{p_islatin, 0, A_NEXT, TPS_InHyphenUWord, 0, NULL},
|
||||
{p_isdigit, 0, A_NEXT, TPS_InHyphenUWord, 0, NULL},
|
||||
{p_iseqC, '-', A_PUSH, TPS_InHyphenCyrWordFirst, 0, NULL},
|
||||
{NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, CYRHYPHENWORD, SpecialHyphen}
|
||||
static TParserStateActionItem actionTPS_InHyphenWord[] = {
|
||||
{p_isEOF, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, HWORD, SpecialHyphen},
|
||||
{p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
|
||||
{p_isdigit, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
|
||||
{p_iseqC, '-', A_PUSH, TPS_InHyphenWordFirst, 0, NULL},
|
||||
{NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, HWORD, SpecialHyphen}
|
||||
};
|
||||
|
||||
static TParserStateActionItem actionTPS_InHyphenUWordFirst[] = {
|
||||
static TParserStateActionItem actionTPS_InHyphenNumWordFirst[] = {
|
||||
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
|
||||
{p_isdigit, 0, A_NEXT, TPS_InHyphenValue, 0, NULL},
|
||||
{p_isalnum, 0, A_NEXT, TPS_InHyphenUWord, 0, NULL},
|
||||
{p_isalpha, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
|
||||
{NULL, 0, A_POP, TPS_Null, 0, NULL}
|
||||
};
|
||||
|
||||
static TParserStateActionItem actionTPS_InHyphenUWord[] = {
|
||||
{p_isEOF, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, HYPHENWORD, SpecialHyphen},
|
||||
{p_isalnum, 0, A_NEXT, TPS_InHyphenUWord, 0, NULL},
|
||||
{p_iseqC, '-', A_PUSH, TPS_InHyphenUWordFirst, 0, NULL},
|
||||
{NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, HYPHENWORD, SpecialHyphen}
|
||||
static TParserStateActionItem actionTPS_InHyphenNumWord[] = {
|
||||
{p_isEOF, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, NUMHWORD, SpecialHyphen},
|
||||
{p_isalnum, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
|
||||
{p_iseqC, '-', A_PUSH, TPS_InHyphenNumWordFirst, 0, NULL},
|
||||
{NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, NUMHWORD, SpecialHyphen}
|
||||
};
|
||||
|
||||
static TParserStateActionItem actionTPS_InHyphenValueFirst[] = {
|
||||
@ -1196,26 +1198,26 @@ static TParserStateActionItem actionTPS_InHyphenValueFirst[] = {
|
||||
};
|
||||
|
||||
static TParserStateActionItem actionTPS_InHyphenValue[] = {
|
||||
{p_isEOF, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, HYPHENWORD, SpecialHyphen},
|
||||
{p_isEOF, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, NUMHWORD, SpecialHyphen},
|
||||
{p_isdigit, 0, A_NEXT, TPS_InHyphenValue, 0, NULL},
|
||||
{p_iseqC, '.', A_PUSH, TPS_InHyphenValueFirst, 0, NULL},
|
||||
{p_iseqC, '-', A_PUSH, TPS_InHyphenUWordFirst, 0, NULL},
|
||||
{p_isalpha, 0, A_NEXT, TPS_InHyphenUWord, 0, NULL},
|
||||
{NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, HYPHENWORD, SpecialHyphen}
|
||||
{p_iseqC, '-', A_PUSH, TPS_InHyphenNumWordFirst, 0, NULL},
|
||||
{p_isalpha, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
|
||||
{NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, NUMHWORD, SpecialHyphen}
|
||||
};
|
||||
|
||||
static TParserStateActionItem actionTPS_InHyphenValueExact[] = {
|
||||
{p_isEOF, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, HYPHENWORD, SpecialHyphen},
|
||||
{p_isEOF, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, NUMHWORD, SpecialHyphen},
|
||||
{p_isdigit, 0, A_NEXT, TPS_InHyphenValueExact, 0, NULL},
|
||||
{p_iseqC, '.', A_PUSH, TPS_InHyphenValueFirst, 0, NULL},
|
||||
{p_iseqC, '-', A_PUSH, TPS_InHyphenUWordFirst, 0, NULL},
|
||||
{NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, HYPHENWORD, SpecialHyphen}
|
||||
{p_iseqC, '-', A_PUSH, TPS_InHyphenNumWordFirst, 0, NULL},
|
||||
{NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, NUMHWORD, SpecialHyphen}
|
||||
};
|
||||
|
||||
static TParserStateActionItem actionTPS_InParseHyphen[] = {
|
||||
{p_isEOF, 0, A_RERUN, TPS_Base, 0, NULL},
|
||||
{p_islatin, 0, A_NEXT, TPS_InHyphenLatWordPart, 0, NULL},
|
||||
{p_isnonlatin, 0, A_NEXT, TPS_InHyphenCyrWordPart, 0, NULL},
|
||||
{p_isasclet, 0, A_NEXT, TPS_InHyphenAsciiWordPart, 0, NULL},
|
||||
{p_isalpha, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
|
||||
{p_isdigit, 0, A_NEXT, TPS_InHyphenUnsignedInt, 0, NULL},
|
||||
{p_iseqC, '-', A_PUSH, TPS_InParseHyphenHyphen, 0, NULL},
|
||||
{NULL, 0, A_RERUN, TPS_Base, 0, NULL}
|
||||
@ -1227,32 +1229,31 @@ static TParserStateActionItem actionTPS_InParseHyphenHyphen[] = {
{NULL, 0, A_POP, TPS_Null, 0, NULL}
};

static TParserStateActionItem actionTPS_InHyphenCyrWordPart[] = {
{p_isEOF, 0, A_BINGO, TPS_Base, CYRPARTHYPHENWORD, NULL},
{p_isnonlatin, 0, A_NEXT, TPS_InHyphenCyrWordPart, 0, NULL},
{p_islatin, 0, A_NEXT, TPS_InHyphenUWordPart, 0, NULL},
{p_isdigit, 0, A_NEXT, TPS_InHyphenUWordPart, 0, NULL},
{NULL, 0, A_BINGO, TPS_InParseHyphen, CYRPARTHYPHENWORD, NULL}
static TParserStateActionItem actionTPS_InHyphenWordPart[] = {
{p_isEOF, 0, A_BINGO, TPS_Base, PARTHWORD, NULL},
{p_isalpha, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
{p_isdigit, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL},
{NULL, 0, A_BINGO, TPS_InParseHyphen, PARTHWORD, NULL}
};

static TParserStateActionItem actionTPS_InHyphenLatWordPart[] = {
{p_isEOF, 0, A_BINGO, TPS_Base, LATPARTHYPHENWORD, NULL},
{p_islatin, 0, A_NEXT, TPS_InHyphenLatWordPart, 0, NULL},
{p_isnonlatin, 0, A_NEXT, TPS_InHyphenUWordPart, 0, NULL},
{p_isdigit, 0, A_NEXT, TPS_InHyphenUWordPart, 0, NULL},
{NULL, 0, A_BINGO, TPS_InParseHyphen, LATPARTHYPHENWORD, NULL}
static TParserStateActionItem actionTPS_InHyphenAsciiWordPart[] = {
{p_isEOF, 0, A_BINGO, TPS_Base, ASCIIPARTHWORD, NULL},
{p_isasclet, 0, A_NEXT, TPS_InHyphenAsciiWordPart, 0, NULL},
{p_isalpha, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
{p_isdigit, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL},
{NULL, 0, A_BINGO, TPS_InParseHyphen, ASCIIPARTHWORD, NULL}
};

static TParserStateActionItem actionTPS_InHyphenUWordPart[] = {
{p_isEOF, 0, A_BINGO, TPS_Base, PARTHYPHENWORD, NULL},
{p_isalnum, 0, A_NEXT, TPS_InHyphenUWordPart, 0, NULL},
{NULL, 0, A_BINGO, TPS_InParseHyphen, PARTHYPHENWORD, NULL}
static TParserStateActionItem actionTPS_InHyphenNumWordPart[] = {
{p_isEOF, 0, A_BINGO, TPS_Base, NUMPARTHWORD, NULL},
{p_isalnum, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL},
{NULL, 0, A_BINGO, TPS_InParseHyphen, NUMPARTHWORD, NULL}
};

static TParserStateActionItem actionTPS_InHyphenUnsignedInt[] = {
{p_isEOF, 0, A_BINGO, TPS_Base, UNSIGNEDINT, NULL},
{p_isdigit, 0, A_NEXT, TPS_InHyphenUnsignedInt, 0, NULL},
{p_isalpha, 0, A_NEXT, TPS_InHyphenUWordPart, 0, NULL},
{p_isalpha, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL},
{p_iseqC, '.', A_PUSH, TPS_InHDecimalPartFirst, 0, NULL},
{NULL, 0, A_BINGO, TPS_InParseHyphen, UNSIGNEDINT, NULL}
};
@ -1284,14 +1285,14 @@ static TParserStateActionItem actionTPS_InHVersionPart[] = {
};

/*
* order should be the same as in typedef enum {} TParserState!!
* order must be the same as in typedef enum {} TParserState!!
*/

static const TParserStateAction Actions[] = {
{TPS_Base, actionTPS_Base},
{TPS_InUWord, actionTPS_InUWord},
{TPS_InLatWord, actionTPS_InLatWord},
{TPS_InCyrWord, actionTPS_InCyrWord},
{TPS_InNumWord, actionTPS_InNumWord},
{TPS_InAsciiWord, actionTPS_InAsciiWord},
{TPS_InWord, actionTPS_InWord},
{TPS_InUnsignedInt, actionTPS_InUnsignedInt},
{TPS_InSignedIntFirst, actionTPS_InSignedIntFirst},
{TPS_InSignedInt, actionTPS_InSignedInt},
@ -1350,20 +1351,20 @@ static const TParserStateAction Actions[] = {
{TPS_InProtocolFirst, actionTPS_InProtocolFirst},
{TPS_InProtocolSecond, actionTPS_InProtocolSecond},
{TPS_InProtocolEnd, actionTPS_InProtocolEnd},
{TPS_InHyphenLatWordFirst, actionTPS_InHyphenLatWordFirst},
{TPS_InHyphenLatWord, actionTPS_InHyphenLatWord},
{TPS_InHyphenCyrWordFirst, actionTPS_InHyphenCyrWordFirst},
{TPS_InHyphenCyrWord, actionTPS_InHyphenCyrWord},
{TPS_InHyphenUWordFirst, actionTPS_InHyphenUWordFirst},
{TPS_InHyphenUWord, actionTPS_InHyphenUWord},
{TPS_InHyphenAsciiWordFirst, actionTPS_InHyphenAsciiWordFirst},
{TPS_InHyphenAsciiWord, actionTPS_InHyphenAsciiWord},
{TPS_InHyphenWordFirst, actionTPS_InHyphenWordFirst},
{TPS_InHyphenWord, actionTPS_InHyphenWord},
{TPS_InHyphenNumWordFirst, actionTPS_InHyphenNumWordFirst},
{TPS_InHyphenNumWord, actionTPS_InHyphenNumWord},
{TPS_InHyphenValueFirst, actionTPS_InHyphenValueFirst},
{TPS_InHyphenValue, actionTPS_InHyphenValue},
{TPS_InHyphenValueExact, actionTPS_InHyphenValueExact},
{TPS_InParseHyphen, actionTPS_InParseHyphen},
{TPS_InParseHyphenHyphen, actionTPS_InParseHyphenHyphen},
{TPS_InHyphenCyrWordPart, actionTPS_InHyphenCyrWordPart},
{TPS_InHyphenLatWordPart, actionTPS_InHyphenLatWordPart},
{TPS_InHyphenUWordPart, actionTPS_InHyphenUWordPart},
{TPS_InHyphenWordPart, actionTPS_InHyphenWordPart},
{TPS_InHyphenAsciiWordPart, actionTPS_InHyphenAsciiWordPart},
{TPS_InHyphenNumWordPart, actionTPS_InHyphenNumWordPart},
{TPS_InHyphenUnsignedInt, actionTPS_InHyphenUnsignedInt},
{TPS_InHDecimalPartFirst, actionTPS_InHDecimalPartFirst},
{TPS_InHDecimalPart, actionTPS_InHDecimalPart},
@ -1378,10 +1379,11 @@ TParserGet(TParser * prs)
{
TParserStateActionItem *item = NULL;

Assert(prs->state);

if (prs->state->posbyte >= prs->lenstr)
return false;

Assert(prs->state);
prs->lexeme = prs->str + prs->state->posbyte;
prs->state->pushedAtAction = NULL;

@ -1488,10 +1490,12 @@ TParserGet(TParser * prs)
prs->state->state = item->tostate;

/* check for go away */
if ((item->flags & A_BINGO) || (prs->state->posbyte >= prs->lenstr && (item->flags & A_RERUN) == 0))
if ((item->flags & A_BINGO) ||
(prs->state->posbyte >= prs->lenstr &&
(item->flags & A_RERUN) == 0))
break;

/* go to begining of loop if we should rerun or we just restore state */
/* go to beginning of loop if we should rerun or we just restore state */
if (item->flags & (A_RERUN | A_POP))
continue;

@ -1557,16 +1561,15 @@ prsd_end(PG_FUNCTION_ARGS)
PG_RETURN_VOID();
}

#define LEAVETOKEN(x) ( (x)==12 )
#define COMPLEXTOKEN(x) ( (x)==5 || (x)==15 || (x)==16 || (x)==17 )
#define ENDPUNCTOKEN(x) ( (x)==12 )
#define LEAVETOKEN(x) ( (x)==SPACE )
#define COMPLEXTOKEN(x) ( (x)==URL_T || (x)==NUMHWORD || (x)==ASCIIHWORD || (x)==HWORD )
#define ENDPUNCTOKEN(x) ( (x)==SPACE )


#define TS_IDIGNORE(x) ( (x)==13 || (x)==14 || (x)==12 || (x)==23 )
#define HLIDIGNORE(x) ( (x)==5 || (x)==13 || (x)==15 || (x)==16 || (x)==17 )
#define HTMLHLIDIGNORE(x) ( (x)==5 || (x)==15 || (x)==16 || (x)==17 )
#define NONWORDTOKEN(x) ( (x)==12 || HLIDIGNORE(x) )
#define NOENDTOKEN(x) ( NONWORDTOKEN(x) || (x)==7 || (x)==8 || (x)==20 || (x)==21 || (x)==22 || TS_IDIGNORE(x) )
#define TS_IDIGNORE(x) ( (x)==TAG_T || (x)==PROTOCOL || (x)==SPACE || (x)==HTMLENTITY )
#define HLIDIGNORE(x) ( (x)==URL_T || (x)==TAG_T || (x)==NUMHWORD || (x)==ASCIIHWORD || (x)==HWORD )
#define HTMLHLIDIGNORE(x) ( (x)==URL_T || (x)==NUMHWORD || (x)==ASCIIHWORD || (x)==HWORD )
#define NONWORDTOKEN(x) ( (x)==SPACE || HLIDIGNORE(x) )
#define NOENDTOKEN(x) ( NONWORDTOKEN(x) || (x)==SCIENTIFIC || (x)==VERSIONNUMBER || (x)==DECIMAL || (x)==SIGNEDINT || (x)==UNSIGNEDINT || TS_IDIGNORE(x) )

typedef struct
{
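The wparser_def.c hunks above feed hyphenated-word fragments into the renamed part tokens (ASCIIPARTHWORD, PARTHWORD, NUMPARTHWORD) and restate the token-class macros with symbolic names instead of bare token numbers. A quick way to observe the result from SQL is to run the default parser directly and join against its token-type list; this is only a sketch, with an arbitrary sample string, assuming a server built with this patch:

-- illustrative query; the sample text is arbitrary, not taken from the patch
SELECT t.alias, p.token
FROM ts_parse('default', 'foo-bar qwe-123') AS p
JOIN ts_token_type('default') AS t ON t.tokid = p.tokid;

The hyphen-separated fragments should come back under the new hword_asciipart / hword_part / hword_numpart aliases rather than the old lpart_hword / nlpart_hword names.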
@ -37,7 +37,7 @@
* Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* $PostgreSQL: pgsql/src/include/catalog/catversion.h,v 1.435 2007/10/22 20:13:37 tgl Exp $
* $PostgreSQL: pgsql/src/include/catalog/catversion.h,v 1.436 2007/10/23 20:46:12 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@ -53,6 +53,6 @@
*/

/* yyyymmddN */
#define CATALOG_VERSION_NO 200710221
#define CATALOG_VERSION_NO 200710231

#endif
@ -209,8 +209,8 @@ SELECT ts_lexize('synonym', 'Gogle');
(1 row)

-- Create and simple test thesaurus dictionary
-- More test in configuration checks because of ts_lexize
-- can not give more tat one word as it may wish thesaurus.
-- More tests in configuration checks because ts_lexize()
-- cannot pass more than one word to thesaurus.
CREATE TEXT SEARCH DICTIONARY thesaurus (
Template=thesaurus,
DictFile=thesaurus_sample,
@ -227,7 +227,7 @@ CREATE TEXT SEARCH CONFIGURATION ispell_tst (
COPY=english
);
ALTER TEXT SEARCH CONFIGURATION ispell_tst ALTER MAPPING FOR
hword, lhword, lpart_hword, lword, nlhword, nlpart_hword, nlword, part_hword, word
word, numword, asciiword, hword, numhword, asciihword, hword_part, hword_numpart, hword_asciipart
WITH ispell, english_stem;
SELECT to_tsvector('ispell_tst', 'Booking the skies after rebookings for footballklubber from a foot');
to_tsvector
@ -276,7 +276,7 @@ CREATE TEXT SEARCH CONFIGURATION synonym_tst (
COPY=english
);
ALTER TEXT SEARCH CONFIGURATION synonym_tst ALTER MAPPING FOR
lword, lpart_hword, lhword
asciiword, hword_asciipart, asciihword
WITH synonym, english_stem;
SELECT to_tsvector('synonym_tst', 'Postgresql is often called as postgres or pgsql and pronounced as postgre');
to_tsvector
@ -296,7 +296,7 @@ CREATE TEXT SEARCH CONFIGURATION thesaurus_tst (
COPY=synonym_tst
);
ALTER TEXT SEARCH CONFIGURATION thesaurus_tst ALTER MAPPING FOR
lword, lpart_hword, lhword
asciiword, hword_asciipart, asciihword
WITH synonym, thesaurus, english_stem;
SELECT to_tsvector('thesaurus_tst', 'one postgres one two one two three one');
to_tsvector
@ -209,30 +209,30 @@ SELECT ts_lexize('english_stem', 'identity');

SELECT * FROM ts_token_type('default');
tokid | alias | description
-------+--------------+-----------------------------------
1 | lword | Latin word
2 | nlword | Non-latin word
3 | word | Word
4 | email | Email
-------+-----------------+------------------------------------------
1 | asciiword | Word, all ASCII
2 | word | Word, all letters
3 | numword | Word, letters and digits
4 | email | Email address
5 | url | URL
6 | host | Host
7 | sfloat | Scientific notation
8 | version | VERSION
9 | part_hword | Part of hyphenated word
10 | nlpart_hword | Non-latin part of hyphenated word
11 | lpart_hword | Latin part of hyphenated word
8 | version | Version number
9 | hword_numpart | Hyphenated word part, letters and digits
10 | hword_part | Hyphenated word part, all letters
11 | hword_asciipart | Hyphenated word part, all ASCII
12 | blank | Space symbols
13 | tag | HTML Tag
13 | tag | HTML tag
14 | protocol | Protocol head
15 | hword | Hyphenated word
16 | lhword | Latin hyphenated word
17 | nlhword | Non-latin hyphenated word
15 | numhword | Hyphenated word, letters and digits
16 | asciihword | Hyphenated word, all ASCII
17 | hword | Hyphenated word, all letters
18 | uri | URI
19 | file | File or path name
20 | float | Decimal notation
21 | int | Signed integer
22 | uint | Unsigned integer
23 | entity | HTML Entity
23 | entity | HTML entity
(23 rows)

SELECT * FROM ts_parse('default', '345 qwe@efd.r '' http://www.com/ http://aew.werc.ewr/?ad=qwe&dw 1aew.werc.ewr/?ad=qwe&dw 2aew.werc.ewr http://3aew.werc.ewr/?ad=qwe&dw http://4aew.werc.ewr http://5aew.werc.ewr:8100/? ad=qwe&dw 6aew.werc.ewr:8100/?ad=qwe&dw 7aew.werc.ewr:8100/?ad=qwe&dw=%20%32 +4.0e-10 qwe qwe qwqwe 234.435 455 5.005 teodor@stack.net qwe-wer asdf <fr>qwer jf sdjk<we hjwer <werrwe> ewr1> ewri2 <a href="qwe<qwe>">
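Both the expected output above and the test script that follows restate their ALTER MAPPING lists in terms of the new token-type aliases. The same statement form applies to any user-defined configuration whose mappings need to be spelled with the new names; a sketch only, where my_config is a placeholder while the dictionary and token-type names are taken from the statements in this patch:

-- my_config is a placeholder configuration name
ALTER TEXT SEARCH CONFIGURATION my_config ALTER MAPPING FOR
    asciiword, word, numword, asciihword, hword, numhword,
    hword_asciipart, hword_part, hword_numpart
    WITH english_stem;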
@ -58,8 +58,8 @@ SELECT ts_lexize('synonym', 'PoStGrEs');
SELECT ts_lexize('synonym', 'Gogle');

-- Create and simple test thesaurus dictionary
-- More test in configuration checks because of ts_lexize
-- can not give more tat one word as it may wish thesaurus.
-- More tests in configuration checks because ts_lexize()
-- cannot pass more than one word to thesaurus.
CREATE TEXT SEARCH DICTIONARY thesaurus (
Template=thesaurus,
DictFile=thesaurus_sample,
@ -74,7 +74,7 @@ CREATE TEXT SEARCH CONFIGURATION ispell_tst (
);

ALTER TEXT SEARCH CONFIGURATION ispell_tst ALTER MAPPING FOR
hword, lhword, lpart_hword, lword, nlhword, nlpart_hword, nlword, part_hword, word
word, numword, asciiword, hword, numhword, asciihword, hword_part, hword_numpart, hword_asciipart
WITH ispell, english_stem;

SELECT to_tsvector('ispell_tst', 'Booking the skies after rebookings for footballklubber from a foot');
@ -99,7 +99,7 @@ CREATE TEXT SEARCH CONFIGURATION synonym_tst (
);

ALTER TEXT SEARCH CONFIGURATION synonym_tst ALTER MAPPING FOR
lword, lpart_hword, lhword
asciiword, hword_asciipart, asciihword
WITH synonym, english_stem;

SELECT to_tsvector('synonym_tst', 'Postgresql is often called as postgres or pgsql and pronounced as postgre');
@ -112,10 +112,9 @@ CREATE TEXT SEARCH CONFIGURATION thesaurus_tst (
);

ALTER TEXT SEARCH CONFIGURATION thesaurus_tst ALTER MAPPING FOR
lword, lpart_hword, lhword
asciiword, hword_asciipart, asciihword
WITH synonym, thesaurus, english_stem;

SELECT to_tsvector('thesaurus_tst', 'one postgres one two one two three one');
SELECT to_tsvector('thesaurus_tst', 'Supernovae star is very new star and usually called supernovae (abbrevation SN)');
SELECT to_tsvector('thesaurus_tst', 'Booking tickets is looking like a booking a tickets');

@ -3,7 +3,7 @@ package Install;
#
# Package that provides 'make install' functionality for msvc builds
#
# $PostgreSQL: pgsql/src/tools/msvc/Install.pm,v 1.24 2007/10/16 16:00:00 tgl Exp $
# $PostgreSQL: pgsql/src/tools/msvc/Install.pm,v 1.25 2007/10/23 20:46:12 tgl Exp $
#
use strict;
use warnings;
@ -258,7 +258,7 @@ sub GenerateTsearchFiles
while ($#pieces > 0)
{
my $lang = shift @pieces || last;
my $latlang = shift @pieces || last;
my $asclang = shift @pieces || last;
my $txt = $tmpl;
my $stop = '';

@ -269,8 +269,8 @@ sub GenerateTsearchFiles
$txt =~ s#_LANGNAME_#${lang}#gs;
$txt =~ s#_DICTNAME_#${lang}_stem#gs;
$txt =~ s#_CFGNAME_#${lang}#gs;
$txt =~ s#_LATDICTNAME_#${latlang}_stem#gs;
$txt =~ s#_NONLATDICTNAME_#${lang}_stem#gs;
$txt =~ s#_ASCDICTNAME_#${asclang}_stem#gs;
$txt =~ s#_NONASCDICTNAME_#${lang}_stem#gs;
$txt =~ s#_STOPWORDS_#$stop#gs;
print $F $txt;
print ".";