Quick Search is a bit awkward with Chinese IME ... or is it?
Moderators: Hacker, petermad, Stefan2, white
Here is C# source code that gets the first letter of the phonetic spelling (a.k.a. pinyin) of each Chinese character:
/// <summary>
/// Returns the upper-case first Pinyin letter of every Chinese character in
/// <paramref name="ChineseStr"/>; for example "高三" yields "GS".
/// </summary>
/// <param name="ChineseStr">Text to convert. Null or empty yields an empty string.</param>
/// <returns>
/// The concatenated first letters. If any character does not encode to exactly
/// two bytes (for example an ASCII letter), the original string is returned
/// unchanged. Two-byte characters outside the table contribute nothing.
/// </returns>
/// <remarks>
/// The table is the GB2312 level-1 (Pinyin-ordered) range table, so characters
/// are encoded with code page 936. When that code page is not available on the
/// runtime, the method falls back to <c>Encoding.Default</c>, as the original
/// code did; results are then only meaningful if the default encoding is
/// GB2312/GBK.
/// </remarks>
public string ChineseCap(string ChineseStr)
{
    if (string.IsNullOrEmpty(ChineseStr))
    {
        return "";
    }

    // Letters that have entries in the table; no Pinyin syllable starts with I, U or V.
    const string letters = "ABCDEFGHJKLMNOPQRSTWXYZ";

    // Lowest code in the table (0xB0A1, first 'A' character).
    const int firstCode = 45217;

    // Inclusive upper bound of each letter's code range, in ascending order.
    // The ranges are contiguous: each one starts right after the previous bound.
    // The 'X' bound is 53688 (0xD1B8); the former 53640 (0xD188) left
    // 0xD1A1..0xD1B8 (e.g. 学) unmatched.
    int[] upperBounds =
    {
        45252, 45760, 46317, 46825, 47009, 47296, 47613, 48118,
        49061, 49323, 49895, 50370, 50613, 50621, 50905, 51386,
        51445, 52217, 52697, 52979, 53688, 54480, 55289
    };

    // Prefer an explicit GBK/GB2312 encoding so the result does not depend on
    // the system locale; fall back to the platform default when unavailable.
    System.Text.Encoding encoding;
    try
    {
        encoding = System.Text.Encoding.GetEncoding(936);
    }
    catch (System.ArgumentException)
    {
        encoding = System.Text.Encoding.Default;
    }
    catch (System.NotSupportedException)
    {
        encoding = System.Text.Encoding.Default;
    }

    System.Text.StringBuilder initials = new System.Text.StringBuilder(ChineseStr.Length);
    foreach (char ch in ChineseStr)
    {
        byte[] bytes = encoding.GetBytes(ch.ToString());
        if (bytes.Length != 2)
        {
            // Not a double-byte character: give the caller the input back untouched.
            return ChineseStr;
        }

        int code = bytes[0] * 256 + bytes[1];
        if (code < firstCode)
        {
            continue;
        }

        // Each character is classified on its own, so an unmatched character
        // no longer repeats the letter of the previous one.
        for (int k = 0; k < upperBounds.Length; k++)
        {
            if (code <= upperBounds[k])
            {
                initials.Append(letters[k]);
                break;
            }
        }
    }

    return initials.ToString();
}
/// <summary>
/// Returns the upper-case first Pinyin letter of every Chinese character in
/// <paramref name="ChineseStr"/>; for example "高三" yields "GS".
/// </summary>
/// <param name="ChineseStr">Text to convert. Null or empty yields an empty string.</param>
/// <returns>
/// The concatenated first letters. If any character does not encode to exactly
/// two bytes (for example an ASCII letter), the original string is returned
/// unchanged. Two-byte characters outside the table contribute nothing.
/// </returns>
/// <remarks>
/// The table is the GB2312 level-1 (Pinyin-ordered) range table, so characters
/// are encoded with code page 936. When that code page is not available on the
/// runtime, the method falls back to <c>Encoding.Default</c>, as the original
/// code did; results are then only meaningful if the default encoding is
/// GB2312/GBK.
/// </remarks>
public string ChineseCap(string ChineseStr)
{
    if (string.IsNullOrEmpty(ChineseStr))
    {
        return "";
    }

    // Letters that have entries in the table; no Pinyin syllable starts with I, U or V.
    const string letters = "ABCDEFGHJKLMNOPQRSTWXYZ";

    // Lowest code in the table (0xB0A1, first 'A' character).
    const int firstCode = 45217;

    // Inclusive upper bound of each letter's code range, in ascending order.
    // The ranges are contiguous: each one starts right after the previous bound.
    // The 'X' bound is 53688 (0xD1B8); the former 53640 (0xD188) left
    // 0xD1A1..0xD1B8 (e.g. 学) unmatched.
    int[] upperBounds =
    {
        45252, 45760, 46317, 46825, 47009, 47296, 47613, 48118,
        49061, 49323, 49895, 50370, 50613, 50621, 50905, 51386,
        51445, 52217, 52697, 52979, 53688, 54480, 55289
    };

    // Prefer an explicit GBK/GB2312 encoding so the result does not depend on
    // the system locale; fall back to the platform default when unavailable.
    System.Text.Encoding encoding;
    try
    {
        encoding = System.Text.Encoding.GetEncoding(936);
    }
    catch (System.ArgumentException)
    {
        encoding = System.Text.Encoding.Default;
    }
    catch (System.NotSupportedException)
    {
        encoding = System.Text.Encoding.Default;
    }

    System.Text.StringBuilder initials = new System.Text.StringBuilder(ChineseStr.Length);
    foreach (char ch in ChineseStr)
    {
        byte[] bytes = encoding.GetBytes(ch.ToString());
        if (bytes.Length != 2)
        {
            // Not a double-byte character: give the caller the input back untouched.
            return ChineseStr;
        }

        int code = bytes[0] * 256 + bytes[1];
        if (code < firstCode)
        {
            continue;
        }

        // Each character is classified on its own, so an unmatched character
        // no longer repeats the letter of the previous one.
        for (int k = 0; k < upperBounds.Length; k++)
        {
            if (code <= upperBounds[k])
            {
                initials.Append(letters[k]);
                break;
            }
        }
    }

    return initials.ToString();
}
Delphi source for Chinese and PinYin:
// Translates each Chinese character in AHzStr into the first letter of its
// PinYin. For example, GetHzPy('高三') returns 'GS'.
//
// A character is recognised as a byte pair whose two bytes are both >= #160.
// Pairs outside the table are skipped; every other byte is copied to the
// result unchanged.
// NOTE(review): assumes AHzStr is a single-byte (ANSI) string holding
// GB2312/GBK text - confirm before using with a Unicode string type.
function GetHzPy(const AHzStr: string): string;
const
  // Inclusive (first, last) range per letter 'A'..'Z', expressed as
  // (lead byte - 160) * 100 + (trail byte - 160). (9999, 0000) marks the
  // letters that never match (I, U, V).
  // The N/O boundary is 3721/3722 so that it agrees with $C5B5/$C5B6 in the
  // hexadecimal table of GetPYIndexChar (it used to be 3722/3723).
  ChinaCode: array[0..25, 0..1] of Integer = ((1601, 1636), (1637, 1832), (1833, 2077),
    (2078, 2273), (2274, 2301), (2302, 2432), (2433, 2593), (2594, 2786), (9999, 0000),
    (2787, 3105), (3106, 3211), (3212, 3471), (3472, 3634), (3635, 3721), (3722, 3729),
    (3730, 3857), (3858, 4026), (4027, 4085), (4086, 4389), (4390, 4557), (9999, 0000),
    (9999, 0000), (4558, 4683), (4684, 4924), (4925, 5248), (5249, 5589));
var
  i, j, HzOrd: integer;
begin
  // A string Result is not guaranteed to start out empty.
  Result := '';
  i := 1;
  while i <= Length(AHzStr) do
  begin
    // The length test keeps AHzStr[i + 1] inside the string for a trailing lead byte.
    if (i < Length(AHzStr)) and (AHzStr[i] >= #160) and (AHzStr[i + 1] >= #160) then
    begin
      HzOrd := (Ord(AHzStr[i]) - 160) * 100 + Ord(AHzStr[i + 1]) - 160;
      for j := 0 to 25 do
      begin
        if (HzOrd >= ChinaCode[j][0]) and (HzOrd <= ChinaCode[j][1]) then
        begin
          Result := Result + char(byte('A') + j);
          break;
        end;
      end;
      // Skip the trail byte; the Inc below skips the lead byte.
      Inc(i);
    end else Result := Result + AHzStr[i];
    Inc(i);
  end;
end;
///////////////////////////////////////
// Returns the first letter of the PinYin for a single Chinese character.
// For example, GetPYIndexChar('高') returns 'G'.
//
// The first two bytes of hzchar are combined into a 16-bit code
// (first byte * 256 + second byte) and looked up in the range table below.
// Returns char(0) when hzchar is shorter than two bytes or the code is
// outside every range.
// NOTE(review): assumes hzchar is a single-byte (ANSI) string holding a
// GB2312/GBK character - confirm before using with a Unicode string type.
function GetPYIndexChar(hzchar:string):char;
begin
  // Guard against indexing hzchar[2] on an empty or one-byte string.
  if Length(hzchar) < 2 then
  begin
    result := char(0);
    Exit;
  end;
  case WORD(hzchar[1]) shl 8 + WORD(hzchar[2]) of
    $B0A1..$B0C4 : result := 'A';
    $B0C5..$B2C0 : result := 'B';
    $B2C1..$B4ED : result := 'C';
    $B4EE..$B6E9 : result := 'D';
    $B6EA..$B7A1 : result := 'E';
    $B7A2..$B8C0 : result := 'F';
    $B8C1..$B9FD : result := 'G';
    $B9FE..$BBF6 : result := 'H';
    $BBF7..$BFA5 : result := 'J';
    $BFA6..$C0AB : result := 'K';
    $C0AC..$C2E7 : result := 'L';
    $C2E8..$C4C2 : result := 'M';
    $C4C3..$C5B5 : result := 'N';
    $C5B6..$C5BD : result := 'O';
    $C5BE..$C6D9 : result := 'P';
    $C6DA..$C8BA : result := 'Q';
    $C8BB..$C8F5 : result := 'R';
    $C8F6..$CBF9 : result := 'S';
    $CBFA..$CDD9 : result := 'T';
    $CDDA..$CEF3 : result := 'W';
    // Upper bound is $D1B8 (the code just before 'Y'); it was mistyped as
    // $D188, which left $D1A1..$D1B8 unmatched.
    $CEF4..$D1B8 : result := 'X';
    $D1B9..$D4D0 : result := 'Y';
    $D4D1..$D7F9 : result := 'Z';
  else
    result := char(0);
  end;
end;
// Translates each Chinese character in AHzStr into the first letter of its
// PinYin. For example, GetHzPy('高三') returns 'GS'.
//
// A character is recognised as a byte pair whose two bytes are both >= #160.
// Pairs outside the table are skipped; every other byte is copied to the
// result unchanged.
// NOTE(review): assumes AHzStr is a single-byte (ANSI) string holding
// GB2312/GBK text - confirm before using with a Unicode string type.
function GetHzPy(const AHzStr: string): string;
const
  // Inclusive (first, last) range per letter 'A'..'Z', expressed as
  // (lead byte - 160) * 100 + (trail byte - 160). (9999, 0000) marks the
  // letters that never match (I, U, V).
  // The N/O boundary is 3721/3722 so that it agrees with $C5B5/$C5B6 in the
  // hexadecimal table of GetPYIndexChar (it used to be 3722/3723).
  ChinaCode: array[0..25, 0..1] of Integer = ((1601, 1636), (1637, 1832), (1833, 2077),
    (2078, 2273), (2274, 2301), (2302, 2432), (2433, 2593), (2594, 2786), (9999, 0000),
    (2787, 3105), (3106, 3211), (3212, 3471), (3472, 3634), (3635, 3721), (3722, 3729),
    (3730, 3857), (3858, 4026), (4027, 4085), (4086, 4389), (4390, 4557), (9999, 0000),
    (9999, 0000), (4558, 4683), (4684, 4924), (4925, 5248), (5249, 5589));
var
  i, j, HzOrd: integer;
begin
  // A string Result is not guaranteed to start out empty.
  Result := '';
  i := 1;
  while i <= Length(AHzStr) do
  begin
    // The length test keeps AHzStr[i + 1] inside the string for a trailing lead byte.
    if (i < Length(AHzStr)) and (AHzStr[i] >= #160) and (AHzStr[i + 1] >= #160) then
    begin
      HzOrd := (Ord(AHzStr[i]) - 160) * 100 + Ord(AHzStr[i + 1]) - 160;
      for j := 0 to 25 do
      begin
        if (HzOrd >= ChinaCode[j][0]) and (HzOrd <= ChinaCode[j][1]) then
        begin
          Result := Result + char(byte('A') + j);
          break;
        end;
      end;
      // Skip the trail byte; the Inc below skips the lead byte.
      Inc(i);
    end else Result := Result + AHzStr[i];
    Inc(i);
  end;
end;
///////////////////////////////////////
// Returns the first letter of the PinYin for a single Chinese character.
// For example, GetPYIndexChar('高') returns 'G'.
//
// The first two bytes of hzchar are combined into a 16-bit code
// (first byte * 256 + second byte) and looked up in the range table below.
// Returns char(0) when hzchar is shorter than two bytes or the code is
// outside every range.
// NOTE(review): assumes hzchar is a single-byte (ANSI) string holding a
// GB2312/GBK character - confirm before using with a Unicode string type.
function GetPYIndexChar(hzchar:string):char;
begin
  // Guard against indexing hzchar[2] on an empty or one-byte string.
  if Length(hzchar) < 2 then
  begin
    result := char(0);
    Exit;
  end;
  case WORD(hzchar[1]) shl 8 + WORD(hzchar[2]) of
    $B0A1..$B0C4 : result := 'A';
    $B0C5..$B2C0 : result := 'B';
    $B2C1..$B4ED : result := 'C';
    $B4EE..$B6E9 : result := 'D';
    $B6EA..$B7A1 : result := 'E';
    $B7A2..$B8C0 : result := 'F';
    $B8C1..$B9FD : result := 'G';
    $B9FE..$BBF6 : result := 'H';
    $BBF7..$BFA5 : result := 'J';
    $BFA6..$C0AB : result := 'K';
    $C0AC..$C2E7 : result := 'L';
    $C2E8..$C4C2 : result := 'M';
    $C4C3..$C5B5 : result := 'N';
    $C5B6..$C5BD : result := 'O';
    $C5BE..$C6D9 : result := 'P';
    $C6DA..$C8BA : result := 'Q';
    $C8BB..$C8F5 : result := 'R';
    $C8F6..$CBF9 : result := 'S';
    $CBFA..$CDD9 : result := 'T';
    $CDDA..$CEF3 : result := 'W';
    // Upper bound is $D1B8 (the code just before 'Y'); it was mistyped as
    // $D188, which left $D1A1..$D1B8 unmatched.
    $CEF4..$D1B8 : result := 'X';
    $D1B9..$D4D0 : result := 'Y';
    $D4D1..$D7F9 : result := 'Z';
  else
    result := char(0);
  end;
end;
You can download a PinYin table for Unicode here :
http://kanji.zinbun.kyoto-u.ac.jp/~yasuoka/CJK.html
or http://kanji.zinbun.kyoto-u.ac.jp/~yasuoka/ftp/CJKtable/Uni2Pinyin.Z
Here is a piece of code:
#
# Name: Unicode Pinyin table
# Unicode version: 1.1
# Table version: 0.496
# Table format: Format A
# Date: 18 August 1997
# Author: Koichi Yasuoka <yasuoka@kanji.zinbun.kyoto-u.ac.jp>
#
# General notes:
#
# This table contains the data on how Unicode Hanzi characters
# are pronounced in P.R.China. This table was originally based
# on "TONEPY.tit" by Yongguang Zhang <ygz@cs.purdue.edu>. Here
# the author expresses his appreciation to Christian Wittern
# <cwittern@conline.central.de>, Jim Breen <jwb@rdt.monash.edu.au>,
# and Jack Halpern <jhalpern@super.win.or.jp>.
#
# Format: Six tab-separated columns
# Column #1 is the Unicode (in hex)
# Columns #2 to #6 are Pinyin (tone '5' means Qingsheng)
#
# The entries are in Unicode order.
#
#
3007 ling2
4E00 yi1
4E01 ding1
4E02 kao3
4E03 qi1
4E04 shang4 shang3
4E05 xia4
4E06
4E07 wan4 mo4
4E08 zhang4
4E09 san1
4E0A shang4 shang3
......
http://kanji.zinbun.kyoto-u.ac.jp/~yasuoka/CJK.html
or http://kanji.zinbun.kyoto-u.ac.jp/~yasuoka/ftp/CJKtable/Uni2Pinyin.Z
Here is a piece of code:
#
# Name: Unicode Pinyin table
# Unicode version: 1.1
# Table version: 0.496
# Table format: Format A
# Date: 18 August 1997
# Author: Koichi Yasuoka <yasuoka@kanji.zinbun.kyoto-u.ac.jp>
#
# General notes:
#
# This table contains the data on how Unicode Hanzi characters
# are pronounced in P.R.China. This table was originally based
# on "TONEPY.tit" by Yongguang Zhang <ygz@cs.purdue.edu>. Here
# the author expresses his appreciation to Christian Wittern
# <cwittern@conline.central.de>, Jim Breen <jwb@rdt.monash.edu.au>,
# and Jack Halpern <jhalpern@super.win.or.jp>.
#
# Format: Six tab-separated columns
# Column #1 is the Unicode (in hex)
# Columns #2 to #6 are Pinyin (tone '5' means Qingsheng)
#
# The entries are in Unicode order.
#
#
3007 ling2
4E00 yi1
4E01 ding1
4E02 kao3
4E03 qi1
4E04 shang4 shang3
4E05 xia4
4E06
4E07 wan4 mo4
4E08 zhang4
4E09 san1
4E0A shang4 shang3
......
- ghisler(Author)
- Site Admin
- Posts: 50390
- Joined: 2003-02-04, 09:46 UTC
- Location: Switzerland
- Contact:
So how does it work? Does the user switch to English keyboard input in QuickSearchPro and type g-s, and this matches all Chinese words where the first character starts with 'g' and the second with 's' (when written in English)? (Quoted: "So, QuickSearchPro is not an IME; its author could not write an IME. He just made it smart, so that users can search for Chinese names without needing to load an IME and enter all the Chinese characters.")
Author of Total Commander
https://www.ghisler.com
https://www.ghisler.com
Yes. Most of the time, Chinese users stay with the English keyboard instead of an IME. So, in QuickSearchPro, the user types only 'g' and 's', and the cursor moves to the file whose name matches all Chinese words where the first character starts with 'g' and the second with 's'. The user doesn't need to enter the whole Chinese characters via an IME. I think this is a convenient method to search for files whose names contain Chinese characters. (ghisler(Author) wrote: So how does it work? Does the user switch to English keyboard input in QuickSearchPro and type g-s, and this matches all Chinese words where the first character starts with 'g' and the second with 's' (when written in English)? So, QuickSearchPro is not an IME; its author could not write an IME. He just made it smart, so that users can search for Chinese names without needing to load an IME and enter all the Chinese characters.)
Many database programs or mobile phone address book programs support this feature in China. They call this "First Letter of PinYin Matching".
- ghisler(Author)
- Site Admin
- Posts: 50390
- Joined: 2003-02-04, 09:46 UTC
- Location: Switzerland
- Contact:
OK, now I understand it better!
But I have some questions about your code posted above:
1. Is this really for finding Chinese characters, or just for finding the PinYin representation of the characters with accents?
2. The functions take Ansi strings as parameters, not Unicode. What encoding do we have to pass to the functions?
But I have some questions about your code posted above:
1. Is this really for finding Chinese characters, or just for finding the PinYin representation of the characters with accents?
2. The functions take Ansi strings as parameters, not Unicode. What encoding do we have to pass to the functions?
Author of Total Commander
https://www.ghisler.com
https://www.ghisler.com
just find the *first letter* of the PinYin, no accents. See the comment below:ghisler(Author) wrote:OK, now I understand it better!
But I have some questions about your code posted above:
1. Is this really for finding Chinese characters, or just for finding the PinYin representation of the characters with accents?
// Translate Chinese into the first letter of its PinYin. For example, GetHzPy('高三') returns 'GS'
function GetHzPy(const AHzStr: string): string;
So, before matching the user input against a filename, you should call that function to convert the filename into an English string that contains only the first PinYin letters of the Chinese characters. For example: suppose there is a file named "高三.txt", and the user typed "gs" in the QuickSearch dialog. Then you can do:
if UpperCase('gs')=GetHzPy(strCurrentFileName) then DoMatch;
I think the encoding should be GBK (CodePage:54936, GB18030 Simplified Chinese).2. The functions take Ansi strings as parameters, not Unicode. What encoding do we have to pass to the functions?
The best method I think is make a full table for Unicode. You can download a PinYin table for Unicode here :
http://kanji.zinbun.kyoto-u.ac.jp/~yasuoka/CJK.html
or http://kanji.zinbun.kyoto-u.ac.jp/~yasuoka/ftp/CJKtable/Uni2Pinyin.Z
- ghisler(Author)
- Site Admin
- Posts: 50390
- Joined: 2003-02-04, 09:46 UTC
- Location: Switzerland
- Contact:
Unfortunately I'm still too confused about it to add it. For example, there seem to be multiple readings of some characters:
4E07 wan4 mo4
So Character 4E07 should be found both when typing 'w' or typing 'm'?
4E07 wan4 mo4
So Character 4E07 should be found both when typing 'w' or typing 'm'?
Author of Total Commander
https://www.ghisler.com
https://www.ghisler.com
YES. This situation is called "DuoYinZi" (multi-pronunciation). If you have no time, supporting only the first reading, 'w', is enough. (ghisler(Author) wrote: Unfortunately I'm still too confused about it to add it. For example, there seem to be multiple readings of some characters:
4E07 wan4 mo4
So Character 4E07 should be found both when typing 'w' or typing 'm'?
see here to find something useful.
http://bobcat.webappcabaret.net/javachina/cn/py_aa.htm
Ref:万 4E07 wan4 mo4
- ghisler(Author)
- Site Admin
- Posts: 50390
- Joined: 2003-02-04, 09:46 UTC
- Location: Switzerland
- Contact:
Thanks! I'm currently considering to support some kind of search dll, e.g. wcmd_chn.dll in language dir which would contain such a match function. This way I could offer separate phonetic searches for Chinese (Mandarin), Japanese, Korean...
Author of Total Commander
https://www.ghisler.com
https://www.ghisler.com
GREAT!! This is good news for all Asian-language users! (ghisler(Author) wrote: Thanks! I'm currently considering to support some kind of search dll, e.g. wcmd_chn.dll in language dir which would contain such a match function. This way I could offer separate phonetic searches for Chinese (Mandarin), Japanese, Korean...)
BTW: Is this feature included in the 7.5 official release?
- ghisler(Author)
- Site Admin
- Posts: 50390
- Joined: 2003-02-04, 09:46 UTC
- Location: Switzerland
- Contact:
Yes it will, but not yet in the upcoming public beta 3 (not enough time)...
Author of Total Commander
https://www.ghisler.com
https://www.ghisler.com