[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
Re: mk_wcwidth
> The current version is not limited to the BMP.
My fault, I was thinking wchar_t was a 16-bit type for some
reason.
> > mk_wcwidth could use some simplification
>
> Please elaborate ...
/*
 * One entry of the lookup table: an inclusive range of code points
 * together with the value reported for every code point inside it.
 */
struct interval
{
    int first;  /* lowest code point in the range (inclusive) */
    int last;   /* highest code point in the range (inclusive) */
    int width;  /* value bisearch() returns for a hit in this range */
};
/*
 * Binary search for ucs in a table of intervals.
 *
 * ucs   - code point to look up
 * table - interval array; the search requires it to be sorted in
 *         ascending order with no overlapping entries
 * max   - index of the last table entry (entry count minus one);
 *         a negative value denotes an empty table
 *
 * Returns the width field of the interval that contains ucs, or -1
 * when no interval contains it.
 */
static int bisearch(wchar_t ucs, const struct interval *table, int max)
{
    int min = 0;

    /* Nothing to search, or ucs lies outside the span of the table. */
    if (max < 0 || ucs < table[0].first || ucs > table[max].last)
        return -1;

    while (max >= min)
    {
        /* Computed from the difference so min + max cannot overflow. */
        int mid = min + (max - min) / 2;

        if (ucs > table[mid].last)
            min = mid + 1;
        else if (ucs < table[mid].first)
            max = mid - 1;
        else
            return table[mid].width;
    }
    return -1;
}
/*
 * Look ucs up in the function-local widthset table and hand back
 * whatever bisearch() reports for it.
 */
int mk_wcwidth(wchar_t ucs)
{
    /* Table contents are elided ("...") in the original message. */
    static const struct interval widthset[] = { ... };
    /* bisearch() takes the index of the last entry, not the count. */
    const int last_index = sizeof(widthset) / sizeof(widthset[0]) - 1;

    return bisearch(ucs, widthset, last_index);
}
This way all the logic would be contained in the "widthset" array.
Even better: use a binary tree instead of a bisearch. It'd be
a bit tougher to read the source, so it would probably be better to
have it generated in that case. I've attached an example of that style;
it uses hand-balanced trees to minimize the search needed to determine
whether a character belongs to a language. I think a Perl script
code generator would be needed to do the tree-stringing in the width
case; then the focus would be on tweaking the
(range/width/frequency) data.
If you think I'm off my rocker, let me know. I think Owen
tore up most of my arguments about glib's UTF-8 code pretty
convincingly :)
//unichar.h
//
// Include guard: the contents below are processed only once per
// translation unit, however many times this header is included.
#ifndef unichar_h_already_included
#define unichar_h_already_included
#include <stdint.h>
// Code point type used by the classifier functions: a signed 32-bit integer.
typedef int32_t unichar;
#endif
//unicode_class.cpp
#include "unicode_class.h"
// Node of a range search tree. Each node covers the inclusive code
// point range [start, end]; treesearch() follows `left` for values
// below `start` and `right` for values above `end`. A null link means
// there is no subtree on that side.
class UCSRANGE
{
public:
    unichar start, end;
    const UCSRANGE *left, *right;
};
// Walk the range tree rooted at `at` looking for a node whose
// [start, end] range contains `val`.
//
// val - value to look up
// at  - root of the tree to search; a null root is treated as an
//       empty tree
//
// Returns true if some node on the search path contains val, false
// once the walk runs off the tree (or the tree is empty).
static inline bool treesearch(int val,const UCSRANGE *at)
{
    // Testing the pointer before the first dereference keeps a null
    // root from being dereferenced.
    while (at)
    {
        if (val < at->start)
            at = at->left;
        else if (val > at->end)
            at = at->right;
        else
            return true;
    }
    return false;
}
/*
katakana
reserved(30A0 - 30FF)
hiragana
reserved(3040 - 309F)
kana
3040 - 30FF
kanji
cjk compat: 3300-33FF
cjk ext.A: 3400 - 4DBF
cjk unif: 4E00 - 9FAF
cjk compat ideo: F900 - FAFF
cjk ext.B: 20000 - 2A6DF
cjk suppl: 2F800 - 2FA1F
other
#cjk radicals: 2E80- 2EFF
#kangxi rads: 2F00- 2FDF
#ideo descr: 2FF0- 2FFF
cjk punct: 3000- 303F
cjk enclosed: 3200- 32FF
cjk compat punct: FE30 - FE4F
sf_roomaji
FF00 - FFEF
nihonmoji
kana+kanji+other+sf
3000-30FF, 3200-4DBF, 4E00-9FAF, F900-FAFF, FE30-FE4F, 20000-2A6DF, 2F800-2FA1F
viet
nihonmoji tree::
                          CJK unif
                             |
              |--------------+--------------|
         punct+kana                       ext.B
              |                             |
              |                      |------+------|
    enclosed+compat+ext.A       compatideo       suppl
                                     |
                                compatpunct
*/
// Range-tree nodes for the Japanese classifiers. Every initializer is
// { start, end, left, right }. A node has to be defined before any
// node that takes its address, so leaves come first and roots last.

// Leaf shared by both trees below.
static const UCSRANGE nihon_suppl = { 0x2F800, 0x2FA1F, 0, 0 };

// Tree rooted at nihon_moji_cjk_unif (searched by is_nihonmoji).
static const UCSRANGE nihon_moji_compat_punct = { 0xFE30, 0xFE4F, 0, 0 };
static const UCSRANGE nihon_moji_compat_ideo = { 0xF900, 0xFAFF, 0, &nihon_moji_compat_punct };
static const UCSRANGE nihon_moji_enclosed_extA_compat = { 0x3200, 0x4DBF, 0, 0 };
static const UCSRANGE nihon_moji_extB = { 0x20000, 0x2A6DF, &nihon_moji_compat_ideo, &nihon_suppl };
static const UCSRANGE nihon_moji_punct_kana = { 0x3000, 0x30FF, 0, &nihon_moji_enclosed_extA_compat };
static const UCSRANGE nihon_moji_cjk_unif = { 0x4E00, 0x9FAF, &nihon_moji_punct_kana, &nihon_moji_extB };

// Tree rooted at nihon_kanji_cjk_unif (searched by is_kanji).
static const UCSRANGE nihon_kanji_compat_ideo = { 0xF900, 0xFAFF, 0, 0 };
static const UCSRANGE nihon_kanji_extA_compat = { 0x3300, 0x4DBF, 0, 0 };
static const UCSRANGE nihon_kanji_extB = { 0x20000, 0x2A6DF, &nihon_kanji_compat_ideo, &nihon_suppl };
static const UCSRANGE nihon_kanji_cjk_unif = { 0x4E00, 0x9FAF, &nihon_kanji_extA_compat, &nihon_kanji_extB };
namespace UnicodeClassifier
{
// True for code points in U+3040..U+309F, both ends included.
bool is_hiragana( unichar ch )
{
    return 0x3040 <= ch && ch <= 0x309F;
}

// True for code points in U+30A0..U+30FF, both ends included.
bool is_katakana( unichar ch )
{
    return 0x30A0 <= ch && ch <= 0x30FF;
}

// True for code points in U+3040..U+30FF, i.e. the hiragana and
// katakana ranges taken together.
bool is_kana( unichar ch )
{
    return 0x3040 <= ch && ch <= 0x30FF;
}

// True when ch falls in one of the ranges of the tree rooted at
// nihon_kanji_cjk_unif.
bool is_kanji( unichar ch )
{
    const UCSRANGE *root = &nihon_kanji_cjk_unif;
    return treesearch(ch, root);
}

// True for code points in U+FF00..U+FFEE, both ends included.
// NOTE(review): the range notes earlier in this file list this class
// as FF00 - FFEF, while the check stops at FFEE; confirm which bound
// is intended.
bool is_sf_roomaji( unichar ch )
{
    return 0xFF00 <= ch && ch <= 0xFFEE;
}

// True when ch falls in one of the ranges of the tree rooted at
// nihon_moji_cjk_unif.
bool is_nihonmoji( unichar ch )
{
    const UCSRANGE *root = &nihon_moji_cjk_unif;
    return treesearch(ch, root);
}
}//end namespace UnicodeClassifier
//Vietnamese:
/*
viet tree:: tcvn 6909 based + dong sign
ascii:
0020-007E
| 00A0
| 00C0-00C3
| 00C8-00CA
| 00CC-00CD
| 00D2-00D5
| |--00D9-00DA
| | 00DD
| | 00E0-00E3
| | 00E8-00EA
| | 00EC-00ED
| |--00F2-00F5
| | | 00F9-00FA
| | | 00FD
| | | 0102-0103
| | | 0110-0111
| | |--0128-0129
| | 0168-0169
| | 01A0-01A1
| | 01AF-01B0
| | 0300-0303 -|
| | 0306 |
| | 0309 |-accent marks
| | 031B |
| | 0323 -|
| |
|---1EA0-1EF9 high vowels
|
|--201C-201D double quotes
|
|--20AB dong sign
*/
// Range-tree nodes for the Vietnamese classifier. Every initializer is
// { start, end, left, right }. A node has to be defined before any
// node that takes its address, so the deepest nodes come first and the
// root (viet_ascii) comes last.

// Combining diacritics subtree (hangs off viet_diacr_1 below).
static const UCSRANGE viet_diacr_2 = { 0x0309, 0x0309, 0, 0 };
static const UCSRANGE viet_diacr_4 = { 0x0323, 0x0323, 0, 0 };
static const UCSRANGE viet_diacr_3 = { 0x031B, 0x031B, &viet_diacr_2, &viet_diacr_4 };
static const UCSRANGE viet_diacr_0 = { 0x0300, 0x0303, 0, 0 };

// Level 7.
static const UCSRANGE viet_nbsp = { 0x00A0, 0x00A0, 0, 0 };
static const UCSRANGE viet_letter_I = { 0x00CC, 0x00CD, 0, 0 };
static const UCSRANGE viet_letter_Y = { 0x00DD, 0x00DD, 0, 0 };
static const UCSRANGE viet_letter_y = { 0x00FD, 0x00FD, 0, 0 };
static const UCSRANGE viet_diacr_1 = { 0x0306, 0x0306, &viet_diacr_0, &viet_diacr_3 };

// Level 6.
static const UCSRANGE viet_letter_A = { 0x00C0, 0x00C3, &viet_nbsp, 0 };
static const UCSRANGE viet_letter_O = { 0x00D2, 0x00D5, &viet_letter_I, 0 };
static const UCSRANGE viet_letter_a = { 0x00E0, 0x00E3, &viet_letter_Y, 0 };
static const UCSRANGE viet_letter_i = { 0x00EC, 0x00ED, 0, 0 };
static const UCSRANGE viet_letter_u = { 0x00F9, 0x00FA, 0, &viet_letter_y };
static const UCSRANGE viet_letter_d = { 0x0110, 0x0111, 0, 0 };
static const UCSRANGE viet_letter_u2 = { 0x0168, 0x0169, 0, 0 };
static const UCSRANGE viet_letter_u3 = { 0x01AF, 0x01B0, 0, &viet_diacr_1 };

// Level 5.
static const UCSRANGE viet_letter_E = { 0x00C8, 0x00CA, &viet_letter_A, &viet_letter_O };
static const UCSRANGE viet_letter_e = { 0x00E8, 0x00EA, &viet_letter_a, &viet_letter_i };
static const UCSRANGE viet_letter_a2 = { 0x0102, 0x0103, &viet_letter_u, &viet_letter_d };
static const UCSRANGE viet_letter_o2 = { 0x01A0, 0x01A1, &viet_letter_u2, &viet_letter_u3 };

// Level 4.
static const UCSRANGE viet_letter_U = { 0x00D9, 0x00DA, &viet_letter_E, &viet_letter_e };
static const UCSRANGE viet_letter_i2 = { 0x0128, 0x0129, &viet_letter_a2, &viet_letter_o2 };
static const UCSRANGE viet_dong = { 0x20AB, 0x20AB, 0, 0 };

// Level 3: directional double quotes, and the letter subtree root.
static const UCSRANGE viet_dbl_quotes = { 0x201C, 0x201D, 0, &viet_dong };
static const UCSRANGE viet_letter_o = { 0x00F2, 0x00F5, &viet_letter_U, &viet_letter_i2 };

// Level 2: high Vietnamese vowel range.
static const UCSRANGE viet_high_vowels = { 0x1EA0, 0x1EF9, &viet_letter_o, &viet_dbl_quotes };

// ASCII: this is the trunk of the tree; most code points stop here.
static const UCSRANGE viet_ascii = { 0x0020, 0x007E, 0, &viet_high_vowels };
namespace UnicodeClassifier
{
// True when ch falls in one of the ranges of the tree rooted at
// viet_ascii.
bool is_viet( unichar ch )
{
    const UCSRANGE *root = &viet_ascii;
    return treesearch(ch, root);
}
}//end namespace UnicodeClassifier
//unicode_class.h
// Include guard. The name avoids double underscores, which are
// reserved for the implementation.
#ifndef UNICODE_CLASS_H
#define UNICODE_CLASS_H
#include "unichar.h"
// Predicates that report whether a code point belongs to a given
// script or character class.
namespace UnicodeClassifier
{
//Vietnamese:
// True if ch lies in one of the ranges of the Vietnamese range tree
// defined in unicode_class.cpp.
bool is_viet( unichar ch );
//Japanese:
// True if ch is in U+3040..U+309F.
bool is_hiragana( unichar ch );
// True if ch is in U+30A0..U+30FF.
bool is_katakana( unichar ch );
// True if ch is in U+3040..U+30FF (hiragana and katakana ranges).
bool is_kana( unichar ch );
// True if ch is in one of: U+3300..U+4DBF, U+4E00..U+9FAF,
// U+F900..U+FAFF, U+20000..U+2A6DF, U+2F800..U+2FA1F.
bool is_kanji( unichar ch );
// True if ch is in U+FF00..U+FFEE.
bool is_sf_roomaji( unichar ch );
// True if ch is in one of: U+3000..U+30FF, U+3200..U+4DBF,
// U+4E00..U+9FAF, U+F900..U+FAFF, U+FE30..U+FE4F, U+20000..U+2A6DF,
// U+2F800..U+2FA1F.
bool is_nihonmoji( unichar ch );
}
#endif // UNICODE_CLASS_H