[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
wcwidth() implementation
Attached is my public domain implementation of the wcwidth() and
wcswidth() functions. I hope you will find it useful for inclusion into
glibc, xterm, etc. The function wcwidth() distinguishes between normal,
wide, and combining characters, and wcswidth() can be used to predict
how many columns a string sent to a terminal emulator such as xterm will
occupy on the screen.
One day in the future, perhaps people want to make wcwidth() locale
dependent (e.g., a UTF-8@JIS locale in which files have UTF-8 encoding
but the exact same width as JIS users are used to, i.e. wide Greek,
etc.). But for the moment, it seems easiest if we just stick to one
simple definition and do not force the issue by introducing yet another
incompatibility.
Once the C library provides such a wcwidth() function, xterm can call it
in order to determine whether to pick a character from the halfwidth or
fullwidth font. This way, applications and terminal emulator will
eventually get the width information from the same source (libc), and
even if people want to have different width conventions for terminal
emulators, it can all be handled by selecting proper locales on all
sides. For the moment however, let's see whether we can stick to just
one single width table, namely the one implemented in the attached
function.
Code and man page also on:
http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c
http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.html
http://www.cl.cam.ac.uk/~mgk25/ucs/wcswidth.html
Markus
--
Markus G. Kuhn, Computer Laboratory, University of Cambridge, UK
Email: mkuhn at acm.org, WWW: <http://www.cl.cam.ac.uk/~mgk25/>
/*
* This is an implementation of wcwidth() and wcswidth() as defined in
* "The Single UNIX Specification, Version 2, The Open Group, 1997"
* <http://www.UNIX-systems.org/online.html>
*
* Markus Kuhn -- 2000-02-08 -- public domain
*/
#include <wchar.h>
/* These functions define the column width of an ISO 10646 character
* as follows:
*
* - The null character (U+0000) has a column width of 0.
*
* - Other C0/C1 control characters and DEL will lead to a return
* value of -1.
*
* - Non-spacing and enclosing combining characters (general
* category code Mn or Me in the Unicode database) have a
* column width of 0.
*
* - Spacing characters in the East Asian Wide (W) or East Asian
* FullWidth (F) category as defined in Unicode Technical
* Report #11 have a column width of 2.
*
* - All remaining characters (including all printable
* ISO 8859-1 and WGL4 characters, Unicode control characters,
* etc.) have a column width of 1.
*
* This implementation assumes that wchar_t characters are encoded
* in ISO 10646.
*/
int wcwidth(wchar_t ucs)
{
/* sorted list of non-overlapping intervals of non-spacing characters */
static const struct interval {
unsigned short first;
unsigned short last;
} combining[] = {
{ 0x0300, 0x034E }, { 0x0360, 0x0362 }, { 0x0483, 0x0486 },
{ 0x0488, 0x0489 }, { 0x0591, 0x05A1 }, { 0x05A3, 0x05B9 },
{ 0x05BB, 0x05BD }, { 0x05BF, 0x05BF }, { 0x05C1, 0x05C2 },
{ 0x05C4, 0x05C4 }, { 0x064B, 0x0655 }, { 0x0670, 0x0670 },
{ 0x06D6, 0x06E4 }, { 0x06E7, 0x06E8 }, { 0x06EA, 0x06ED },
{ 0x0711, 0x0711 }, { 0x0730, 0x074A }, { 0x07A6, 0x07B0 },
{ 0x0901, 0x0902 }, { 0x093C, 0x093C }, { 0x0941, 0x0948 },
{ 0x094D, 0x094D }, { 0x0951, 0x0954 }, { 0x0962, 0x0963 },
{ 0x0981, 0x0981 }, { 0x09BC, 0x09BC }, { 0x09C1, 0x09C4 },
{ 0x09CD, 0x09CD }, { 0x09E2, 0x09E3 }, { 0x0A02, 0x0A02 },
{ 0x0A3C, 0x0A3C }, { 0x0A41, 0x0A42 }, { 0x0A47, 0x0A48 },
{ 0x0A4B, 0x0A4D }, { 0x0A70, 0x0A71 }, { 0x0A81, 0x0A82 },
{ 0x0ABC, 0x0ABC }, { 0x0AC1, 0x0AC5 }, { 0x0AC7, 0x0AC8 },
{ 0x0ACD, 0x0ACD }, { 0x0B01, 0x0B01 }, { 0x0B3C, 0x0B3C },
{ 0x0B3F, 0x0B3F }, { 0x0B41, 0x0B43 }, { 0x0B4D, 0x0B4D },
{ 0x0B56, 0x0B56 }, { 0x0B82, 0x0B82 }, { 0x0BC0, 0x0BC0 },
{ 0x0BCD, 0x0BCD }, { 0x0C3E, 0x0C40 }, { 0x0C46, 0x0C48 },
{ 0x0C4A, 0x0C4D }, { 0x0C55, 0x0C56 }, { 0x0CBF, 0x0CBF },
{ 0x0CC6, 0x0CC6 }, { 0x0CCC, 0x0CCD }, { 0x0D41, 0x0D43 },
{ 0x0D4D, 0x0D4D }, { 0x0DCA, 0x0DCA }, { 0x0DD2, 0x0DD4 },
{ 0x0DD6, 0x0DD6 }, { 0x0E31, 0x0E31 }, { 0x0E34, 0x0E3A },
{ 0x0E47, 0x0E4E }, { 0x0EB1, 0x0EB1 }, { 0x0EB4, 0x0EB9 },
{ 0x0EBB, 0x0EBC }, { 0x0EC8, 0x0ECD }, { 0x0F18, 0x0F19 },
{ 0x0F35, 0x0F35 }, { 0x0F37, 0x0F37 }, { 0x0F39, 0x0F39 },
{ 0x0F71, 0x0F7E }, { 0x0F80, 0x0F84 }, { 0x0F86, 0x0F87 },
{ 0x0F90, 0x0F97 }, { 0x0F99, 0x0FBC }, { 0x0FC6, 0x0FC6 },
{ 0x102D, 0x1030 }, { 0x1032, 0x1032 }, { 0x1036, 0x1037 },
{ 0x1039, 0x1039 }, { 0x1058, 0x1059 }, { 0x17B7, 0x17BD },
{ 0x17C6, 0x17C6 }, { 0x17C9, 0x17D3 }, { 0x18A9, 0x18A9 },
{ 0x20D0, 0x20E3 }, { 0x302A, 0x302F }, { 0x3099, 0x309A },
{ 0xFB1E, 0xFB1E }, { 0xFE20, 0xFE23 }
};
int min = 0;
int max = sizeof(combining) / sizeof(struct interval) - 1;
int mid;
/* test for 8-bit control characters */
if (ucs == 0)
return 0;
if (ucs < 32 || (ucs >= 0x7f && ucs < 0xa0))
return -1;
/* first quick check for Latin-1 etc. characters */
if (ucs < combining[0].first)
return 1;
/* binary search in table of non-spacing characters */
while (max >= min) {
mid = (min + max) / 2;
if (combining[mid].last < ucs)
min = mid + 1;
else if (combining[mid].first > ucs)
max = mid - 1;
else if (combining[mid].first <= ucs && combining[mid].last >= ucs)
return 0;
}
/* if we arrive here, ucs is not a combining or C0/C1 control character */
/* fast test for majority of non-wide scripts */
if (ucs < 0x1100)
return 1;
return 1 +
((ucs >= 0x1100 && ucs <= 0x115f) || /* Hangul Jamo */
(ucs >= 0x2e80 && ucs <= 0xa4cf && (ucs & ~0x0011) != 0x300a &&
ucs != 0x303f) || /* CJK ... Yi */
(ucs >= 0xac00 && ucs <= 0xd7a3) || /* Hangul Syllables */
(ucs >= 0xf900 && ucs <= 0xfaff) || /* CJK Compatibility Ideographs */
(ucs >= 0xfe30 && ucs <= 0xfe6f) || /* CJK Compatibility Forms */
(ucs >= 0xff00 && ucs <= 0xff5f) || /* Fullwidth Forms */
(ucs >= 0xffe0 && ucs <= 0xffe6));
}
int wcswidth(const wchar_t *pwcs, size_t n)
{
int w, width = 0;
for (;*pwcs && n-- > 0; pwcs++)
if ((w = wcwidth(*pwcs)) < 0)
return -1;
else
width += w;
return width;
}