[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: mk_wcwidth



> The current version is not limited to the BMP.
My fault, I was thinking wchar_t was a 16-bit type for some
reason.
 

> > mk_wcwidth could use some simplification
> 
> Please elaborate ...

/*
 * One entry of the lookup table: every code point in the inclusive range
 * [first, last] is assigned the value `width`.
 */
struct interval
{
  int first;
  int last;
  int width;
};

/*
 * Binary search for `ucs` in `table`.
 *
 *   ucs   - code point to look up
 *   table - intervals sorted by ascending `first`, non-overlapping
 *   max   - index of the LAST valid entry in `table` (not the element
 *           count); a negative value denotes an empty table
 *
 * Returns the `width` of the interval containing `ucs`, or -1 when no
 * interval contains it.
 */
static int bisearch(wchar_t ucs, const struct interval *table, int max)
{
  int min = 0;
  int mid;

  /* Reject an empty table and anything outside the overall span up front;
   * the search below only runs when a match is possible. */
  if (max < 0 || ucs < table[0].first || ucs > table[max].last)
    return -1;

  while (max >= min)
  {
    /* Written this way so that min + max cannot overflow. */
    mid = min + (max - min) / 2;
    if (ucs > table[mid].last)
      min = mid + 1;
    else if (ucs < table[mid].first)
      max = mid - 1;
    else
      return table[mid].width;
  }

  /* Fell into a gap between two intervals. */
  return -1;
}

/*
 * Looks `ucs` up in the static `widthset` interval table and returns
 * whatever bisearch() reports for it.
 *
 * NOTE: the table contents are elided ("...") in this sketch, so this
 * function is illustrative only and will not compile as written.
 */
int mk_wcwidth(wchar_t ucs)
{
  static const struct interval widthset[] = { ... };

  /* bisearch() expects the index of the last entry, hence the "- 1". */
  return bisearch(ucs, widthset, sizeof(widthset) / sizeof(struct
interval) - 1);
}

This way, all the logic would be contained in the "widthset" array.

Even better: use a btree instead of a bisearch. It'd be
a bit tougher to read the source, so it is probably better to have it
generated in that case. I've attached an example of that style;
it uses hand-balanced trees to minimize the search needed to determine
whether a character belongs to a language. I think a Perl script
code generator would be needed to do the tree-stringing in the width
case; then the focus would be on tweaking the
(range/width/frequency) data.

If you think I'm off my rocker, let me know. I think Owen
tore up most of my arguments about glib's UTF-8 code pretty
convincingly :)
//unichar.h
//


// Include guard: makes repeated inclusion of this header harmless.
#ifndef unichar_h_already_included
#define unichar_h_already_included

#include <stdint.h>

// Code-point type used by the classifier functions: a signed 32-bit integer.
typedef int32_t unichar;

#endif


//unicode_class.cpp

#include "unicode_class.h"

// One node of a binary search tree keyed on code-point ranges.
// All members are public and there are no constructors, so nodes are
// written as brace-initialised constants: { start, end, left, right }.
class UCSRANGE
{
public:
	// Inclusive bounds of the range held by this node.
	unichar start, end;

	// Subtree followed for values below `start` / above `end`;
	// a null pointer means there is no such subtree.
	const UCSRANGE *left, *right;
};

// Walks the range tree rooted at `at` looking for a node whose inclusive
// [start, end] range contains `val`.
//
//   val - value to classify
//   at  - root of the tree; may be null, which is treated as an empty tree
//
// Returns true when some node's range contains `val`, false otherwise.
static inline bool treesearch(int val,const UCSRANGE *at)
{
	// Testing the pointer before the first dereference (rather than after,
	// as a do/while would) keeps a null root from being dereferenced.
	while(at)
	{
		if( val < at->start)
			at=at->left;
		else if( val > at->end)
			at=at->right;
		else
			return true;
	}

	return false;
}


/*
katakana
	reserved(30A0 - 30FF)

hiragana
	reserved(3040 - 309F)

kana
	3040 - 30FF

kanji
	cjk compat: 3300-33FF
	cjk ext.A: 3400 - 4DBF
	cjk unif: 4E00 - 9FAF
	cjk compat ideo: F900 - FAFF
	cjk ext.B: 20000 - 2A6DF
	cjk suppl: 2F800 - 2FA1F

other
		#cjk radicals: 2E80- 2EFF
		#kangxi rads: 2F00- 2FDF
		#ideo descr: 2FF0- 2FFF
	cjk punct:   3000- 303F
	cjk enclosed: 3200- 32FF
	cjk compat punct: FE30 - FE4F

sf_roomaji
	FF00 - FFEF

nihonmoji
	kana+kanji+other+sf

	3000-30FF, 3200-4DBF, 4E00-9FAF, F900-FAFF, FE30-FE4F, 20000-2A6DF, 2F800-2FA1F

viet

nihonmoji tree::                  CJK unif
                                   |
                       |-----------+-----------|
                  punct+kana                 ext.B
                           |                  |
                           |             |----+-----|
             enclosed+compat+ext.A    compatideo   suppl
                                           |
                                     compatpunct

*/
// Range-tree nodes for the Japanese classifiers.
// Initialiser layout: { start, end, left subtree, right subtree }.
// A node must be defined before any node that points at it, so leaves
// come first and each tree's root comes last.

// Leaf shared by both trees: 2F800-2FA1F.
static const UCSRANGE nihon_suppl                     = { 0x2F800, 0x2FA1F, 0, 0 };

// ---- "nihonmoji" tree, rooted at nihon_moji_cjk_unif ----
static const UCSRANGE nihon_moji_compat_punct         = { 0xFE30, 0xFE4F, 0, 0 };
static const UCSRANGE nihon_moji_compat_ideo          = { 0xF900, 0xFAFF, 0, &nihon_moji_compat_punct };
static const UCSRANGE nihon_moji_enclosed_extA_compat = { 0x3200, 0x4DBF, 0, 0 };
static const UCSRANGE nihon_moji_extB                 = { 0x20000, 0x2A6DF, &nihon_moji_compat_ideo, &nihon_suppl };
static const UCSRANGE nihon_moji_punct_kana           = { 0x3000, 0x30FF, 0, &nihon_moji_enclosed_extA_compat };
static const UCSRANGE nihon_moji_cjk_unif             = { 0x4E00, 0x9FAF, &nihon_moji_punct_kana, &nihon_moji_extB };

// ---- "kanji" tree, rooted at nihon_kanji_cjk_unif ----
static const UCSRANGE nihon_kanji_compat_ideo         = { 0xF900, 0xFAFF, 0, 0 };
static const UCSRANGE nihon_kanji_extA_compat         = { 0x3300, 0x4DBF, 0, 0 };
static const UCSRANGE nihon_kanji_extB                = { 0x20000, 0x2A6DF, &nihon_kanji_compat_ideo, &nihon_suppl };
static const UCSRANGE nihon_kanji_cjk_unif            = { 0x4E00, 0x9FAF, &nihon_kanji_extA_compat, &nihon_kanji_extB };

namespace UnicodeClassifier
{

// True when ch lies in 3040-309F (inclusive).
bool is_hiragana( unichar ch )
{
	return !(ch < 0x3040 || ch > 0x309F);
}

// True when ch lies in 30A0-30FF (inclusive).
bool is_katakana( unichar ch )
{
	return !(ch < 0x30A0 || ch > 0x30FF);
}

// True when ch lies in 3040-30FF (inclusive), i.e. the hiragana and
// katakana ranges taken together.
bool is_kana( unichar ch )
{
	return !(ch < 0x3040 || ch > 0x30FF);
}

// True when ch falls inside any range of the tree rooted at
// nihon_kanji_cjk_unif.
bool is_kanji( unichar ch )
{
	const UCSRANGE *root = &nihon_kanji_cjk_unif;
	return treesearch(ch, root);
}

// True when ch lies in FF00-FFEE (inclusive).
// NOTE(review): the range notes earlier in this file list FF00-FFEF;
// the upper bound tested here is FFEE - confirm which is intended.
bool is_sf_roomaji( unichar ch )
{
	return !(ch < 0xFF00 || ch > 0xFFEE);
}

// True when ch falls inside any range of the tree rooted at
// nihon_moji_cjk_unif.
bool is_nihonmoji( unichar ch )
{
	const UCSRANGE *root = &nihon_moji_cjk_unif;
	return treesearch(ch, root);
}

}//end namespace UnicodeClassifier


//Vietnamese:
/*
viet tree:: tcvn 6909 based + dong sign

ascii: 
0020-007E
|                       00A0
|                   00C0-00C3
|               00C8-00CA
|                       00CC-00CD
|                   00D2-00D5
|        |--00D9-00DA
|        |              00DD 
|        |          00E0-00E3
|        |      00E8-00EA
|        |          00EC-00ED
|    |--00F2-00F5
|    |   |          00F9-00FA
|    |   |              00FD
|    |   |      0102-0103
|    |   |          0110-0111
|    |   |--0128-0129
|    |              0168-0169
|    |          01A0-01A1
|    |             	01AF-01B0
|    |                      0300-0303 -|
|    |                  0306           |
|    |                          0309   |-accent marks
|    |                      031B       |
|    |                          0323  -|
|    |
|---1EA0-1EF9   high vowels
     |
     |--201C-201D  double quotes
	     |
         |--20AB  dong sign
*/



// Range-tree nodes for the Vietnamese classifier.
// Initialiser layout: { start, end, left subtree, right subtree }.
// A node must be defined before any node that points at it, so the
// deepest nodes come first and the root (viet_ascii) comes last.

// diacritics
static const UCSRANGE viet_diacr_2     = { 0x0309, 0x0309, 0, 0 };
static const UCSRANGE viet_diacr_4     = { 0x0323, 0x0323, 0, 0 };
static const UCSRANGE viet_diacr_3     = { 0x031B, 0x031B, &viet_diacr_2, &viet_diacr_4 };
static const UCSRANGE viet_diacr_0     = { 0x0300, 0x0303, 0, 0 };

// level 7
static const UCSRANGE viet_nbsp        = { 0x00A0, 0x00A0, 0, 0 };
static const UCSRANGE viet_letter_I    = { 0x00CC, 0x00CD, 0, 0 };
static const UCSRANGE viet_letter_Y    = { 0x00DD, 0x00DD, 0, 0 };
static const UCSRANGE viet_letter_y    = { 0x00FD, 0x00FD, 0, 0 };
static const UCSRANGE viet_diacr_1     = { 0x0306, 0x0306, &viet_diacr_0, &viet_diacr_3 };

// level 6
static const UCSRANGE viet_letter_A    = { 0x00C0, 0x00C3, &viet_nbsp, 0 };
static const UCSRANGE viet_letter_O    = { 0x00D2, 0x00D5, &viet_letter_I, 0 };
static const UCSRANGE viet_letter_a    = { 0x00E0, 0x00E3, &viet_letter_Y, 0 };
static const UCSRANGE viet_letter_i    = { 0x00EC, 0x00ED, 0, 0 };
static const UCSRANGE viet_letter_u    = { 0x00F9, 0x00FA, 0, &viet_letter_y };
static const UCSRANGE viet_letter_d    = { 0x0110, 0x0111, 0, 0 };
static const UCSRANGE viet_letter_u2   = { 0x0168, 0x0169, 0, 0 };
static const UCSRANGE viet_letter_u3   = { 0x01AF, 0x01B0, 0, &viet_diacr_1 };

// level 5
static const UCSRANGE viet_letter_E    = { 0x00C8, 0x00CA, &viet_letter_A, &viet_letter_O };
static const UCSRANGE viet_letter_e    = { 0x00E8, 0x00EA, &viet_letter_a, &viet_letter_i };
static const UCSRANGE viet_letter_a2   = { 0x0102, 0x0103, &viet_letter_u, &viet_letter_d };
static const UCSRANGE viet_letter_o2   = { 0x01A0, 0x01A1, &viet_letter_u2, &viet_letter_u3 };

// level 4
static const UCSRANGE viet_letter_U    = { 0x00D9, 0x00DA, &viet_letter_E, &viet_letter_e };
static const UCSRANGE viet_letter_i2   = { 0x0128, 0x0129, &viet_letter_a2, &viet_letter_o2 };
static const UCSRANGE viet_dong        = { 0x20AB, 0x20AB, 0, 0 };

// level 3
// directional double quotes
static const UCSRANGE viet_dbl_quotes  = { 0x201C, 0x201D, 0, &viet_dong };
static const UCSRANGE viet_letter_o    = { 0x00F2, 0x00F5, &viet_letter_U, &viet_letter_i2 };

// level 2
// high viet vowel range
static const UCSRANGE viet_high_vowels = { 0x1EA0, 0x1EF9, &viet_letter_o, &viet_dbl_quotes };

// ascii
// this is the trunk, most code points stop here
static const UCSRANGE viet_ascii       = { 0x0020, 0x007E, 0, &viet_high_vowels };

namespace UnicodeClassifier
{

// True when ch falls inside any range of the tree rooted at viet_ascii.
bool is_viet( unichar ch )
{
	const UCSRANGE *root = &viet_ascii;
	return treesearch(ch, root);
}

}//end namespace UnicodeClassifier



//unicode_class.h

// Include guard. The name deliberately avoids a double underscore:
// identifiers containing "__" are reserved for the implementation.
#ifndef unicode_class_h_already_included
#define unicode_class_h_already_included

#include "unichar.h"

// Predicates that report whether a code point belongs to a given
// script or character class.
namespace UnicodeClassifier
{

//Vietnamese:

// True when ch is in one of the ranges of the Vietnamese tree.
bool is_viet( unichar ch );

//Japanese:

// True when ch is in 3040-309F.
bool is_hiragana( unichar ch );
// True when ch is in 30A0-30FF.
bool is_katakana( unichar ch );
// True when ch is in 3040-30FF.
bool is_kana( unichar ch );

// True when ch is in one of the ranges of the kanji tree.
bool is_kanji( unichar ch );

// True when ch is in FF00-FFEE.
bool is_sf_roomaji( unichar ch );

// True when ch is in one of the ranges of the nihonmoji tree.
bool is_nihonmoji( unichar ch );


}

#endif