OS/2 codes: How to support DBCS #2 Manipulate DBCS strings

Unlike SBCS, a DBCS (double-byte character set) character consists of two bytes, as its name suggests. ^^ So manipulating DBCS strings requires some extra work. For example, strlen() returns the length of a string in bytes, not in characters. And when walking through a DBCS string, the index must be advanced by 2 bytes for each DBCS character.

To solve the above problems, the most basic step is to get information about the DBCS lead-bytes. Here, a lead-byte is the first byte of a DBCS character, and the second byte is called a trail-byte.

On OS/2, there are two ways to do this. One is DosQueryDBCSEnv(), and the other is UniQueryUconvObject(). Using these functions, it is possible to determine whether a character is an SBCS character or a DBCS character. DosQueryDBCSEnv() gives a vector of lead-byte ranges, whereas UniQueryUconvObject() gives a vector of character sizes indexed by lead-byte. If you prefer compatibility, then use DosQueryDBCSEnv().

Once you have the lead-byte information, the next step is to determine whether a character is a DBCS character. This is easy. A DBCS character consists of two bytes. The first byte, that is, the lower byte, is the lead-byte, and the second byte, that is, the higher byte, is the trail-byte. So by checking just the lower byte of a character, it is possible to determine whether it is a DBCS character.

However, the more important thing is to determine this inside a string. In a string, it is not possible to decide by looking at an arbitrary byte alone, because a string is a sequence of consecutive characters and, unlike a lead-byte, a trail-byte has no restrictions on its value. So testing an arbitrary byte directly may cause a trail-byte to be recognized as a lead-byte. For example, consider this case.

    SSLTLTSSSLTSLTSSSLTLTSSS

Here, S is an SBCS character, L is a DBCS lead-byte and T is a DBCS trail-byte. If you choose index 2[L], it is correctly determined to be a lead-byte. However, index 3[T] may be mis-recognized as a lead-byte. In that case, the above string is now treated like this.

    SSLLTLTSSLTSLTSSSLTLTSSS

To avoid this, you should scan the string from its beginning up to the index of the wanted character. And when determining, note that [unsigned char] is needed to process DBCS. A plain (signed) [char] cannot process a DBCS character properly, because it cannot hold a value greater than 127.

Here is the code for the above explanation.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
#include <string.h>

/* Byte-length table: entry N holds the length in bytes of a character
   whose first byte has the value N. */
static char m_achCharLen[ 0x100 ];

/* Look up the length of the character starting with byte c. The byte is
   converted to unsigned char first so that values above 127 index the
   table correctly even where plain char is signed. */
#define GET_CHAR_LEN( c ) ( m_achCharLen[ ( unsigned char )( c ) ] )

/* Define USE_DOSAPI to 1 for DosQueryDBCSEnv(),
   or to 0 for UniQueryUconvObject(). */


#define USE_DOSAPI  1

#if USE_DOSAPI
#define INCL_DOSNLS
#include <os2.h>

/*
 * Fill m_achCharLen from the lead-byte ranges reported by DosQueryDBCSEnv().
 *
 * Every entry is first set to 1 (SBCS). Each byte value that falls inside
 * a reported lead-byte range is then set to 2. If the query fails, the
 * table is left as pure SBCS.
 */
__attribute__((constructor))    /* Call at startup. */
static void InitCharLen( void )
{
    COUNTRYCODE cc = {0, }; /* Default system language and current code page */
    UCHAR       auchDBCSLeadBytes[ 12 ]; /* A vector for lead-bytes range */
    int         i, j;

    /* Assume SBCS by default */
    memset( m_achCharLen, 1, sizeof( m_achCharLen ));
    memset( auchDBCSLeadBytes, 0, sizeof( auchDBCSLeadBytes ));

    /* A non-zero return code means an error; the vector contents cannot
       be trusted then, so keep the SBCS-only table. */
    if( DosQueryDBCSEnv( sizeof( auchDBCSLeadBytes ), &cc,
                         auchDBCSLeadBytes ) != 0 )
        return;

    /* Lead-bytes ranges end with two zero byte. The index is bounded as
       well, so a vector without the terminating pair is never read past
       its end. */
    for( i = 0;
         i + 1 < ( int )sizeof( auchDBCSLeadBytes ) &&
         ( auchDBCSLeadBytes[ i ] || auchDBCSLeadBytes[ i + 1 ]);
         i += 2 )
    {
        /* If a DBCS character, set its character size to 2. */
        for( j = auchDBCSLeadBytes[ i ]; j <= auchDBCSLeadBytes[ i + 1 ]; j++ )
            m_achCharLen[ j ] = 2;
    }
}
#else
#include <uconv.h>

/*
 * Fill m_achCharLen from the conversion object of the current locale,
 * using UniQueryUconvObject().
 *
 * Every entry is first set to 1 (SBCS). If the conversion object cannot
 * be created or queried, the table is left as pure SBCS. Entries reported
 * as 255 are treated as single-byte characters.
 *
 * The function is registered as a constructor because nothing else calls
 * it; without that the table would stay all zero and any loop advancing
 * by GET_CHAR_LEN() would never make progress.
 */
__attribute__((constructor))    /* Call at startup. */
static void InitCharLen( void )
{
    UconvObject       uconv;
    uconv_attribute_t attr;
    int               i;

    /* Assume SBCS by default. */
    memset( m_achCharLen, 1, sizeof( m_achCharLen ));

    /* Create UconvObject for a current locale. A non-zero return code
       means failure; uconv is not valid then, so keep the SBCS table. */
    if( UniCreateUconvObject(( UniChar *)L"", &uconv ) != 0 )
        return;

    /* Get a character length in according to a character code. On failure
       the table may have been partially written, so reset it. */
    if( UniQueryUconvObject( uconv, &attr, sizeof( attr ),
                             m_achCharLen, NULL, NULL ) != 0 )
        memset( m_achCharLen, 1, sizeof( m_achCharLen ));

    UniFreeUconvObject( uconv );

    /* treat a code point as a SBCS character */
    for( i = 0; i < 256; i++ )
        if( m_achCharLen[ i ] == ( char )255 )
            m_achCharLen[ i ] = 1;
}
#endif

/*
 * Return the number of characters (not bytes) in str, using the
 * m_achCharLen table to find out how many bytes each character occupies.
 * A character cut short by the terminator is still counted once.
 */
static int DBCSStrlen( const char *str )
{
    int count = 0;

    while( *str )
    {
        /* Skip the whole character: one byte for SBCS, more for DBCS. */
        str += GET_CHAR_LEN( *str );
        count++;

        /* If the last byte skipped was the terminator, the character was
           broken; stop here instead of running past the end. */
        if( str[ -1 ] == '\0' )
            break;
    }

    return count;
}

/*
 * Tell whether the byte at index pos of str is a DBCS lead-byte.
 *
 * The string is walked character by character from its beginning, because
 * a trail-byte value alone cannot be told apart from a lead-byte value.
 *
 * Returns non-zero if a character starts exactly at pos and its table
 * length is 2, and 0 otherwise. 0 is also returned when pos is negative
 * or lies beyond the terminator; the walk never reads past the
 * terminating NUL.
 */
static int IsDBCSLeadByte( const char *str, int pos )
{
    int i = 0;

    /* Walk through from first. */
    while( i < pos )
    {
        int n = GET_CHAR_LEN( str[ i ]);
        int k;

        /* Every byte of the character being skipped must exist. Hitting
           the terminator here means pos is inside a broken character or
           past the end of the string, so it is not a lead-byte. */
        for( k = 0; k < n; k++ )
            if( !str[ i + k ])
                return 0;

        i += n;
    }

    /* Check a character at position. */
    return i == pos && GET_CHAR_LEN( str[ i ]) == 2;
}

/*
 * Tell whether the byte at index pos of str is a DBCS trail-byte.
 *
 * The string is walked character by character from its beginning. If the
 * walk jumps over pos, then pos is inside a multi-byte character, that is,
 * a trail-byte.
 *
 * Returns non-zero for a trail-byte position and 0 otherwise. 0 is also
 * returned when pos is negative or lies beyond the terminator; the walk
 * never reads past the terminating NUL. As before, when the terminator
 * cuts a DBCS character short, a pos up to and including the terminator
 * is still reported as a trail position.
 */
static int IsDBCSTrailByte( const char *str, int pos )
{
    int i = 0;

    /* A negative index cannot be inside any character. */
    if( pos < 0 )
        return 0;

    /* Walk through from first. */
    while( i < pos )
    {
        int n = GET_CHAR_LEN( str[ i ]);
        int k;

        /* Terminator reached before pos: pos is past the end. */
        if( !str[ i ])
            return 0;

        /* Broken character: the terminator at i + k cuts it short. pos is
           a trail position only if it is not beyond that terminator. */
        for( k = 1; k < n; k++ )
            if( !str[ i + k ])
                return pos <= i + k;

        i += n;
    }

    /* Passed a position ? Then it is a trail-byte. For example, if str is
       LTLT, and pos is 1, then the walk stops at index 2, past index 1. */
    return i > pos;
}

#include <stdio.h>

/*
 * Demo driver: print the character count of the first command line
 * argument, then classify each of its bytes as SBCS, DBCS lead-byte or
 * DBCS trail-byte. Returns 1 when no argument is given, 0 otherwise.
 */
int main( int argc, char *argv[])
{
    const char *text;
    int pos = 0;

    /* A string to analyze is mandatory. */
    if( argc < 2 )
    {
        fprintf(stderr, "Please specify any string !!!");

        return 1;
    }

    text = argv[ 1 ];

    printf("Length of [%s] is %d\n", text, DBCSStrlen( text ));

    while( text[ pos ])
    {
        if( !IsDBCSLeadByte( text, pos ))
        {
            printf("%3d: SBCS\n", pos );
            pos++;
            continue;
        }

        printf("%3d: DBCS Lead byte\n", pos );
        pos++;

        /* A lead-byte directly followed by the terminator is a broken
           DBCS character; report a trail-byte only if one is present. */
        if( text[ pos ])
        {
            printf("%3d: DBCS trail byte\n", pos );
            pos++;
        }
    }

    return 0;
}


// ----- 2015/01/12

Locale version


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
#include <stdlib.h>

/*
 * Return the number of characters (not bytes) in str, using mblen() and
 * therefore the current LC_CTYPE locale to find each character's size.
 *
 * Counting stops at the first broken (invalid) character, which is still
 * counted once.
 */
static int DBCSStrlen( const char *str )
{
    const char *p;
    int n;
    int len;

    /* The pointer is advanced only after n has been validated, so it is
       never moved backwards by a -1 result. */
    for( len = 0, p = str; *p; p += n )
    {
        n = mblen( p, MB_CUR_MAX );
        len++;

        if( n <= 0 )    /* Broken DBCS character ? */
            break;
    }

    return len;
}

/*
 * Tell whether the byte at index pos of str is a DBCS lead-byte, using
 * mblen() and therefore the current LC_CTYPE locale.
 *
 * The string is walked character by character from its beginning.
 * Returns non-zero if a character starts exactly at pos and mblen()
 * reports it as 2 bytes long, and 0 otherwise. 0 is also returned when a
 * broken character is met on the way, when pos is negative, or when pos
 * lies beyond the terminator.
 */
static int IsDBCSLeadByte( const char *str, int pos )
{
    int i = 0;
    int n;

    /* Walk through from first. */
    while( i < pos )
    {
        n = mblen( str + i, MB_CUR_MAX );

        /* n < 0: broken DBCS character. n == 0: the terminator was
           reached before pos; without this check the walk would never
           advance and loop forever. */
        if( n <= 0 )
            return 0;

        i += n;
    }

    /* Check a character at position. */
    return i == pos && mblen( str + i, MB_CUR_MAX ) == 2;
}

/*
 * Tell whether the byte at index pos of str is a DBCS trail-byte, using
 * mblen() and therefore the current LC_CTYPE locale.
 *
 * The string is walked character by character from its beginning. If the
 * walk jumps over pos, then pos is inside a multi-byte character, that is,
 * a trail-byte. Returns non-zero in that case and 0 otherwise. 0 is also
 * returned when a broken character is met on the way, when pos is
 * negative, or when pos lies beyond the terminator.
 */
static int IsDBCSTrailByte( const char *str, int pos )
{
    int i = 0;
    int n;

    /* A negative index cannot be inside any character. */
    if( pos < 0 )
        return 0;

    /* Walk through from first. */
    while( i < pos )
    {
        n = mblen( str + i, MB_CUR_MAX );

        /* n < 0: broken DBCS character. n == 0: the terminator was
           reached before pos; without this check the walk would never
           advance and loop forever. */
        if( n <= 0 )
            return 0;

        i += n;
    }

    /* Passed a position ? Then it is a trail-byte. For example, if str is
       LTLT, and pos is 1, then the walk stops at index 2, past index 1. */
    return i > pos;
}


Note: MB_CUR_MAX and mblen() are affected by LC_CTYPE, not by the codepage. To change the locale, use setlocale(). Also, the above code does not handle Unicode encodings specially; it treats them as DBCS.


댓글

이 블로그의 인기 게시물

토렌트: < 왕좌의 게임 > 시즌 1 ~ 시즌 8 완결편 마그넷

토렌트: < 스타워즈 > Ep.1 ~ Ep.6 마그넷

토렌트: NGC < 코스모스 > 우리말 더빙 전편(1편~13편) 마그넷