1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
|
package com.healthmarketscience.jackcess.scsu;
/*
* This sample software accompanies Unicode Technical Report #6 and
* distributed as is by Unicode, Inc., subject to the following:
*
* Copyright © 1996-1998 Unicode, Inc.. All Rights Reserved.
*
* Permission to use, copy, modify, and distribute this software
* without fee is hereby granted provided that this copyright notice
* appears in all copies.
*
* UNICODE, INC. MAKES NO REPRESENTATIONS OR WARRANTIES ABOUT THE
* SUITABILITY OF THE SOFTWARE, EITHER EXPRESS OR IMPLIED, INCLUDING
* BUT NOT LIMITED TO THE IMPLIED WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT.
* UNICODE, INC., SHALL NOT BE LIABLE FOR ANY ERRORS OR OMISSIONS, AND
* SHALL NOT BE LIABLE FOR ANY DAMAGES, INCLUDING CONSEQUENTIAL AND
* INCIDENTAL DAMAGES, SUFFERED BY YOU AS A RESULT OF USING, MODIFYING
* OR DISTRIBUTING THIS SOFTWARE OR ITS DERIVATIVES.
*
* @author Asmus Freytag
*
* @version 001 Dec 25 1996
* @version 002 Jun 25 1997
* @version 003 Jul 25 1997
* @version 004 Aug 25 1997
* @version 005 Sep 30 1998
*
* Unicode and the Unicode logo are trademarks of Unicode, Inc.,
* and are registered in some jurisdictions.
**/
/**
 * Reference decoder for the Standard Compression Scheme for Unicode (SCSU).
 *
 * <H2>Notes on the Java implementation</H2>
 * Java only has a signed byte type, so every input byte that is used as a
 * number is first masked with 0xFF to obtain its unsigned value (0-255).
 * Values of char are between 0x0000 and 0xFFFF in Java and arithmetic on
 * them is unsigned. Extended (supplementary) characters need an int; they
 * are written to the output as a surrogate pair.
 *
 * An instance keeps decoding state (the counters below plus the window
 * state inherited from SCSU); call {@link #reset()} before reusing it for
 * unrelated input.
 **/
public class Expand extends SCSU
{
    /** number of chars appended to the output by the last expansion */
    protected int iOut = 0;

    /** number of input bytes consumed by the last successful expansion */
    protected int iIn = 0;

    /** (re-)define (and select) a dynamic window.
        A sliding window position cannot start at any Unicode value,
        so rather than providing an absolute offset, this function takes
        an index value which selects among the possible starting values.

        Most scripts in Unicode start on or near a half-block boundary
        so the default behaviour is to multiply the index by 0x80. Han,
        Hangul, Surrogates and other scripts between 0x3400 and 0xDFFF
        show very poor locality--therefore no sliding window can be set
        there. A jump offset (gapOffset) is added for index values at or
        above gapThreshold to skip that region.

        Finally, a few scripts straddle half block boundaries. For them, a
        table of fixed offsets is used; index values at or above
        fixedThreshold select from that table.

        After (re-)defining a window's location it is selected so it is
        ready for use. All windows have the same length (128 code positions).

        @param iWindow - index of the window to be (re-)defined
        @param bOffset - index for the new offset value
        @throws IllegalInputException if bOffset is 0 or falls in the
                reserved range between reservedStart and fixedThreshold
    **/
    // @005 protected <-- private here and elsewhere
    protected void defineWindow(int iWindow, byte bOffset)
        throws IllegalInputException
    {
        // recover the unsigned value of the (signed) byte
        int iOffset = bOffset & 0xFF;

        if (iOffset == 0)
        {
            // 0 is a reserved value
            throw new IllegalInputException("iOffset == " + iOffset);
        }
        else if (iOffset < gapThreshold)
        {
            // plain half-block index
            dynamicOffset[iWindow] = iOffset << 7;
        }
        else if (iOffset < reservedStart)
        {
            // half-block index beyond the skipped region
            dynamicOffset[iWindow] = (iOffset << 7) + gapOffset;
        }
        else if (iOffset < fixedThreshold)
        {
            // more reserved values
            throw new IllegalInputException("iOffset == " + iOffset);
        }
        else
        {
            // one of the special, table-driven offsets
            dynamicOffset[iWindow] = fixedOffset[iOffset - fixedThreshold];
        }

        // make the redefined window the active one
        selectWindow(iWindow);
    }

    /** (re-)define (and select) a window as an extended dynamic window.
        The surrogate area in Unicode allows access to 2**20 codes beyond the
        first 64K codes by combining one of 1024 characters from the High
        Surrogate Area with one of 1024 characters from the Low Surrogate
        Area (see Unicode 2.0 for the details).

        The tags SDX and UDX set the window such that each subsequent byte in
        the range 80 to FF represents a surrogate pair. The following diagram
        shows how the bits in the two bytes following the SDX or UDX, and a
        subsequent data byte, map onto the bits in the resulting surrogate
        pair.
        <pre>
         hbyte     lbyte     data
         nnnwwwww  zzzzzyyy  1xxxxxxx

         high-surrogate    low-surrogate
         110110wwwwwzzzzz  110111yyyxxxxxxx
        </pre>
        @param chOffset - the top three bits select the window to redefine;
        the bottom 13 bits, shifted left by 7 and added to 0x10000, give the
        start of the extended window.
    **/
    protected void defineExtendedWindow(char chOffset)
    {
        // The top 3 bits of chOffset are the window index
        int iWindow = chOffset >>> 13;

        // The low 13 bits select a 128-code window above the first 64K codes
        dynamicOffset[iWindow] = ((chOffset & 0x1FFF) << 7) + (1 << 16);

        // make the redefined window the active one
        selectWindow(iWindow);
    }

    /** expand input that is in Unicode mode.
        Processing stops at the first tag that leaves Unicode mode
        (UCn, UDn or UDX) or when the input is exhausted.

        @param in input byte array to be expanded
        @param iCur starting index
        @param sb string buffer to which to append expanded input
        @return the index of the last byte processed (the final byte of the
                tag that ended Unicode mode), or in.length if the input
                ended while still in Unicode mode
        @throws IllegalInputException if a UDn tag carries a reserved offset
        @throws EndOfInputException if the input ends in the middle of a
                character or of a tag's arguments
    **/
    protected int expandUnicode(byte []in, int iCur, StringBuffer sb)
        throws IllegalInputException, EndOfInputException
    {
        int iPos = iCur;
        while (iPos < in.length)
        {
            byte b = in[iPos];

            // UCn is a one-byte tag, so it is valid even as the very last
            // byte of the input; test it before requiring a second byte.
            if (b >= UC0 && b <= UC7)
            {
                Debug.out("SelectWindow: ", b);
                selectWindow(b - UC0);
                return iPos;
            }

            // everything else needs at least one more byte
            int iAvailable = in.length - iPos;
            if (iAvailable < 2)
            {
                throw new EndOfInputException();
            }

            if (b >= UD0 && b <= UD7)
            {
                defineWindow(b - UD0, in[iPos + 1]);
                return iPos + 1;
            }

            if (b == UDX)
            {
                if (iAvailable < 3)
                {
                    throw new EndOfInputException(); // missing argument
                }
                defineExtendedWindow(
                    charFromTwoBytes(in[iPos + 1], in[iPos + 2]));
                return iPos + 2;
            }

            if (b == UQU)
            {
                if (iAvailable < 3)
                {
                    throw new EndOfInputException(); // missing character
                }
                // Skip command byte; the quoted character follows
                iPos++;
            }

            // output a Unicode character (big-endian byte pair)
            sb.append(charFromTwoBytes(in[iPos], in[iPos + 1]));
            iOut++;
            iPos += 2;
        }

        if (iPos == in.length)
        {
            return iPos;
        }

        // Error condition: start index was already beyond the input
        throw new EndOfInputException();
    }

    /** assemble a char from two bytes.
        In Java bytes are signed quantities, while chars are unsigned, so
        both bytes are masked to their unsigned value first.

        @param hi most significant byte
        @param lo least significant byte
        @return the character
    */
    public static char charFromTwoBytes(byte hi, byte lo)
    {
        return (char)(((hi & 0xFF) << 8) | (lo & 0xFF));
    }

    /** Logs a truncated-input condition and creates the exception to throw.
        @param tag name of the tag whose argument is missing
        @param in the input being expanded
        @param iTag index of the tag byte in the input
    */
    private static EndOfInputException prematureEnd(String tag, byte []in,
                                                    int iTag)
    {
        Debug.out(tag + " missing argument: ", in, iTag);
        Debug.out("Length ==" + in.length + " iCur =", iTag);
        return new EndOfInputException();
    }

    /** Decodes one single-byte-mode data byte and appends it to the output.
        Bytes 00..7F are offset by the given static window, bytes 80..FF by
        the given dynamic window; results above 0xFFFF are written as a
        surrogate pair.
    */
    private void appendFromWindow(byte b, int iStaticWindow,
                                  int iDynamicWindow, StringBuffer sb)
    {
        if (b >= 0)
        {
            // use static window
            sb.append((char)(b + staticOffset[iStaticWindow]));
            iOut++;
            return;
        }

        // use dynamic window: strip the high bit to get 00..7F, then offset
        int ch = (b & 0x7F) + dynamicOffset[iDynamicWindow];
        Debug.out("Dynamic: ", (char) ch);

        if (ch < (1 << 16))
        {
            // in the 16-bit range, output directly
            sb.append((char) ch);
            iOut++;
        }
        else
        {
            // this is an extension character
            Debug.out("Extension character: ", ch);

            // translate from 10000..10FFFF to 0..FFFFF
            int iScalar = ch - 0x10000;
            // high surrogate = top 10 bits added to D800
            sb.append((char)(0xD800 + (iScalar >> 10)));
            // low surrogate = bottom 10 bits added to DC00
            sb.append((char)(0xDC00 + (iScalar & 0x3FF)));
            iOut += 2;
        }
    }

    /** expand the input, starting in single byte mode.
        Switches into Unicode mode (via expandUnicode) whenever an SCU tag
        is found and continues in single byte mode when that returns.

        @param in the compressed bytes
        @return the expanded text
        @throws IllegalInputException if a reserved tag or a reserved window
                offset is encountered
        @throws EndOfInputException if the input ends in the middle of a
                tag's arguments or of a character
    **/
    protected String expandSingleByte(byte []in)
        throws IllegalInputException, EndOfInputException
    {
        /* Allocate the output buffer. Because of control codes, generally
           each byte of input results in fewer than one character of
           output. Using in.length as an initial allocation length should
           avoid the need to reallocate in mid-stream. The exception to
           this rule are surrogates. */
        StringBuffer sb = new StringBuffer(in.length);
        iOut = 0;

        // Loop until all input is exhausted; errors leave via exceptions
        for (int iCur = 0; iCur < in.length; iCur++)
        {
            byte b = in[iCur];
            switch (b)
            {
            // Quote a single character from a static/dynamic window pair
            case SQ0:
            case SQ1:
            case SQ2:
            case SQ3:
            case SQ4:
            case SQ5:
            case SQ6:
            case SQ7:
            {
                if (iCur + 1 >= in.length)
                {
                    throw prematureEnd("SQn", in, iCur);
                }
                // the tag selects the window pair for the next byte only
                int iQuoteWindow = b - SQ0;
                Debug.out("SQn:", iQuoteWindow);
                iCur++;
                appendFromWindow(in[iCur], iQuoteWindow, iQuoteWindow, sb);
                break;
            }

            // define a dynamic window as extended
            case SDX:
                if (iCur + 2 >= in.length)
                {
                    throw prematureEnd("SDX", in, iCur);
                }
                defineExtendedWindow(
                    charFromTwoBytes(in[iCur + 1], in[iCur + 2]));
                iCur += 2;
                break;

            // Position a dynamic Window
            case SD0:
            case SD1:
            case SD2:
            case SD3:
            case SD4:
            case SD5:
            case SD6:
            case SD7:
                if (iCur + 1 >= in.length)
                {
                    throw prematureEnd("SDn", in, iCur);
                }
                defineWindow(b - SD0, in[iCur + 1]);
                iCur++;
                break;

            // Select a new dynamic Window
            case SC0:
            case SC1:
            case SC2:
            case SC3:
            case SC4:
            case SC5:
            case SC6:
            case SC7:
                selectWindow(b - SC0);
                break;

            case SCU:
                // switch to Unicode mode and continue parsing; the result
                // is the last byte consumed, so the loop increment moves on
                iCur = expandUnicode(in, iCur + 1, sb);
                break;

            case SQU:
            {
                // directly extract one Unicode character
                if (iCur + 2 >= in.length)
                {
                    throw prematureEnd("SQU", in, iCur);
                }
                char ch = charFromTwoBytes(in[iCur + 1], in[iCur + 2]);
                Debug.out("Quoted: ", ch);
                sb.append(ch);
                iOut++;
                iCur += 2;
                break;
            }

            case Srs:
                // reserved tag
                throw new IllegalInputException(
                    "reserved tag at index " + iCur);

            default:
                // Default behaviour is that ASCII characters are passed
                // through static window 0 and characters with the high bit
                // on are offset by the currently selected dynamic window
                appendFromWindow(b, 0, getCurrentWindow(), sb);
                break;
            }
        }

        // SUCCESS: all input used up
        sb.setLength(iOut);
        // the loop index may overshoot by one after Unicode mode, so
        // report the actual number of bytes consumed
        iIn = in.length;
        return sb.toString();
    }

    /** expand a byte array containing compressed Unicode.
        @param in the compressed bytes
        @return the expanded text
        @throws IllegalInputException if the input contains a reserved tag
                or a reserved window offset
        @throws EndOfInputException if the input is truncated
    */
    public String expand (byte []in)
        throws IllegalInputException, EndOfInputException
    {
        String str = expandSingleByte(in);
        Debug.out("expand output: ", str.toCharArray());
        return str;
    }

    /** reset is called to start with new input, w/o creating a new
        instance; clears the counters and the inherited state */
    public void reset()
    {
        iOut = 0;
        iIn = 0;
        super.reset();
    }

    /** @return the number of chars written by the last expansion */
    public int charsWritten()
    {
        return iOut;
    }

    /** @return the number of input bytes read by the last successful
        expansion */
    public int bytesRead()
    {
        return iIn;
    }
}
|