00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015 #ifndef __BYTESTRIE_H__
00016 #define __BYTESTRIE_H__
00017
00023 #include "unicode/utypes.h"
00024 #include "unicode/stringpiece.h"
00025 #include "unicode/uobject.h"
00026 #include "unicode/ustringtrie.h"
00027
00028 U_NAMESPACE_BEGIN
00029
00030 class ByteSink;
00031 class BytesTrieBuilder;
00032 class CharString;
00033 class UVector32;
00034
00048 class U_COMMON_API BytesTrie : public UMemory {
00049 public:
00064 BytesTrie(const void *trieBytes)
00065 : ownedArray_(NULL), bytes_(static_cast<const uint8_t *>(trieBytes)),
00066 pos_(bytes_), remainingMatchLength_(-1) {}
00067
00072 ~BytesTrie();
00073
00080 BytesTrie(const BytesTrie &other)
00081 : ownedArray_(NULL), bytes_(other.bytes_),
00082 pos_(other.pos_), remainingMatchLength_(other.remainingMatchLength_) {}
00083
00089 BytesTrie &reset() {
00090 pos_=bytes_;
00091 remainingMatchLength_=-1;
00092 return *this;
00093 }
00094
00100 class State : public UMemory {
00101 public:
00106 State() { bytes=NULL; }
00107 private:
00108 friend class BytesTrie;
00109
00110 const uint8_t *bytes;
00111 const uint8_t *pos;
00112 int32_t remainingMatchLength;
00113 };
00114
00122 const BytesTrie &saveState(State &state) const {
00123 state.bytes=bytes_;
00124 state.pos=pos_;
00125 state.remainingMatchLength=remainingMatchLength_;
00126 return *this;
00127 }
00128
00139 BytesTrie &resetToState(const State &state) {
00140 if(bytes_==state.bytes && bytes_!=NULL) {
00141 pos_=state.pos;
00142 remainingMatchLength_=state.remainingMatchLength;
00143 }
00144 return *this;
00145 }
00146
00153 UStringTrieResult current() const;
00154
00163 inline UStringTrieResult first(int32_t inByte) {
00164 remainingMatchLength_=-1;
00165 if(inByte<0) {
00166 inByte+=0x100;
00167 }
00168 return nextImpl(bytes_, inByte);
00169 }
00170
00178 UStringTrieResult next(int32_t inByte);
00179
00195 UStringTrieResult next(const char *s, int32_t length);
00196
00206 inline int32_t getValue() const {
00207 const uint8_t *pos=pos_;
00208 int32_t leadByte=*pos++;
00209
00210 return readValue(pos, leadByte>>1);
00211 }
00212
00222 inline UBool hasUniqueValue(int32_t &uniqueValue) const {
00223 const uint8_t *pos=pos_;
00224
00225 return pos!=NULL && findUniqueValue(pos+remainingMatchLength_+1, FALSE, uniqueValue);
00226 }
00227
00236 int32_t getNextBytes(ByteSink &out) const;
00237
00242 class U_COMMON_API Iterator : public UMemory {
00243 public:
00255 Iterator(const void *trieBytes, int32_t maxStringLength, UErrorCode &errorCode);
00256
00268 Iterator(const BytesTrie &trie, int32_t maxStringLength, UErrorCode &errorCode);
00269
00274 ~Iterator();
00275
00281 Iterator &reset();
00282
00287 UBool hasNext() const;
00288
00303 UBool next(UErrorCode &errorCode);
00304
00309 const StringPiece &getString() const { return sp_; }
00314 int32_t getValue() const { return value_; }
00315
00316 private:
00317 UBool truncateAndStop();
00318
00319 const uint8_t *branchNext(const uint8_t *pos, int32_t length, UErrorCode &errorCode);
00320
00321 const uint8_t *bytes_;
00322 const uint8_t *pos_;
00323 const uint8_t *initialPos_;
00324 int32_t remainingMatchLength_;
00325 int32_t initialRemainingMatchLength_;
00326
00327 CharString *str_;
00328 StringPiece sp_;
00329 int32_t maxLength_;
00330 int32_t value_;
00331
00332
00333
00334
00335
00336
00337
00338
00339 UVector32 *stack_;
00340 };
00341
00342 private:
00343 friend class BytesTrieBuilder;
00344
00351 BytesTrie(void *adoptBytes, const void *trieBytes)
00352 : ownedArray_(static_cast<uint8_t *>(adoptBytes)),
00353 bytes_(static_cast<const uint8_t *>(trieBytes)),
00354 pos_(bytes_), remainingMatchLength_(-1) {}
00355
00356
00357 BytesTrie &operator=(const BytesTrie &other);
00358
00359 inline void stop() {
00360 pos_=NULL;
00361 }
00362
00363
00364
00365 static int32_t readValue(const uint8_t *pos, int32_t leadByte);
00366 static inline const uint8_t *skipValue(const uint8_t *pos, int32_t leadByte) {
00367
00368 if(leadByte>=(kMinTwoByteValueLead<<1)) {
00369 if(leadByte<(kMinThreeByteValueLead<<1)) {
00370 ++pos;
00371 } else if(leadByte<(kFourByteValueLead<<1)) {
00372 pos+=2;
00373 } else {
00374 pos+=3+((leadByte>>1)&1);
00375 }
00376 }
00377 return pos;
00378 }
00379 static inline const uint8_t *skipValue(const uint8_t *pos) {
00380 int32_t leadByte=*pos++;
00381 return skipValue(pos, leadByte);
00382 }
00383
00384
00385 static const uint8_t *jumpByDelta(const uint8_t *pos);
00386
00387 static inline const uint8_t *skipDelta(const uint8_t *pos) {
00388 int32_t delta=*pos++;
00389 if(delta>=kMinTwoByteDeltaLead) {
00390 if(delta<kMinThreeByteDeltaLead) {
00391 ++pos;
00392 } else if(delta<kFourByteDeltaLead) {
00393 pos+=2;
00394 } else {
00395 pos+=3+(delta&1);
00396 }
00397 }
00398 return pos;
00399 }
00400
00401 static inline UStringTrieResult valueResult(int32_t node) {
00402 return (UStringTrieResult)(USTRINGTRIE_INTERMEDIATE_VALUE-(node&kValueIsFinal));
00403 }
00404
00405
00406 UStringTrieResult branchNext(const uint8_t *pos, int32_t length, int32_t inByte);
00407
00408
00409 UStringTrieResult nextImpl(const uint8_t *pos, int32_t inByte);
00410
00411
00412
00413
00414 static const uint8_t *findUniqueValueFromBranch(const uint8_t *pos, int32_t length,
00415 UBool haveUniqueValue, int32_t &uniqueValue);
00416
00417
00418 static UBool findUniqueValue(const uint8_t *pos, UBool haveUniqueValue, int32_t &uniqueValue);
00419
00420
00421
00422 static void getNextBranchBytes(const uint8_t *pos, int32_t length, ByteSink &out);
00423 static void append(ByteSink &out, int c);
00424
00425
00426
00427
00428
00429
00430
00431
00432
00433
00434
00435
00436
00437
00438
00439
00440
00441
00442
00443
00444
00445
00446
00447
00448
00449
00450
00451
00452
00453
00454
00455
00456
00457
00458
00459
00460
00461
00462
00463
00464 static const int32_t kMaxBranchLinearSubNodeLength=5;
00465
00466
00467 static const int32_t kMinLinearMatch=0x10;
00468 static const int32_t kMaxLinearMatchLength=0x10;
00469
00470
00471
00472
00473
00474
00475 static const int32_t kMinValueLead=kMinLinearMatch+kMaxLinearMatchLength;
00476
00477 static const int32_t kValueIsFinal=1;
00478
00479
00480 static const int32_t kMinOneByteValueLead=kMinValueLead/2;
00481 static const int32_t kMaxOneByteValue=0x40;
00482
00483 static const int32_t kMinTwoByteValueLead=kMinOneByteValueLead+kMaxOneByteValue+1;
00484 static const int32_t kMaxTwoByteValue=0x1aff;
00485
00486 static const int32_t kMinThreeByteValueLead=kMinTwoByteValueLead+(kMaxTwoByteValue>>8)+1;
00487 static const int32_t kFourByteValueLead=0x7e;
00488
00489
00490 static const int32_t kMaxThreeByteValue=((kFourByteValueLead-kMinThreeByteValueLead)<<16)-1;
00491
00492 static const int32_t kFiveByteValueLead=0x7f;
00493
00494
00495 static const int32_t kMaxOneByteDelta=0xbf;
00496 static const int32_t kMinTwoByteDeltaLead=kMaxOneByteDelta+1;
00497 static const int32_t kMinThreeByteDeltaLead=0xf0;
00498 static const int32_t kFourByteDeltaLead=0xfe;
00499 static const int32_t kFiveByteDeltaLead=0xff;
00500
00501 static const int32_t kMaxTwoByteDelta=((kMinThreeByteDeltaLead-kMinTwoByteDeltaLead)<<8)-1;
00502 static const int32_t kMaxThreeByteDelta=((kFourByteDeltaLead-kMinThreeByteDeltaLead)<<16)-1;
00503
00504 uint8_t *ownedArray_;
00505
00506
00507 const uint8_t *bytes_;
00508
00509
00510
00511
00512 const uint8_t *pos_;
00513
00514 int32_t remainingMatchLength_;
00515 };
00516
00517 U_NAMESPACE_END
00518
00519 #endif // __BYTESTRIE_H__