00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020 #include "substringparser.hpp"
00021
00022 #include <iostream>
00023 #include <iomanip>
00024 #include <string>
00025
00026
00027
00028
00029
00030
00031 SubstringParser::SubstringParser()
00032 {
00033
00034
00035 SetDefaultParameters();
00036
00037 symbolcount=0;
00038 endofstreamsymbolnum=-1;
00039
00040 inputbufferlen=0;
00041 senteos=false;
00042 }
00043
00044 SubstringParser::~SubstringParser()
00045 {
00046
00047 FreeData();
00048 }
00049
00050
00051
00052
00053
00054
00055 bool SubstringParser::SaveState(bitwriter &to,bool fortempdecompressiononly)
00056 {
00057
00058
00059 bool bretv;
00060 bretv=SaveParameters(to);
00061 if (bretv)
00062 bretv=SaveSymbols(to);
00063 return bretv;
00064 }
00065
00066 bool SubstringParser::LoadState(bitreader &from)
00067 {
00068
00069
00070 bool bretv;
00071 bretv=LoadParameters(from);
00072 if (bretv)
00073 bretv=LoadSymbols(from);
00074 return bretv;
00075 }
00076
00077
00078 unsigned int SubstringParser::GetMemoryUsed()
00079 {
00080
00081 unsigned int memoryused=sizeof(this);
00082
00083
00084
00085 memoryused+=sizeof(symbolset);
00086
00087 for (symbolset_pos=symbolset.begin();symbolset_pos!=symbolset.end();++symbolset_pos)
00088 {
00089 memoryused+=sizeof(*symbolset_pos);
00090 memoryused+=(*symbolset_pos)->get_memoryused();
00091 }
00092
00093 for (symbolvector_pos=symbolvector.begin();symbolvector_pos!=symbolvector.end();++symbolvector_pos)
00094 memoryused+=sizeof(*symbolvector_pos);
00095
00096 return memoryused;
00097 }
00098
00099
00100 unsigned int SubstringParser::GetDiskspaceUsed(bool fortempdecompressiononly)
00101 {
00102
00103
00104 unsigned int memoryused=0;
00105
00106
00107 memoryused+=1;
00108
00109
00110 for (symbolset_pos=symbolset.begin();symbolset_pos!=symbolset.end();++symbolset_pos)
00111 {
00112
00113 memoryused+=1;
00114
00115 memoryused+=(unsigned int)(((*symbolset_pos)->get_valuep())->length());
00116
00117 memoryused+=sizeof(unsigned short int);
00118 }
00119
00120
00121 memoryused+=1;
00122 return memoryused;
00123 }
00124
00125
00126
00127
00128 void SubstringParser::ShowDebuggingInfo()
00129 {
00130
00131 using namespace std;
00132 string bitset_string;
00133 string value;
00134
00135
00136 std::cout << " Memory used by SubStringParser: " << GetMemoryUsed() << " bytes ("<< (unsigned int)(GetSetSymbolCount())<<" elements).\n";
00137
00138
00139 if (symbolset.size()>0)
00140 {
00141 cout << "Symbol Set - Sorted Alphabetically ("<<(unsigned int)(symbolset.size())<<" entries)\n";
00142 cout << " " << setiosflags(ios::left) << setw(Parameter_MaxSubstringSize) << "symboltext" << " " << setw(10) << "frequency\n";
00143 cout << " " << setiosflags(ios::left) << setw(Parameter_MaxSubstringSize) << "----------" << " " << setw(10) << "---------\n";
00144 for (symbolset_pos=symbolset.begin();symbolset_pos!=symbolset.end();++symbolset_pos)
00145 {
00146 value=(*symbolset_pos)->get_value();
00147 NiceifyHighAsciiString(value);
00148 cout << " " << setiosflags(ios::left) << setw(Parameter_MaxSubstringSize) << value << " " << setw(10) << (*symbolset_pos)->get_weight() << "\n";
00149 }
00150 cout << "\n";
00151 }
00152 }
00153
00154
00155
00156
00157
00158
00159 string SubstringParser::GetParametersInformation()
00160 {
00161
00162
00163 string returnstring;
00164 cout << setiosflags(ios::left) << setw(17) << "maxbuildsymbols" << setiosflags(ios::left) << " " << setw(10) << Parameter_MaxSymbols_DuringBuild << " " << "parsing dictionary pruned to not exceed this size" << endl;
00165 cout << setiosflags(ios::left) << setw(17) << "maxsymbols" << setiosflags(ios::left) << " " << setw(10) << Parameter_MaxSymbols_Final << " " << "final dictionary pruned to reach this size" << endl;
00166 cout << setiosflags(ios::left) << setw(17) << "maxSubstring" << setiosflags(ios::left) << " " << setw(10) << Parameter_MaxSubstringSize << " " << "max size of Substrings in dictionary" << endl;
00167 cout << setiosflags(ios::left) << setw(17) << "onlywords" << setiosflags(ios::left) << " " << setw(10) << Parameter_OnlyCodeWholeWords << " " << "only add whole word symbols? (bool)" << endl;
00168 cout << setiosflags(ios::left) << setw(17) << "spanwords" << setiosflags(ios::left) << " " << setw(10) << Parameter_SpanWordBoundaries << " " << "encode symbols that span words? (bool)" << endl;
00169 cout << setiosflags(ios::left) << setw(17) << "crends" << setiosflags(ios::left) << " " << setw(10) << Parameter_CountCRsAsEOTs << " " << "count CRs as end-of-stream symbols? (bool)" << endl;
00170 cout << setiosflags(ios::left) << setw(17) << "minweight" << setiosflags(ios::left) << " " << setw(10) << Parameter_PruneMinimumWeight << " " << "minimum threshold weight for final pruning" << endl;
00171 cout << setiosflags(ios::left) << setw(17) << "smartlookup" << setiosflags(ios::left) << " " << setw(10) << Parameter_UseSmartLookup << " " << "when true, use lower_bound stl dictionary search" << endl;
00172 cout << setiosflags(ios::left) << setw(17) << "parsemode" << setiosflags(ios::left) << " " << setw(10) << Parameter_ParseMode << " " << "parse heuristic (0=greedy 1=lff 2=deep&slow)" << endl;
00173 cout << setiosflags(ios::left) << setw(17) << "recalculations" << setiosflags(ios::left) << " " << setw(10) << Parameter_PruneReCalculations << " " << "recalculate symbol costs during pruning?" << endl;
00174 return returnstring;
00175 }
00176
00177
00178 bool SubstringParser::SetParameter(const std::string ¶metername,const std::string ¶metervalue)
00179 {
00180
00181
00182
00183 bool bretv=false;
00184 if (parametername=="maxbuildsymbols")
00185 bretv=ParseParameter(parametervalue,Parameter_MaxSymbols_DuringBuild);
00186 else if (parametername=="maxsymbols")
00187 bretv=ParseParameter(parametervalue,Parameter_MaxSymbols_Final);
00188 else if (parametername=="maxSubstring")
00189 bretv=ParseParameter(parametervalue,Parameter_MaxSubstringSize);
00190 else if (parametername=="onlywords")
00191 bretv=ParseParameter(parametervalue,Parameter_OnlyCodeWholeWords);
00192 else if (parametername=="spanwords")
00193 bretv=ParseParameter(parametervalue,Parameter_SpanWordBoundaries);
00194 else if (parametername=="crends")
00195 bretv=ParseParameter(parametervalue,Parameter_CountCRsAsEOTs);
00196 else if (parametername=="minweight")
00197 bretv=ParseParameter(parametervalue,Parameter_PruneMinimumWeight);
00198 else if (parametername=="smartlookup")
00199 bretv=ParseParameter(parametervalue,Parameter_UseSmartLookup);
00200 else if (parametername=="parsemode")
00201 bretv=ParseParameter(parametervalue,Parameter_ParseMode);
00202 else if (parametername=="recalculations")
00203 bretv=ParseParameter(parametervalue,Parameter_PruneReCalculations);
00204 return bretv;
00205 }
00206
00207
00208 void SubstringParser::SetDefaultParameters()
00209 {
00210
00211
00212
00213
00214 Parameter_MaxSymbols_DuringBuild=200000;
00215
00216
00217
00218 Parameter_MaxSymbols_Final=2000;
00219
00220
00221 Parameter_MaxSubstringSize=12;
00222
00223
00224
00225
00226
00227 Parameter_OnlyCodeWholeWords=false;
00228
00229
00230 Parameter_SpanWordBoundaries=false;
00231
00232
00233 Parameter_PruneMinimumWeight=10;
00234
00235
00236
00237
00238
00239
00240 Parameter_CountCRsAsEOTs=true;
00241
00242
00243 Parameter_UseSmartLookup=true;
00244
00245
00246 Parameter_ParseMode=Dictionary_ParseMode_GREEDY;
00247
00248
00249 Parameter_PruneReCalculations=true;
00250 }
00251
00252
00253
00254
00255
00256
00257
00258 bool SubstringParser::RewindAnyBufferedInput(bitreader &from)
00259 {
00260
00261
00262
00263
00264
00265
00266
00267
00268 from.seek_bit(from.tell_bit()-(inputbufferlen*8));
00269 return true;
00270 }
00271
00272 void SubstringParser::SynchronizeStateForNewStream()
00273 {
00274
00275
00276
00277 inputbufferlen=0;
00278 senteos=false;
00279 }
00280
00281 bool SubstringParser::CreateSymbolSetUsingStream(bitreader &from)
00282 {
00283
00284 string valuestring;
00285 string slidingwindowstring;
00286 unsigned char c;
00287 bool slidingwindowfull=false;
00288 string eotstring=string("");
00289
00290
00291 FreeData_Symbols();
00292
00293 AddPrimitiveCharacterSubstringSymbols();
00294
00295
00296 current_bitreaderp = &from;
00297 start_bitreaderposition = from.tell_bit();
00298
00299
00300 while (!from.empty())
00301 {
00302
00303 c=from.get_byte();
00304
00305
00306 slidingwindowstring+=c;
00307
00308
00309 if (!slidingwindowfull)
00310 {
00311
00312 if ((int)(slidingwindowstring.length())>Parameter_MaxSubstringSize+5)
00313 {
00314
00315 slidingwindowfull=true;
00316 }
00317 }
00318 if (slidingwindowfull)
00319 {
00320
00321 slidingwindowstring.erase(0,1);
00322 }
00323
00324
00325 AddSubstringsFromSlidingWindowString(slidingwindowstring);
00326
00327
00328
00329
00330
00331 if (c==13 && Parameter_CountCRsAsEOTs)
00332 {
00333
00334 UpdateValueStringInSymbolSet(eotstring,1);
00335 }
00336
00337 if (symbolset.size()>Parameter_MaxSymbols_DuringBuild)
00338 {
00339
00340
00341
00342 PruneSymbolSet_ReduceSizeTo(((unsigned int)(symbolset.size())/5)+1);
00343 }
00344 }
00345
00346
00347 PruneSymbolSet();
00348
00349
00350 BuildSymbolVector();
00351
00352
00353 return true;
00354 }
00355
00356
00357
00358
00359
00360
00361
00362 bool SubstringParser::ParseNextSymbolFromInput(bitreader &from, int &symbolnum)
00363 {
00364
00365
00366 unsigned char c;
00367 int maxSubstringlen=Parameter_MaxSubstringSize;
00368 SubstringSymbol *symbolp;
00369
00370
00371
00372 current_bitreaderp=&from;
00373 while (inputbufferlen<maxSubstringlen && !from.empty())
00374 {
00375
00376 c=from.get_byte();
00377 inputbufferstr[inputbufferlen]=c;
00378
00379 ++inputbufferlen;
00380 }
00381
00382 if (inputbufferlen==0)
00383 {
00384
00385 if (senteos)
00386 {
00387
00388 return false;
00389 }
00390 else
00391 {
00392
00393 senteos=true;
00394 }
00395 }
00396
00397
00398
00399
00400 symbolp=FindNextSymbolToEncode(inputbufferstr,inputbufferlen);
00401
00402
00403 symbolnum=symbolp->get_symbolvectorpos();
00404 inputbufferlen=SwallowSymbolFromInputQueueStr(symbolp,inputbufferstr,inputbufferlen,false);
00405 return true;
00406 }
00407
00408
00409 bool SubstringParser::WriteSymbolText(bitwriter &to, int symbolnum,bool &isendofstreamsymbol)
00410 {
00411
00412
00413
00414
00415 SubstringSymbol *symbolp=symbolvector[symbolnum];
00416 int valuelength=(int)((symbolp->get_valuep())->length());
00417 to.write((symbolp->get_valuep())->c_str(),valuelength);
00418
00419
00420 isendofstreamsymbol=(symbolnum==endofstreamsymbolnum);
00421
00422
00423 return true;
00424 }
00425
00426
00427
00428
00429
00430
00431
00432
00433
00434
00435
00436
00437
00438
00439
00440
00441
00442
00443 void SubstringParser::FreeData()
00444 {
00445
00446 FreeData_Symbols();
00447 }
00448
00449 void SubstringParser::FreeData_Symbols()
00450 {
00451
00452
00453
00454 for (symbolset_pos=symbolset.begin();symbolset_pos!=symbolset.end();++symbolset_pos)
00455 {
00456
00457 delete (*symbolset_pos);
00458 }
00459
00460
00461 symbolset.clear();
00462
00463
00464 BuildSymbolVector();
00465 }
00466
00467
00468
00469
00470
00471
00472 bool SubstringParser::SaveParameters(bitwriter &to)
00473 {
00474
00475 to.put(Parameter_MaxSymbols_DuringBuild);
00476 to.put(Parameter_MaxSymbols_Final);
00477 to.put(Parameter_MaxSubstringSize);
00478 to.put(Parameter_OnlyCodeWholeWords);
00479 to.put(Parameter_SpanWordBoundaries);
00480 to.put(Parameter_PruneMinimumWeight);
00481 to.put(Parameter_CountCRsAsEOTs);
00482 to.put(Parameter_UseSmartLookup);
00483 to.put(Parameter_ParseMode);
00484 to.put(Parameter_PruneReCalculations);
00485 return true;
00486 }
00487
00488
00489 bool SubstringParser::LoadParameters(bitreader &from)
00490 {
00491
00492 from.get(Parameter_MaxSymbols_DuringBuild);
00493 from.get(Parameter_MaxSymbols_Final);
00494 from.get(Parameter_MaxSubstringSize);
00495 from.get(Parameter_OnlyCodeWholeWords);
00496 from.get(Parameter_SpanWordBoundaries);
00497 from.get(Parameter_PruneMinimumWeight);
00498 from.get(Parameter_CountCRsAsEOTs);
00499 from.get(Parameter_UseSmartLookup);
00500 from.get(Parameter_ParseMode);
00501 from.get(Parameter_PruneReCalculations);
00502 return true;
00503 }
00504
00505
00506 bool SubstringParser::SaveSymbols(bitwriter &to)
00507 {
00508
00509
00510 unsigned char valuelength;
00511
00512
00513 for (symbolset_pos=symbolset.begin();symbolset_pos!=symbolset.end();++symbolset_pos)
00514 WriteSubstringSymbol((*symbolset_pos),to);
00515
00516
00517
00518 valuelength=255;
00519 to.put(valuelength);
00520
00521
00522 return true;
00523 }
00524
00525
00526 bool SubstringParser::LoadSymbols(bitreader &from)
00527 {
00528
00529 bool bretv=true;
00530 bool bretv2;
00531
00532
00533 while (!from.empty())
00534 {
00535
00536 bretv2=ReadSubstringSymbol(from);
00537 if (!bretv2)
00538 break;
00539 }
00540
00541
00542 BuildSymbolVector();
00543
00544
00545 return bretv;
00546 }
00547
00548
00549
00550
00551
00552 bool SubstringParser::WriteSubstringSymbol(SubstringSymbol *SubstringSymbolp,bitwriter &to)
00553 {
00554
00555
00556 TSubStrParserWeight weightvalue;
00557 unsigned int weightvalue_uint;
00558 unsigned char valuelength;
00559
00560
00561 valuelength=(int)(((*symbolset_pos)->get_valuep())->length());
00562 to.put(valuelength);
00563
00564 to.write(((*symbolset_pos)->get_valuep())->c_str(),valuelength);
00565
00566 weightvalue=(*symbolset_pos)->get_weight();
00567
00568 weightvalue_uint=static_cast<unsigned int>(weightvalue);
00569 to.put(weightvalue_uint);
00570
00571
00572 return true;
00573 }
00574
00575 bool SubstringParser::ReadSubstringSymbol(bitreader &from)
00576 {
00577
00578
00579 unsigned int weightvalue_uint;
00580 unsigned char valuelength;
00581 char valuestr[256];
00582 std::string valuestring;
00583
00584 from.get(valuelength);
00585 if (valuelength>=255)
00586 {
00587
00588 return false;
00589 }
00590
00591 from.read(valuestr,valuelength);
00592 valuestring=string(valuestr,valuelength);
00593
00594 from.get(weightvalue_uint);
00595
00596 AddSymbol(valuestring,(TSubStrParserWeight)weightvalue_uint);
00597 return true;
00598 }
00599
00600
00601
00602
00603
00604
00605
00606 void SubstringParser::AddPrimitiveCharacterSubstringSymbols()
00607 {
00608
00609 string valuestring;
00610 char valuestr[20];
00611
00612 for ( int i = 0 ; i < 256 ; i++ )
00613 {
00614
00615 valuestr[0]=i;valuestr[1]='\0';
00616 valuestring=string(valuestr,1);
00617
00618 UpdateValueStringInSymbolSet(valuestring,1);
00619 }
00620
00621
00622 valuestring="";
00623 UpdateValueStringInSymbolSet(valuestring,1);
00624 }
00625
00626
00627 bool SubstringParser::UpdateValueStringInSymbolSet(const string &stringvalue,int increment)
00628 {
00629
00630
00631 bool bretv=true;
00632
00633
00634 static SubstringSymbol searchSubstringSymbol("",1);
00635
00636 searchSubstringSymbol.set_value(stringvalue);
00637
00638
00639 symbolset_pos=symbolset.find(&searchSubstringSymbol);
00640
00641 if (symbolset_pos==symbolset.end())
00642 {
00643
00644 AddSymbol(stringvalue,increment);
00645 }
00646 else
00647 {
00648
00649 (*symbolset_pos)->increment_weight(increment);
00650 }
00651
00652
00653 return bretv;
00654 }
00655
00656 void SubstringParser::ResetWeights()
00657 {
00658
00659 for (symbolset_pos=symbolset.begin();symbolset_pos!=symbolset.end();++symbolset_pos)
00660 (*symbolset_pos)->set_weight(0);
00661 }
00662
00663
00664
00665
00666
00667
00668
00669 void SubstringParser::AddSubstringsFromSlidingWindowString(string &slidingwindowstring)
00670 {
00671
00672
00673
00674
00675
00676
00677
00678
00679 string valuestring;
00680 bool bretv;
00681 int len=(int)(slidingwindowstring.length());
00682 unsigned char c,c2;
00683 int leftpos,rightpos,curpos;
00684 bool delimiteronborder;
00685
00686
00687 valuestring=slidingwindowstring.substr(len-1,1);
00688 bretv=UpdateValueStringInSymbolSet(valuestring,1);
00689
00690
00691 if (len==1)
00692 return;
00693
00694
00695 delimiteronborder=false;
00696
00697 if (Parameter_OnlyCodeWholeWords)
00698 {
00699
00700 c=slidingwindowstring[len-1];
00701 c2=slidingwindowstring[len-2];
00702 if (len<3)
00703 {
00704
00705 return;
00706 }
00707 if (!IsNonWordCharacter(c) && !IsNonWordCharacter(c2))
00708 {
00709
00710 return;
00711 }
00712 if (IsNonWordCharacter(c) && IsNonWordCharacter(c2))
00713 {
00714
00715 return;
00716 }
00717
00718 rightpos=len-2;
00719 }
00720 else
00721 {
00722
00723 rightpos=len-1;
00724 }
00725
00726
00727 if (IsNonWordCharacter(slidingwindowstring[rightpos]))
00728 {
00729
00730
00731
00732 delimiteronborder=true;
00733 }
00734 else
00735 delimiteronborder=false;
00736
00737
00738 leftpos=rightpos-Parameter_MaxSubstringSize;
00739 if (leftpos<0)
00740 leftpos=0;
00741
00742
00743
00744 for (curpos=rightpos-1;curpos>leftpos;--curpos)
00745 {
00746 c=slidingwindowstring[curpos];
00747 if (delimiteronborder && IsNonWordCharacter(c))
00748 {
00749 if (Parameter_OnlyCodeWholeWords && curpos>leftpos)
00750 {
00751
00752 continue;
00753 }
00754
00755 }
00756 else if (IsNonWordCharacter(c) || delimiteronborder)
00757 {
00758 if (IsNonWordCharacter(c) && IsNonWordCharacter(slidingwindowstring[curpos+1]))
00759 {
00760
00761
00762 continue;
00763 }
00764 if (Parameter_OnlyCodeWholeWords || delimiteronborder)
00765 {
00766
00767 valuestring=slidingwindowstring.substr(curpos+1,rightpos-curpos);
00768
00769 bretv=UpdateValueStringInSymbolSet(valuestring,1);
00770 if (delimiteronborder)
00771 {
00772
00773 break;
00774 }
00775 else if (!Parameter_SpanWordBoundaries)
00776 {
00777
00778 break;
00779 }
00780 else
00781 {
00782
00783 continue;
00784 }
00785 }
00786 else if (!Parameter_SpanWordBoundaries)
00787 {
00788
00789 break;
00790 }
00791 else if (IsNonWordCharacter(c) && !delimiteronborder)
00792 {
00793
00794
00795 continue;
00796 }
00797 }
00798 else if (delimiteronborder)
00799 {
00800
00801 break;
00802 }
00803 else if (Parameter_OnlyCodeWholeWords)
00804 {
00805
00806 continue;
00807 }
00808
00809
00810 valuestring=slidingwindowstring.substr(curpos,(rightpos-curpos)+1);
00811 bretv=UpdateValueStringInSymbolSet(valuestring,1);
00812 }
00813 }
00814
00815
00816
00817
00818
00819
00820
00821
00822 int SubstringParser::SwallowSymbolFromInputQueueStr(SubstringSymbol *symbolp,char *inputqueuestr,int inputqueuestrlen,bool debugmode)
00823 {
00824
00825 if (symbolp==NULL)
00826 {
00827
00828 return -1;
00829 }
00830
00831
00832 if (debugmode)
00833 cout << symbolp->get_value() << "*";
00834
00835
00836 int writepos=0;
00837 for (int pos=(int)((symbolp->get_valuep())->length());pos<inputqueuestrlen;++pos)
00838 {
00839 inputqueuestr[writepos]=inputqueuestr[pos];
00840 ++writepos;
00841 }
00842 inputqueuestrlen=writepos;
00843
00844
00845 return inputqueuestrlen;
00846 }
00847
00848
00849 SubstringSymbol* SubstringParser::FindNextSymbolToEncode(char *inputqueuestr,int inputqueuestrlen)
00850 {
00851
00852
00853
00854 SubstringSymbol *symbolnodep = 0;
00855
00856 if (Parameter_ParseMode==Dictionary_ParseMode_GREEDY || inputqueuestrlen<=2)
00857 {
00858
00859
00860
00861 symbolnodep = FindNextSymbolToEncode_Greedy(inputqueuestr,inputqueuestrlen);
00862 }
00863 else if (Parameter_ParseMode==Dictionary_ParseMode_LFF)
00864 {
00865
00866 int bestindex=inputqueuestrlen;
00867 int bestscore=-1;
00868 for (int index=0;index<=inputqueuestrlen-1;++index)
00869 {
00870
00871
00872 if (bestscore>=inputqueuestrlen-index)
00873 break;
00874 symbolnodep = FindNextSymbolToEncode_Greedy(&inputqueuestr[index],inputqueuestrlen-index);
00875 if (symbolnodep==NULL)
00876 break;
00877 if (bestscore==-1 || symbolnodep->get_valuelen()>bestscore)
00878 {
00879 bestscore=symbolnodep->get_valuelen();
00880 bestindex=index;
00881 }
00882 }
00883
00884 if (bestindex==0)
00885 bestindex=inputqueuestrlen;
00886 symbolnodep = FindNextSymbolToEncode_Greedy(inputqueuestr,bestindex);
00887 }
00888 else if (Parameter_ParseMode==Dictionary_ParseMode_DEEP)
00889 {
00890
00891 double encodecost;
00892 double bestscore;
00893 int longestlen=inputqueuestrlen;
00894 int bestindex=longestlen;
00895 bool firstscore=true;
00896 do
00897 {
00898 bestscore=-1;
00899 longestlen=bestindex;
00900 for (int index=1;index<=longestlen;++index)
00901 {
00902
00903 encodecost=CalculateEncodeCost(inputqueuestr,index);
00904 if (index<longestlen)
00905 encodecost+=CalculateEncodeCost(&inputqueuestr[index],longestlen-index);
00906
00907 if (firstscore || encodecost<=bestscore)
00908 {
00909 bestscore=encodecost;
00910 bestindex=index;
00911 firstscore=false;
00912 }
00913 }
00914 } while (bestindex!=longestlen);
00915
00916 symbolnodep = FindNextSymbolToEncode_Greedy(inputqueuestr,bestindex);
00917 }
00918 else
00919 {
00920
00921 cout << "ERROR: unknown parse mode "<<Parameter_ParseMode<<"."<<endl;
00922 }
00923
00924 return symbolnodep;
00925 }
00926
00927
00928 SubstringSymbol* SubstringParser::FindNextSymbolToEncode_Greedy(char *inputqueuestr,int inputqueuestrlen)
00929 {
00930
00931
00932
00933
00934
00935
00936
00937 int originputqueuestrlen=inputqueuestrlen;
00938 string inputstring;
00939
00940
00941 static SubstringSymbol searchSubstringSymbol("",1);
00942
00943
00944 if (inputqueuestrlen==0)
00945 inputstring="";
00946 else
00947 inputstring=string(inputqueuestr,inputqueuestrlen);
00948
00949
00950 if (inputqueuestrlen>Parameter_MaxSubstringSize)
00951 {
00952 inputqueuestrlen=Parameter_MaxSubstringSize;
00953 inputstring=inputstring.substr(0,inputqueuestrlen);
00954 }
00955
00956
00957 searchSubstringSymbol.set_value(inputstring);
00958
00959
00960 if (Parameter_UseSmartLookup)
00961 {
00962
00963
00964 int len;
00965 int index;
00966 string teststring;
00967 for (;;)
00968 {
00969 symbolset_pos=symbolset.lower_bound(&searchSubstringSymbol);
00970
00971 if (symbolset_pos==symbolset.end())
00972 {
00973
00974 --symbolset_pos;
00975
00976 inputqueuestrlen=1;
00977 inputstring=inputstring.substr(0,inputqueuestrlen);
00978 teststring=(*symbolset_pos)->get_value();
00979 if (inputstring[0]!=teststring[0])
00980 {
00981 cout << "ERROR: Unexpectedly didn't find any match for a string that started out "<<originputqueuestrlen<<" characters (we hit "<<inputqueuestrlen<<"). last char was "<<(int)((unsigned char)(inputqueuestr[0]))<<"."<<endl;
00982 return NULL;
00983 }
00984
00985 searchSubstringSymbol.set_value(inputstring);
00986 break;
00987 }
00988
00989
00990
00991
00992
00993
00994 teststring=(*symbolset_pos)->get_value();
00995 len=(int)(teststring.length());
00996 if (teststring==inputstring)
00997 break;
00998
00999
01000 --symbolset_pos;
01001 teststring=(*symbolset_pos)->get_value();
01002 len=(int)(teststring.length());
01003 if (len<inputqueuestrlen)
01004 {
01005
01006 inputqueuestrlen=len;
01007 inputstring=inputstring.substr(0,inputqueuestrlen);
01008 }
01009
01010 if (teststring==inputstring)
01011 break;
01012
01013
01014 len=(int)(teststring.length());
01015 for (index=0;index<len;++index)
01016 {
01017 if (teststring[index]!=inputstring[index])
01018 break;
01019 }
01020
01021
01022 inputqueuestrlen=index;
01023 if (inputqueuestrlen==0)
01024 {
01025 cout << "ERROR: unexpectedly truncated search string that started out "<<originputqueuestrlen<<" characters (we hit "<<inputqueuestrlen<<"). last char was "<<(int)inputqueuestr[0]<<"."<<endl;
01026 inputstring="";
01027 }
01028 else
01029 inputstring=inputstring.substr(0,inputqueuestrlen);
01030 searchSubstringSymbol.set_value(inputstring);
01031 }
01032 }
01033 else
01034 {
01035
01036 for(;;)
01037 {
01038 symbolset_pos=symbolset.find(&searchSubstringSymbol);
01039 if (symbolset_pos!=symbolset.end())
01040 {
01041
01042 break;
01043 }
01044
01045 --inputqueuestrlen;
01046 if (inputqueuestrlen<=0)
01047 {
01048 cout << "ERROR: unexpectedly didn't find any match for a string that started out with "<<originputqueuestrlen<<" characters (we hit "<<inputqueuestrlen<<"). last char was "<<(int)inputqueuestr[0]<<"."<<endl;
01049 return NULL;
01050 }
01051
01052 inputstring=inputstring.substr(0,inputqueuestrlen);
01053 searchSubstringSymbol.set_value(inputstring);
01054 }
01055 }
01056
01057
01058 return *symbolset_pos;
01059 }
01060
01061
01062 double SubstringParser::CalculateEncodeCost(char *inputqueuestr,int inputqueuestrlen)
01063 {
01064
01065
01066 SubstringSymbol* symbolnodep;
01067 double cost=0;
01068
01069
01070 symbolnodep = FindNextSymbolToEncode_Greedy(inputqueuestr,inputqueuestrlen);
01071 if (symbolnodep==NULL)
01072 {
01073
01074 cout << "$no match found."<<endl;
01075 return 99999;
01076 }
01077
01078
01079 cost+=GetSymbolCost(symbolnodep);
01080
01081
01082 inputqueuestr+=symbolnodep->get_valuelen();
01083 inputqueuestrlen-=symbolnodep->get_valuelen();
01084
01085
01086 if (Parameter_ParseMode!=Dictionary_ParseMode_GREEDY && inputqueuestrlen>2)
01087 {
01088
01089 int bestindex=inputqueuestrlen;
01090 double bestscore=0;
01091 double encodecost;
01092 bool firstscore=true;
01093 for (int index=1;index<=inputqueuestrlen;++index)
01094 {
01095
01096 encodecost=CalculateEncodeCost(inputqueuestr,index);
01097 if (index<inputqueuestrlen)
01098 encodecost+=CalculateEncodeCost(&inputqueuestr[index],inputqueuestrlen-index);
01099
01100 if (firstscore || encodecost<bestscore)
01101 {
01102 bestscore=encodecost;
01103 bestindex=index;
01104 firstscore=false;
01105 }
01106 }
01107
01108 cost+=bestscore;
01109 }
01110 else if (inputqueuestrlen>0)
01111 {
01112
01113 cost+=CalculateEncodeCost(inputqueuestr,inputqueuestrlen);
01114 }
01115 else
01116 {
01117
01118 }
01119
01120
01121 return cost;
01122 }
01123
01124
01125
01126
01127
01128
01129
01130
01131 void SubstringParser::PruneSymbolSet_WeightThreshold(TSubStrParserWeight thresholdweight)
01132 {
01133
01134
01135 TSymbolSetTYPE::iterator nextsymbolpos;
01136 TSubStrParserWeight weight;
01137 int index=0;
01138
01139
01140 for (symbolset_pos=symbolset.begin();symbolset_pos!=symbolset.end();)
01141 {
01142 ++index;
01143 nextsymbolpos=symbolset_pos;
01144 ++nextsymbolpos;
01145
01146 if (!((*symbolset_pos)->dontprune()))
01147 {
01148 weight=(*symbolset_pos)->get_weight();
01149 if (weight<thresholdweight)
01150 {
01151
01152 delete (*symbolset_pos);
01153 symbolset.erase(symbolset_pos);
01154 }
01155 }
01156
01157 symbolset_pos=nextsymbolpos;
01158 }
01159 }
01160
01161
01162 void SubstringParser::PruneSymbolSet_RemoveRedundantPrefixes()
01163 {
01164
01165
01166
01167 TSymbolSetTYPE::iterator nextsymbolpos;
01168 TSymbolSetTYPE::iterator lastsymbolpos;
01169 TSubStrParserWeight weight,lastweight;
01170 string lastvalue;
01171 int lastvaluelen;
01172
01173 lastweight=0;
01174 for (symbolset_pos=symbolset.begin();symbolset_pos!=symbolset.end();)
01175 {
01176 nextsymbolpos=symbolset_pos;
01177 ++nextsymbolpos;
01178
01179 if ((*symbolset_pos)->dontprune())
01180 lastweight=0;
01181 else
01182 {
01183 weight=(*symbolset_pos)->get_weight();
01184 if (weight==lastweight && weight!=0)
01185 {
01186
01187 lastvaluelen=(int)(lastvalue.length());
01188 if (((*symbolset_pos)->get_value()).compare(0,lastvaluelen,lastvalue.c_str(),lastvaluelen)==0)
01189 {
01190
01191 delete *lastsymbolpos;
01192 symbolset.erase(lastsymbolpos);
01193 }
01194 }
01195
01196 lastweight=weight;
01197 lastvalue=(*symbolset_pos)->get_value();
01198 lastsymbolpos=symbolset_pos;
01199 }
01200
01201 symbolset_pos=nextsymbolpos;
01202 }
01203 }
01204
01205
01206 void SubstringParser::PruneSymbolSet_ReduceSizeTo(unsigned int targetsize)
01207 {
01208
01209
01210
01211 if (symbolset.size()<targetsize)
01212 return;
01213
01214 for (int count=2;count<10000;++count)
01215 {
01216
01217 PruneSymbolSet_WeightThreshold((TSubStrParserWeight)count);
01218 if (symbolset.size()<=targetsize)
01219 break;
01220 }
01221 }
01222
01223
01224 void SubstringParser::PruneSymbolSet()
01225 {
01226
01227
01228
01229
01230
01231
01232
01233
01234 cout << endl << "Total symbols in dictionary before pruning: " << GetSetSymbolCount() << endl;
01235
01236
01237 PruneSymbolSet_WeightThreshold(Parameter_PruneMinimumWeight);
01238 cout << "After Stage 1 Pruning (removed scores below "<<Parameter_PruneMinimumWeight<<"), symbols: " << GetSetSymbolCount() << endl;
01239
01240
01241 PruneSymbolSet_RemoveRedundantPrefixes();
01242 cout << "After Stage 2 Pruning (removed redundant prefixes), symbols: " << GetSetSymbolCount() << endl;
01243
01244
01245 if (Parameter_PruneReCalculations)
01246 {
01247
01248 ReParseFileReComputeCosts(true);
01249 PruneSymbolSet_WeightThreshold(Parameter_PruneMinimumWeight);
01250 PruneSymbolSet_RemoveRedundantPrefixes();
01251
01252 cout << "After pre-ceil pre-compression, re-evaluation, and repruning, symbols: " << GetSetSymbolCount() << endl;
01253 }
01254
01255
01256 PruneSymbolSet_ReduceSizeTo(Parameter_MaxSymbols_Final);
01257 cout << "After Stage 3 Pruning (killed worst scores to reach target), symbols: " << GetSetSymbolCount() << endl;
01258
01259
01260 if (Parameter_PruneReCalculations)
01261 {
01262
01263 ReParseFileReComputeCosts(true);
01264 PruneSymbolSet_WeightThreshold(Parameter_PruneMinimumWeight);
01265 PruneSymbolSet_RemoveRedundantPrefixes();
01266 cout << "After re-compression, re-evaluation, and repruning, symbols: " << GetSetSymbolCount() << endl;
01267 }
01268 }
01269
01270
01271
01272
01273
01274
01275
01276
01277 double SubstringParser::ReParseFileReComputeCosts(bool updateweights)
01278 {
01279
01280
01281
01282
01283
01284
01285
01286 BuildSymbolVector();
01287
01288 int symbolnum;
01289 SubstringSymbol *symbolp;
01290 double totalcost=0;
01291
01292
01293 current_bitreaderp->seek_bit(start_bitreaderposition);
01294
01295
01296 if (updateweights)
01297 ResetWeights();
01298
01299
01300
01301 while (ParseNextSymbolFromInput(*current_bitreaderp,symbolnum))
01302 {
01303 symbolp=symbolvector[symbolnum];
01304 if (updateweights)
01305 {
01306
01307 if (updateweights)
01308 {
01309
01310
01311 symbolp->increment_weight(1);
01312
01313 if (Parameter_CountCRsAsEOTs && strchr(symbolp->get_valuep()->c_str(),13)!=NULL)
01314 {
01315 symbolp=FindNextSymbolToEncode_Greedy("",0);
01316 if (symbolp!=NULL)
01317 symbolp->increment_weight(1);
01318 }
01319 }
01320 }
01321 else
01322 totalcost+=symbolp->get_cost();
01323 }
01324
01325
01326 return totalcost;
01327 }
01328
01329
01330
01331
01332
01333
01334 void SubstringParser::BuildSymbolVector()
01335 {
01336
01337
01338 symbolvector.clear();
01339
01340 int index=0;
01341 for (symbolset_pos=symbolset.begin();symbolset_pos!=symbolset.end();++symbolset_pos)
01342 {
01343
01344 symbolvector.push_back(*symbolset_pos);
01345
01346 (*symbolset_pos)->set_symbolvectorpos(index++);
01347 }
01348
01349 symbolcount=(int)(symbolvector.size());
01350
01351 endofstreamsymbolnum=LookupSymbolNumberFromText(string(""));
01352 }
01353
01354
01355 int SubstringParser::LookupSymbolNumberFromText(string wordstring)
01356 {
01357
01358 static SubstringSymbol searchSubstringSymbol("",1);
01359
01360 searchSubstringSymbol.set_value(wordstring);
01361 symbolset_pos=symbolset.find(&searchSubstringSymbol);
01362
01363 if (symbolset_pos==symbolset.end())
01364 return -1;
01365
01366 return ((*symbolset_pos)->get_symbolvectorpos());
01367 }
01368