Octane: parsers/parser_substring/substringparser.cpp Source File

00001 
00002 
00003 
00004 
00005 
00006 
00007 
00008 
00009 
00010 
00011 
00012 
00013 
00014 
00015 
00016 //---------------------------------------------------------------------------
00017 
00018 //---------------------------------------------------------------------------
00019 // Application includes
00020 #include "substringparser.hpp"
00021 // System includes
00022 #include <iostream>
00023 #include <iomanip>
00024 #include <string>
00025 //---------------------------------------------------------------------------
00026 
00027 
00028 
00029 
00030 //---------------------------------------------------------------------------
00031 SubstringParser::SubstringParser()
00032 {
00033         // constructor
00034         // initialize default dictionary-building parameters
00035         SetDefaultParameters();
00036         // initialize our current symbol count and end-of-stream-symbol
00037         symbolcount=0;
00038         endofstreamsymbolnum=-1;
00039         // initialize input buffer pos
00040         inputbufferlen=0;
00041         senteos=false;
00042 }
00043 
00044 SubstringParser::~SubstringParser()
00045 {
00046         // destructor frees the SubstringSymbols in symbolset
00047         FreeData();
00048 }
00049 //---------------------------------------------------------------------------
00050 
00051 
00052 
00053 //---------------------------------------------------------------------------
00054 // OCTANE PUBLIC API - AUXILIARY FUNCTIONS
00055 bool SubstringParser::SaveState(bitwriter &to,bool fortempdecompressiononly)
00056 {
00057         // save state
00058         // return true on success
00059         bool bretv;
00060         bretv=SaveParameters(to);
00061         if (bretv)
00062                 bretv=SaveSymbols(to);
00063         return bretv;
00064 }
00065 
00066 bool SubstringParser::LoadState(bitreader &from)
00067 {
00068         // load state
00069         // return true on success
00070         bool bretv;
00071         bretv=LoadParameters(from);
00072         if (bretv)
00073                 bretv=LoadSymbols(from);
00074         return bretv;
00075 }
00076 
00077 
00078 unsigned int SubstringParser::GetMemoryUsed()
00079         {
00080         // return memory used by this data structure
00081         unsigned int memoryused=sizeof(this);
00082 
00083         // this returns the size of the symbolset, as it would take up on disk if saved in straightforward way
00084         //  memory usage will be almost identical.
00085         memoryused+=sizeof(symbolset);
00086         // write entries in compact binary form
00087         for (symbolset_pos=symbolset.begin();symbolset_pos!=symbolset.end();++symbolset_pos)
00088                 {
00089                 memoryused+=sizeof(*symbolset_pos);
00090                 memoryused+=(*symbolset_pos)->get_memoryused();
00091                 }
00092         // for vector
00093         for (symbolvector_pos=symbolvector.begin();symbolvector_pos!=symbolvector.end();++symbolvector_pos)
00094                 memoryused+=sizeof(*symbolvector_pos);
00095         
00096         return memoryused;
00097         }
00098 
00099 
00100 unsigned int SubstringParser::GetDiskspaceUsed(bool fortempdecompressiononly)
00101 {
00102         // this returns the size of the symbolset, as it would take up on disk if saved in straightforward way
00103         //  memory usage will be almost identical.
00104         unsigned int memoryused=0;
00105 
00106         // byte for file type (see WriteSymbolSet routine)
00107         memoryused+=1;
00108         
00109         // write entries in compact binary form
00110         for (symbolset_pos=symbolset.begin();symbolset_pos!=symbolset.end();++symbolset_pos)
00111                 {
00112                 // length of the string
00113                 memoryused+=1;
00114                 // write string
00115                 memoryused+=(unsigned int)(((*symbolset_pos)->get_valuep())->length());
00116                 // weight of symbol
00117                 memoryused+=sizeof(unsigned short int);
00118                 }
00119 
00120         // end-of-header
00121         memoryused+=1;
00122         return memoryused;
00123 }
00124 
00125 
00126 
00127 
00128 void SubstringParser::ShowDebuggingInfo()
00129 {
00130         // show some debugging info
00131         using namespace std;
00132         string bitset_string;
00133         string value;
00134 
00135         // size of tree is
00136         std::cout << " Memory used by SubStringParser: " << GetMemoryUsed() << " bytes ("<< (unsigned int)(GetSetSymbolCount())<<" elements).\n";
00137         
00138         // show the dictionary sorted alphabetically
00139         if (symbolset.size()>0)
00140                 {
00141                 cout << "Symbol Set - Sorted Alphabetically ("<<(unsigned int)(symbolset.size())<<" entries)\n";
00142                 cout << " " << setiosflags(ios::left) << setw(Parameter_MaxSubstringSize) << "symboltext" << " " << setw(10) << "frequency\n";
00143                 cout << " " << setiosflags(ios::left) << setw(Parameter_MaxSubstringSize) << "----------" << " " << setw(10) << "---------\n";
00144                 for (symbolset_pos=symbolset.begin();symbolset_pos!=symbolset.end();++symbolset_pos)
00145                         {
00146                         value=(*symbolset_pos)->get_value();
00147                         NiceifyHighAsciiString(value);
00148                         cout << " " << setiosflags(ios::left) << setw(Parameter_MaxSubstringSize) << value << " " << setw(10) << (*symbolset_pos)->get_weight() << "\n";
00149                         }
00150                 cout << "\n";
00151                 }
00152 }
00153 //---------------------------------------------------------------------------
00154 
00155 
00156 //---------------------------------------------------------------------------
00157 // OCTANE PUBLIC API - AUXILIARY FUNCTIONS
00158         
00159 string SubstringParser::GetParametersInformation()
00160 {
00161         // show user available variables and their current values
00162         // ATTN: this has not been converted from cout to string
00163         string returnstring;
00164         cout << setiosflags(ios::left) << setw(17) << "maxbuildsymbols" << setiosflags(ios::left) << " " << setw(10) << Parameter_MaxSymbols_DuringBuild << " " << "parsing dictionary pruned to not exceed this size" << endl;
00165         cout << setiosflags(ios::left) << setw(17) << "maxsymbols" << setiosflags(ios::left) << " " << setw(10) << Parameter_MaxSymbols_Final << " " << "final dictionary pruned to reach this size" << endl;
00166         cout << setiosflags(ios::left) << setw(17) << "maxSubstring" << setiosflags(ios::left) << " " << setw(10) << Parameter_MaxSubstringSize << " " << "max size of Substrings in dictionary" << endl;
00167         cout << setiosflags(ios::left) << setw(17) << "onlywords" << setiosflags(ios::left) << " " << setw(10) << Parameter_OnlyCodeWholeWords << " " << "only add whole word symbols? (bool)" << endl;
00168         cout << setiosflags(ios::left) << setw(17) << "spanwords" << setiosflags(ios::left) << " " << setw(10) << Parameter_SpanWordBoundaries << " " << "encode symbols that span words? (bool)" << endl;
00169         cout << setiosflags(ios::left) << setw(17) << "crends" << setiosflags(ios::left) << " " << setw(10) << Parameter_CountCRsAsEOTs << " " << "count CRs as end-of-stream symbols? (bool)" << endl;
00170         cout << setiosflags(ios::left) << setw(17) << "minweight" << setiosflags(ios::left) << " " << setw(10) << Parameter_PruneMinimumWeight << " " << "minimum threshold weight for final pruning" << endl;
00171         cout << setiosflags(ios::left) << setw(17) << "smartlookup" << setiosflags(ios::left) << " " << setw(10) << Parameter_UseSmartLookup << " " << "when true, use lower_bound stl dictionary search" << endl;
00172         cout << setiosflags(ios::left) << setw(17) << "parsemode" << setiosflags(ios::left) << " " << setw(10) << Parameter_ParseMode << " " << "parse heuristic (0=greedy 1=lff 2=deep&slow)" << endl;
00173         cout << setiosflags(ios::left) << setw(17) << "recalculations" << setiosflags(ios::left) << " " << setw(10) << Parameter_PruneReCalculations << " " << "recalculate symbol costs during pruning?" << endl;
00174         return returnstring;
00175 }
00176 
00177 
00178 bool SubstringParser::SetParameter(const std::string &parametername,const std::string &parametervalue)
00179 {
00180         // generic function for user-interactive modification of parameters
00181         // (see SetDefaultDictionaryBuildingParameters implementation for descriptions)
00182         // return true if we know this variable and use it
00183         bool bretv=false;
00184         if (parametername=="maxbuildsymbols")
00185                 bretv=ParseParameter(parametervalue,Parameter_MaxSymbols_DuringBuild);
00186         else if (parametername=="maxsymbols")
00187                 bretv=ParseParameter(parametervalue,Parameter_MaxSymbols_Final);
00188         else if (parametername=="maxSubstring")
00189                 bretv=ParseParameter(parametervalue,Parameter_MaxSubstringSize);
00190         else if (parametername=="onlywords")
00191                 bretv=ParseParameter(parametervalue,Parameter_OnlyCodeWholeWords);
00192         else if (parametername=="spanwords")
00193                 bretv=ParseParameter(parametervalue,Parameter_SpanWordBoundaries);
00194         else if (parametername=="crends")
00195                 bretv=ParseParameter(parametervalue,Parameter_CountCRsAsEOTs);
00196         else if (parametername=="minweight")
00197                 bretv=ParseParameter(parametervalue,Parameter_PruneMinimumWeight);
00198         else if (parametername=="smartlookup")
00199                 bretv=ParseParameter(parametervalue,Parameter_UseSmartLookup);
00200         else if (parametername=="parsemode")
00201                 bretv=ParseParameter(parametervalue,Parameter_ParseMode);
00202         else if (parametername=="recalculations")
00203                 bretv=ParseParameter(parametervalue,Parameter_PruneReCalculations);
00204         return bretv;
00205 }
00206 
00207 
00208 void SubstringParser::SetDefaultParameters()
00209 {
00210         // encoding parameters that govern how dictionary is built
00211 
00212         // the dictionary will never be allowed to get bigger than this size (smaller will also increase building speed)
00213         //  when this size is exceeded, we will eliminate all symbols with frequencies of smaller than some small #
00214         Parameter_MaxSymbols_DuringBuild=200000;
00215 
00216         // after the entire training file is parsed, we will repeat pruning steps to reduce it to a *maximum* of this many entries
00217         // the actual symbols kept may be smaller depending on other pruning parameters
00218         Parameter_MaxSymbols_Final=2000;
00219 
00220         // how many characters is largest (sub)string stored in dictionary
00221         Parameter_MaxSubstringSize=12;
00222         //      Parameter_MaxSubstringSize=3;
00223         //      Parameter_MaxSubstringSize=1;
00224 
00225         // only store characters and whole words?
00226         //      Parameter_OnlyCodeWholeWords=false;
00227         Parameter_OnlyCodeWholeWords=false;
00228 
00229         // store Substrings that span words (i.e. they contain multiple words)
00230         Parameter_SpanWordBoundaries=false;
00231 
00232         // minimum count for a word or Substring in order to even consider leaving it in the dictionary
00233         Parameter_PruneMinimumWeight=10;
00234 
00235         // when encoding single lines, each line must end with an end-of-stream (EOS) symbol
00236         //  but if we train on a file, which is large and has only one EOS at end of file
00237         //  then the freq. of EOS will be very low (1), so here we can say to count a pretend EOS
00238         //  whenever we see a CR.
00239         //Parameter_CountCRsAsEOTs=false;
00240         Parameter_CountCRsAsEOTs=true;
00241 
00242         // we can either use a smart search using stl find_lower_bound() to find longest Substring or we can search starting at biggest string backwards
00243         Parameter_UseSmartLookup=true;
00244 
00245         // we can either use a greedy search to find longest Substring starting with first char or we can use smarter but slower heuristics
00246         Parameter_ParseMode=Dictionary_ParseMode_GREEDY;
00247 
00248         // should we recompute symbol costs during pruning? (this actually employs the dictionary set and calculates actual usage counts and then reprunes)
00249         Parameter_PruneReCalculations=true;
00250 }
00251 //---------------------------------------------------------------------------
00252 
00253 
00254 
00255 //---------------------------------------------------------------------------
00256 // PARSER PUBLIC API - PREPARATIONS FOR PARSING
00257         
00258 bool SubstringParser::RewindAnyBufferedInput(bitreader &from)
00259 {
00260         // let go of any buffered stream - this is an odd function that can be called be compressor if it wants to hand
00261         //  off the input stream to a new parser or otherwise access the input bitstream from after last symbol parsed.
00262         //  it is necessary because some parsers can read-ahead in input buffer, and so must rewind the bitstream.
00263         // we have to implement this since we do a read-ahead.  note this wouldn't be called in normal circumstances,
00264         //  just if compressors wants us to stop prematurely and hand off the job to someone else.
00265         // return true on success
00266 
00267         // the number of characters we have read ahead in buffer is inputbufferlen
00268         from.seek_bit(from.tell_bit()-(inputbufferlen*8));
00269         return true;
00270 }
00271 
00272 void SubstringParser::SynchronizeStateForNewStream()
00273 {
00274         // synchronize state for a new stream
00275         // this MUST be called before beginning a new parsing stream
00276         // reset input buffer and end-of-stream sent flag
00277         inputbufferlen=0;
00278         senteos=false;
00279 }
00280 
00281 bool SubstringParser::CreateSymbolSetUsingStream(bitreader &from)
00282 {
00283         // scan a file and add its "tokens" into our symbol set
00284         string valuestring;
00285         string slidingwindowstring;
00286         unsigned char c;
00287         bool slidingwindowfull=false;
00288         string eotstring=string("");
00289         
00290         // clear any existing symbols
00291         FreeData_Symbols();
00292         // create primitive characters
00293         AddPrimitiveCharacterSubstringSymbols();
00294         
00295         // save the current bitreader and bitreader position for reparsing during pruning
00296         current_bitreaderp = &from;
00297         start_bitreaderposition = from.tell_bit();
00298 
00299         // now parse the file
00300         while (!from.empty())
00301                 {
00302                 // grab a character
00303                 c=from.get_byte();
00304 
00305                 // add the character to our window
00306                 slidingwindowstring+=c;
00307 
00308                 // trim lefthand side if it is too big
00309                 if (!slidingwindowfull)
00310                         {
00311                         // we keep a sliding window which is slightly bigger than the maxSubstring, to account for delimiter boundaries
00312                         if ((int)(slidingwindowstring.length())>Parameter_MaxSubstringSize+5)
00313                                 {
00314                                 // set the flag saying from now on, we are full
00315                                 slidingwindowfull=true;
00316                                 }
00317                         }
00318                 if (slidingwindowfull)
00319                         {
00320                         // trim leftmost character
00321                         slidingwindowstring.erase(0,1);
00322                         }
00323 
00324                 // now add the appropriate Substrings of slidingwindowstring to our dictionary SymbolSet
00325                 AddSubstringsFromSlidingWindowString(slidingwindowstring);
00326 
00327                 // when encoding single lines, each line must end with an end-of-stream (EOS) symbol
00328                 //  but if we train on a file, which is large and has only one EOS at end of file
00329                 //  then the freq. of EOS will be very low (1), so here we can say to count a pretend EOS
00330                 //  whenever we see a CR
00331                 if (c==13 && Parameter_CountCRsAsEOTs)
00332                         {
00333                         // increment frequency for EOS
00334                         UpdateValueStringInSymbolSet(eotstring,1);
00335                         }
00336 
00337                 if (symbolset.size()>Parameter_MaxSymbols_DuringBuild)
00338                         {
00339                         // periodically we may have to prune away lowest frequency symbols in order to keep dictionary manageable
00340                                 // how many should we remove - we don't want to remove just one since that would result in many repetitive calls here
00341                         //  so we just pick some fraction to remove
00342                         PruneSymbolSet_ReduceSizeTo(((unsigned int)(symbolset.size())/5)+1);
00343                         }
00344                 }
00345 
00346         // now final pruning
00347         PruneSymbolSet();
00348 
00349         // and now build the vector for fast random access
00350         BuildSymbolVector();
00351         
00352         // return success
00353         return true;
00354 }
00355 //---------------------------------------------------------------------------
00356 
00357 
00358 
00359 //---------------------------------------------------------------------------
00360 // Parser API
00361 
00362 bool SubstringParser::ParseNextSymbolFromInput(bitreader &from, int &symbolnum)
00363 {
00364         // grab an input stream symbol and set its INDEX (in symbol vector) for symbolnum
00365         // return false after EOS
00366         unsigned char c;
00367         int maxSubstringlen=Parameter_MaxSubstringSize;
00368         SubstringSymbol *symbolp;
00369         // note that inputbufferlen are preserved across calls, and inputbufferstr[] is preserved state info
00370 
00371         // try to fill up the inputqueue to Parameter_MaxSubstringSize size, or as much as we got
00372         current_bitreaderp=&from;
00373         while (inputbufferlen<maxSubstringlen && !from.empty())
00374                 {
00375                 // add the character
00376                 c=from.get_byte();
00377                 inputbufferstr[inputbufferlen]=c;
00378                 // increment character positions
00379                 ++inputbufferlen;
00380                 }
00381         
00382         if (inputbufferlen==0)
00383                 {
00384                 // no more symbols left - BUT the question now is, do we return an EOS symbol, or false for no symbols left
00385                 if (senteos)
00386                         {
00387                         // we already sent an EOS so from now on any requests for a symbol returns false saying no more symbols available
00388                         return false;
00389                         }
00390                 else
00391                         {
00392                         // we are going to drop down to return the EOS signal, but we set flag so we don't do it again
00393                         senteos=true;
00394                         }
00395                 }
00396 
00397         // ok now we have a block of up to Parameter_MaxSubstringSize bytes from the left of the inputstr
00398         // now find the longest leftmost (prefix) Substring in our dictionary and encode it, and shift inputqueuestr to the left with remaining bytes
00399         //  and return the new inputqueuestr with remaining bytes, and return length of remaining bytes.
00400         symbolp=FindNextSymbolToEncode(inputbufferstr,inputbufferlen);
00401 //      symbolp=FindNextSubstringSymbolpFromInputQueueStr(inputbufferstr,inputbufferlen);
00402 
00403         symbolnum=symbolp->get_symbolvectorpos();
00404         inputbufferlen=SwallowSymbolFromInputQueueStr(symbolp,inputbufferstr,inputbufferlen,false);
00405         return true;
00406 }
00407 
00408 
00409 bool SubstringParser::WriteSymbolText(bitwriter &to, int symbolnum,bool &isendofstreamsymbol)
00410 {
00411         // write the symbol indexed by symbolnum
00412         // sets isendofostreamsymbol to true or false depending on if the symbol written is the EOS symbol
00413         // return true on success
00414 
00415         SubstringSymbol *symbolp=symbolvector[symbolnum];
00416         int valuelength=(int)((symbolp->get_valuep())->length());
00417         to.write((symbolp->get_valuep())->c_str(),valuelength);
00418 
00419         // set EOS flag
00420         isendofstreamsymbol=(symbolnum==endofstreamsymbolnum);
00421 
00422         // return success
00423         return true;
00424 }
00425 //---------------------------------------------------------------------------
00426 
00427 
00428 
00429 
00430 
00431 
00432 
00433 
00434 
00435 
00436 
00437 
00438 
00439 
00440 //---------------------------------------------------------------------------
00441 // Internal functions for freeing data structure
00442 
00443 void SubstringParser::FreeData()
00444 {
00445         // free symbolset - this will key any SubstringSymbol nodes
00446         FreeData_Symbols();
00447 }
00448 
00449 void SubstringParser::FreeData_Symbols()
00450 {
00451         // free set,vector and symbols
00452         
00453         // we only want to free the symbols once, even if they are in both set and vector
00454         for (symbolset_pos=symbolset.begin();symbolset_pos!=symbolset.end();++symbolset_pos)
00455                 {
00456                 // delete the pointed to node
00457                 delete (*symbolset_pos);
00458                 }
00459 
00460         // clear main symbolset
00461         symbolset.clear();
00462 
00463         // and now rebuild the vector, which will clear it
00464         BuildSymbolVector();
00465 }
00466 //---------------------------------------------------------------------------
00467 
00468 
00469 //---------------------------------------------------------------------------
00470 // Internal Helper functions for saving and loading state
00471 
00472 bool SubstringParser::SaveParameters(bitwriter &to)
00473 {
00474         // write parameter settings
00475         to.put(Parameter_MaxSymbols_DuringBuild);
00476         to.put(Parameter_MaxSymbols_Final);
00477         to.put(Parameter_MaxSubstringSize);
00478         to.put(Parameter_OnlyCodeWholeWords);
00479         to.put(Parameter_SpanWordBoundaries);
00480         to.put(Parameter_PruneMinimumWeight);
00481         to.put(Parameter_CountCRsAsEOTs);
00482         to.put(Parameter_UseSmartLookup);
00483         to.put(Parameter_ParseMode);
00484         to.put(Parameter_PruneReCalculations);
00485         return true;
00486 }
00487 
00488 
00489 bool SubstringParser::LoadParameters(bitreader &from)
00490 {
00491         // read parameter settings
00492         from.get(Parameter_MaxSymbols_DuringBuild);
00493         from.get(Parameter_MaxSymbols_Final);
00494         from.get(Parameter_MaxSubstringSize);
00495         from.get(Parameter_OnlyCodeWholeWords);
00496         from.get(Parameter_SpanWordBoundaries);
00497         from.get(Parameter_PruneMinimumWeight);
00498         from.get(Parameter_CountCRsAsEOTs);
00499         from.get(Parameter_UseSmartLookup);
00500         from.get(Parameter_ParseMode);
00501         from.get(Parameter_PruneReCalculations);
00502         return true;
00503 }
00504 
00505 
00506 bool SubstringParser::SaveSymbols(bitwriter &to)
00507 {
00508         // write the symboldset to file
00509         // return true on success
00510         unsigned char valuelength;
00511 
00512         // write entries in compact binary form
00513         for (symbolset_pos=symbolset.begin();symbolset_pos!=symbolset.end();++symbolset_pos)
00514                 WriteSubstringSymbol((*symbolset_pos),to);
00515 
00516         // write an end-of-header in case we want to add this header as prefix of another file
00517         // we use a string-length==255 to mean end of table (no strings ever allowed greater than 254 characters)
00518         valuelength=255;
00519         to.put(valuelength);
00520 
00521         // return success
00522         return true;
00523 }
00524 
00525 
00526 bool SubstringParser::LoadSymbols(bitreader &from)
00527 {
00528         // load a previously saved dictionary from file
00529         bool bretv=true;
00530         bool bretv2;
00531 
00532         // read dictionary
00533         while (!from.empty())
00534                 {
00535                 // read length of the string, as an unsigned car
00536                 bretv2=ReadSubstringSymbol(from);
00537                 if (!bretv2)
00538                         break;
00539                 }
00540 
00541         // and now build the vector for fast random access
00542         BuildSymbolVector();
00543 
00544         // return true on success
00545         return bretv;
00546 }
00547 //---------------------------------------------------------------------------
00548 
00549 
00550 //---------------------------------------------------------------------------
00551 // Internal Parsing
00552 bool SubstringParser::WriteSubstringSymbol(SubstringSymbol *SubstringSymbolp,bitwriter &to)
00553 {
00554         // SubstringSymbol reader/writer - derived classes will take these over if necessary
00555         // return true on success
00556         TSubStrParserWeight weightvalue;
00557         unsigned int weightvalue_uint;
00558         unsigned char valuelength;
00559 
00560         // write length of the string, as an unsigned int
00561         valuelength=(int)(((*symbolset_pos)->get_valuep())->length());
00562         to.put(valuelength);
00563         // write string
00564         to.write(((*symbolset_pos)->get_valuep())->c_str(),valuelength);
00565         // get weight of symbol
00566         weightvalue=(*symbolset_pos)->get_weight();
00567         // SubstringSymbols should be writable as unsigned ints, but we dynamic cast in case, which will throw an exception if not
00568         weightvalue_uint=static_cast<unsigned int>(weightvalue);
00569         to.put(weightvalue_uint);
00570 
00571         // return success
00572         return true;
00573 }
00574 
00575 bool SubstringParser::ReadSubstringSymbol(bitreader &from)
00576 {
00577         // virtual SubstringSymbol reader/writer - derived classes will take these over if necessary
00578         // return false when we hit end
00579         unsigned int weightvalue_uint;
00580         unsigned char valuelength;
00581         char valuestr[256];
00582         std::string valuestring;
00583         
00584         from.get(valuelength);
00585         if (valuelength>=255)
00586                 {
00587                 // all done
00588                 return false;
00589                 }
00590         // read string
00591         from.read(valuestr,valuelength);
00592         valuestring=string(valuestr,valuelength);
00593         // read weight
00594         from.get(weightvalue_uint);
00595         // now add the symbol SubstringSymbol
00596         AddSymbol(valuestring,(TSubStrParserWeight)weightvalue_uint);
00597         return true;
00598 }
00599 //---------------------------------------------------------------------------
00600 
00601 
00602 
00603 //---------------------------------------------------------------------------
00604 // Internal symbol construction
00605 
00606 void SubstringParser::AddPrimitiveCharacterSubstringSymbols()
00607         {
00608         // Push every possible character as SubstringSymbol into the set
00609         string valuestring;
00610         char valuestr[20];
00611 
00612         for ( int i = 0 ; i < 256 ; i++ )
00613                 {
00614                 // form the string value of this (just the dictionary character or word)
00615                 valuestr[0]=i;valuestr[1]='\0';
00616                 valuestring=string(valuestr,1);
00617                 // create a new SubstringSymbol node with this character, with a count of 1 (if you try to add a weight of 0 you will mess up the tree since sums must be strictly increasing)
00618                 UpdateValueStringInSymbolSet(valuestring,1);
00619                 }
00620 
00621         // we are going to use the empty string "" as our EOS (or End of Encoding) symbol, and record its number
00622         valuestring="";
00623         UpdateValueStringInSymbolSet(valuestring,1);
00624         }
00625 
00626 
00627 bool SubstringParser::UpdateValueStringInSymbolSet(const string &stringvalue,int increment)
00628 {
00629         // add a symbol as a SubstringSymbol with 0 count
00630         // return false on a FAILURE to add the symbol (ran out of memory)
00631         bool bretv=true;
00632 
00633         // use a static local variable to reduce memory allocation-deallocation thrashing, and use it for searching
00634         static SubstringSymbol searchSubstringSymbol("",1);
00635         // now we use this searchSubstringSymbol for searching
00636         searchSubstringSymbol.set_value(stringvalue);
00637 
00638         // look it up
00639         symbolset_pos=symbolset.find(&searchSubstringSymbol);
00640         // if its not found, create it
00641         if (symbolset_pos==symbolset.end())
00642                 {
00643                 // create the SubstringSymbol and add it, with weight = increment
00644                 AddSymbol(stringvalue,increment);
00645                 }
00646         else
00647                 {
00648                 // we found it, so increment its count
00649                 (*symbolset_pos)->increment_weight(increment);
00650                 }
00651 
00652         // return false on failure
00653         return bretv;
00654 }
00655 
00656 void SubstringParser::ResetWeights()
00657 {
00658         // reset weights for every symbol
00659         for (symbolset_pos=symbolset.begin();symbolset_pos!=symbolset.end();++symbolset_pos)
00660                 (*symbolset_pos)->set_weight(0);
00661 }
00662 //---------------------------------------------------------------------------
00663 
00664 
00665 
00666 //---------------------------------------------------------------------------
00667 // Internal symbol construction
00668 
00669 void SubstringParser::AddSubstringsFromSlidingWindowString(string &slidingwindowstring)
00670 {
00671         // According to our parameters, increment frequency counts (adding symbols when needed) of some Substrings from our sliding window
00672         // The way we do this is, our primary character is the rightmost one; thats the one that will be different on each call.
00673         // If we want to do standard huffman on characters, we would simply update the frequency of the rightmost character.
00674         // Alternatively, we have various other parameters which can govern which Substrings we track.
00675         // Some of many settings can generate HUGE trees and sets, but these trees will eventually be pruned of least-useful entries.
00676         // Note this procedure is the most convoluted one in the file, because it deals with various parameters
00677         //  which regulate what kinds of Substrings from the input stream get tokenized.
00678 
00679         string valuestring;
00680         bool bretv;
00681         int len=(int)(slidingwindowstring.length());
00682         unsigned char c,c2;
00683         int leftpos,rightpos,curpos;
00684         bool delimiteronborder;
00685 
00686         // first add the rightmost character, by incrementing its usage count by 1
00687         valuestring=slidingwindowstring.substr(len-1,1);
00688         bretv=UpdateValueStringInSymbolSet(valuestring,1);
00689 
00690         // if the string length is only one character, then we are done.
00691         if (len==1)
00692                 return;
00693 
00694         // reset flag which helps us prevent from adding a keyword which ends in a delimiter
00695         delimiteronborder=false;
00696 
00697         if (Parameter_OnlyCodeWholeWords)
00698                 {
00699                 // we only want to store full words, so if we have not hit a new word boundary, then return
00700                 c=slidingwindowstring[len-1];
00701                 c2=slidingwindowstring[len-2];
00702                 if (len<3)
00703                         {
00704                         // too small to be on a word boundary
00705                         return;
00706                         }
00707                 if (!IsNonWordCharacter(c) && !IsNonWordCharacter(c2))
00708                         {
00709                         // we are within a word
00710                         return;
00711                         }
00712                 if (IsNonWordCharacter(c) && IsNonWordCharacter(c2))
00713                         {
00714                         // we are within a word
00715                         return;
00716                         }
00717                 // start end pointer at last non-delimiter
00718                 rightpos=len-2;
00719                 }
00720         else
00721                 {
00722                 // start end position at last character
00723                 rightpos=len-1;
00724                 }
00725 
00726         // set flag to tell us whether the new character is a delimiter and shouldn't be coded on the end of a real word
00727         if (IsNonWordCharacter(slidingwindowstring[rightpos]))
00728                 {
00729                 // other than adding delimiters as their own characters (done above), we don't want to code them as part of a word
00730                 // so we set a flag here saying that we are only going to move leftward to catch a sequence of delimiters as a Substring, and NOT
00731                 // allows a Substring which has delimiters on the *borders* of words
00732                 delimiteronborder=true;
00733                 }
00734         else
00735                 delimiteronborder=false;
00736 
00737         // set left pos
00738         leftpos=rightpos-Parameter_MaxSubstringSize;
00739         if (leftpos<0)
00740                 leftpos=0;
00741 
00742         // start walking backwards from right to left and add Substrings to dictionary
00743         //  always encoding from curpos to end (rightpos)
00744         for (curpos=rightpos-1;curpos>leftpos;--curpos)
00745                 {
00746                 c=slidingwindowstring[curpos];
00747                 if (delimiteronborder && IsNonWordCharacter(c))
00748                         {
00749                         if (Parameter_OnlyCodeWholeWords && curpos>leftpos)
00750                                 {
00751                                 // since we only code words, we don't want to code this Substring of delimiters yet
00752                                 continue;
00753                                 }
00754                         // because we are parsing a string of delimiters, we want to just drop through to write out this Substring
00755                         }
00756                 else if (IsNonWordCharacter(c) || delimiteronborder)
00757                         {
00758                         if (IsNonWordCharacter(c) && IsNonWordCharacter(slidingwindowstring[curpos+1]))
00759                                 {
00760                                 // we have hit a series of delimiters to the left of a word, so we've already encoded the non-delimiter-bordered stuff to our right
00761                                 // so we don't want to encode it again.
00762                                 continue;
00763                                 }
00764                         if (Parameter_OnlyCodeWholeWords || delimiteronborder)
00765                                 {
00766                                 // we found a new word (or sequence of words), so encode it
00767                                 valuestring=slidingwindowstring.substr(curpos+1,rightpos-curpos);
00768                                 //cout << "type 1 word: "<<valuestring<< endl;
00769                                 bretv=UpdateValueStringInSymbolSet(valuestring,1);
00770                                 if (delimiteronborder)
00771                                         {
00772                                         // we just finished writing out a string of delimiters bordered on left by non-delimiter, so we stop now
00773                                         break;
00774                                         }
00775                                 else if (!Parameter_SpanWordBoundaries)
00776                                         {
00777                                         // since we don't span word boundaries, we are done
00778                                         break;
00779                                         }
00780                                 else
00781                                         {
00782                                         // let it continue to look for next whole word
00783                                         continue;
00784                                         }
00785                                 }
00786                         else if (!Parameter_SpanWordBoundaries)
00787                                 {
00788                                 // we hit a word boundary, so we are done
00789                                 break;
00790                                 }
00791                         else if (IsNonWordCharacter(c) && !delimiteronborder)
00792                                 {
00793                                 // we don't want to let any word Substring start with a delimiter (or end with one, which is checked above)
00794                                 // actually i think by the time we drop down to here this is always true
00795                                 continue;
00796                                 }
00797                         }
00798                 else if (delimiteronborder)
00799                         {
00800                         // our rightmost character was a delimiter so we wont consider any non-delimiters words to the left of it
00801                         break;
00802                         }
00803                 else if (Parameter_OnlyCodeWholeWords)
00804                         {
00805                         // we are in the middle of a word, so we do nothing
00806                         continue;
00807                         }
00808 
00809                 // update the Substring
00810                 valuestring=slidingwindowstring.substr(curpos,(rightpos-curpos)+1);
00811                 bretv=UpdateValueStringInSymbolSet(valuestring,1);
00812                 }
00813 }
00814 //---------------------------------------------------------------------------
00815 
00816 
00817 
00818 
00819 //---------------------------------------------------------------------------
00820 // Internal Parsing
00821 
00822 int SubstringParser::SwallowSymbolFromInputQueueStr(SubstringSymbol *symbolp,char *inputqueuestr,int inputqueuestrlen,bool debugmode)
00823 {
00824         // find which symbol to encode
00825         if (symbolp==NULL)
00826                 {
00827                 // no match found
00828                 return -1;
00829                 }
00830 
00831         // debugmode shows the sub-string used for encoding
00832         if (debugmode)
00833                 cout << symbolp->get_value() << "*";
00834 
00835         // shift inputqueuestrlen
00836         int writepos=0;
00837         for (int pos=(int)((symbolp->get_valuep())->length());pos<inputqueuestrlen;++pos)
00838                 {
00839                 inputqueuestr[writepos]=inputqueuestr[pos];
00840                 ++writepos;
00841                 }
00842         inputqueuestrlen=writepos;
00843 
00844         // return length of shifted inputqueuestr
00845         return inputqueuestrlen;
00846 }
00847 
00848 
00849 SubstringSymbol* SubstringParser::FindNextSymbolToEncode(char *inputqueuestr,int inputqueuestrlen)
00850 {
00851         // ok now we have a block of up to Parameter_MaxSubstringSize bytes from the left of the inputstr
00852         // now find the longest leftmost (prefix) Substring in our dictionary and encode it, and shift inputqueuestr to the left with remaining bytes.
00853         // return the new inputqueuestr with remaining bytes, and return length of remaining bytes.
00854         SubstringSymbol *symbolnodep = 0;
00855 
00856         if (Parameter_ParseMode==Dictionary_ParseMode_GREEDY || inputqueuestrlen<=2)
00857                 {
00858                 // GREEDY PARSE
00859                 // find biggest matching prefix in symbol dictionary - this should never fail unless the dictionary is not built at all,
00860                 //  since the dictionary is supposed to contain every character, so at least the leftmost character should match
00861                 symbolnodep = FindNextSymbolToEncode_Greedy(inputqueuestr,inputqueuestrlen);
00862                 }
00863         else if (Parameter_ParseMode==Dictionary_ParseMode_LFF)
00864                 {
00865                 // SMARTER HEURISTIC PARSE (slower than greedy, but *maybe* better)
00866                 int bestindex=inputqueuestrlen;
00867                 int bestscore=-1;
00868                 for (int index=0;index<=inputqueuestrlen-1;++index)
00869                         {
00870                         // find length of the string beginning at index
00871                         // early stop if we know we cant do better
00872                         if (bestscore>=inputqueuestrlen-index)
00873                                 break;
00874                         symbolnodep = FindNextSymbolToEncode_Greedy(&inputqueuestr[index],inputqueuestrlen-index);
00875                         if (symbolnodep==NULL)
00876                                 break;
00877                         if (bestscore==-1 || symbolnodep->get_valuelen()>bestscore)
00878                                 {
00879                                 bestscore=symbolnodep->get_valuelen();
00880                                 bestindex=index;
00881                                 }
00882                         }
00883                 // now that we found the longest Substring, parse the string BEFORE it
00884                 if (bestindex==0)
00885                         bestindex=inputqueuestrlen;
00886                 symbolnodep = FindNextSymbolToEncode_Greedy(inputqueuestr,bestindex);
00887                 }
00888         else if (Parameter_ParseMode==Dictionary_ParseMode_DEEP)
00889                 {
00890                 // SMARTER HEURISTIC PARSE (slower than greedy, but *maybe* better)
00891                 double encodecost;
00892                 double bestscore;
00893                 int longestlen=inputqueuestrlen;
00894                 int bestindex=longestlen;
00895                 bool firstscore=true;
00896                 do
00897                         {
00898                         bestscore=-1;
00899                         longestlen=bestindex;
00900                         for (int index=1;index<=longestlen;++index)
00901                                 {
00902                                 // find cost if we encode based on split at index
00903                                 encodecost=CalculateEncodeCost(inputqueuestr,index);
00904                                 if (index<longestlen)
00905                                 encodecost+=CalculateEncodeCost(&inputqueuestr[index],longestlen-index);
00906                                 // update our tracking of best score
00907                                 if (firstscore || encodecost<=bestscore)
00908                                         {
00909                                         bestscore=encodecost;
00910                                         bestindex=index;
00911                                         firstscore=false;
00912                                         }
00913                                 }
00914                         } while (bestindex!=longestlen);
00915                 // and do the search with this limit (note we could have asked CalculateEncodeCost to return best found first match
00916                 symbolnodep = FindNextSymbolToEncode_Greedy(inputqueuestr,bestindex);
00917                 }
00918         else
00919                 {
00920                 // unknown parse mode
00921                 cout << "ERROR: unknown parse mode "<<Parameter_ParseMode<<"."<<endl;
00922                 }
00923 
00924         return symbolnodep;
00925         }
00926 
00927 
00928 SubstringSymbol* SubstringParser::FindNextSymbolToEncode_Greedy(char *inputqueuestr,int inputqueuestrlen)
00929 {
00930         // find biggest matching prefix in symbol dictionary - this should never fail unless the dictionary is not built at all,
00931         //  since the dictionary is supposed to contain every character, so at least the leftmost character should match
00932         // return a pointer to the symbol, or NULL if there is some error and we cant find one.
00933 
00934         // there are several ways to do this dictionary search within the SymbolSet, which is a binary tree
00935         // one way is to start with full string and gradually shorten it until we get an exact match
00936         // another is to ask the stl set to find the nearest match, and search back from there for shortest exact match
00937         int originputqueuestrlen=inputqueuestrlen;
00938         string inputstring;
00939 
00940         // use a static local variable to reduce memory allocation-deallocation thrashing
00941         static SubstringSymbol searchSubstringSymbol("",1);
00942 
00943         // convert char * to string, using exactly inputqueuestrlen, regardless of any '\0'
00944         if (inputqueuestrlen==0)
00945                 inputstring="";
00946         else
00947                 inputstring=string(inputqueuestr,inputqueuestrlen);
00948 
00949         // truncate it to max length of any symbol, incase caller sometimes passes more
00950         if (inputqueuestrlen>Parameter_MaxSubstringSize)
00951                 {
00952                 inputqueuestrlen=Parameter_MaxSubstringSize;
00953                 inputstring=inputstring.substr(0,inputqueuestrlen);
00954                 }
00955 
00956         // now we use this searchSubstringSymbol for searching
00957         searchSubstringSymbol.set_value(inputstring);
00958 
00959         // find the longest Substring, using a smarter use of stl lower_bound instead of find()
00960         if (Parameter_UseSmartLookup)
00961                 {
00962                 // smarter method (22:00 seconds on a test, about 3x faster than slow method below
00963                 //  ATTN: can we simplify this?
00964                 int len;
00965                 int index;
00966                 string teststring;
00967                 for (;;)
00968                         {
00969                         symbolset_pos=symbolset.lower_bound(&searchSubstringSymbol);
00970                         // check for error
00971                         if (symbolset_pos==symbolset.end())
00972                                 {
00973                                 // nothing smaller so the LAST symbol, single character (we verify to make sure there isnt an error)
00974                                 --symbolset_pos;
00975                                 // shorten main string to match
00976                                 inputqueuestrlen=1;
00977                                 inputstring=inputstring.substr(0,inputqueuestrlen);
00978                                 teststring=(*symbolset_pos)->get_value();
00979                                 if (inputstring[0]!=teststring[0])
00980                                         {
00981                                         cout << "ERROR: Unexpectedly didn't find any match for a string that started out "<<originputqueuestrlen<<" characters (we hit "<<inputqueuestrlen<<").  last char was "<<(int)((unsigned char)(inputqueuestr[0]))<<"."<<endl;
00982                                         return NULL;
00983                                         }
00984                                 // ok the best match is the last symbol
00985                                 searchSubstringSymbol.set_value(inputstring);
00986                                 break;
00987                                 }
00988 
00989                         //teststring=(*symbolset_pos)->get_value();
00990                         //len=(int)(teststring.length());
00991                         //cout << "lowerbound for '"<<inputstring<<"' = '" << teststring <<"' (len="<<len<<")."<<endl;;
00992 
00993                         // if we have an exact match we are done
00994                         teststring=(*symbolset_pos)->get_value();
00995                         len=(int)(teststring.length());
00996                         if (teststring==inputstring)
00997                                 break;
00998 
00999                         // try one previous
01000                         --symbolset_pos;
01001                         teststring=(*symbolset_pos)->get_value();
01002                         len=(int)(teststring.length());
01003                         if (len<inputqueuestrlen)
01004                                 {
01005                                 // shorten main string to match
01006                                 inputqueuestrlen=len;
01007                                 inputstring=inputstring.substr(0,inputqueuestrlen);
01008                                 }
01009                         // if we have an exact match we are done
01010                         if (teststring==inputstring)
01011                                 break;
01012 
01013                         // if not, find the last character they agree on and find next bound
01014                         len=(int)(teststring.length());
01015                         for (index=0;index<len;++index)
01016                                 {
01017                                 if (teststring[index]!=inputstring[index])
01018                                         break;
01019                                 }
01020 
01021                         // now shorten it and try again
01022                     inputqueuestrlen=index;
01023                     if (inputqueuestrlen==0)
01024                                 {
01025                                 cout << "ERROR: unexpectedly truncated search string that started out "<<originputqueuestrlen<<" characters (we hit "<<inputqueuestrlen<<").  last char was "<<(int)inputqueuestr[0]<<"."<<endl;
01026                         inputstring="";
01027                         }
01028                         else
01029                                 inputstring=inputstring.substr(0,inputqueuestrlen);
01030                         searchSubstringSymbol.set_value(inputstring);
01031                         }
01032                 }
01033         else
01034                 {
01035                 // dumb method (67:00 seconds on a test, about 3x slower than smart method)
01036                 for(;;)
01037                         {
01038                         symbolset_pos=symbolset.find(&searchSubstringSymbol);
01039                         if (symbolset_pos!=symbolset.end())
01040                                 {
01041                                 // we found an exact match
01042                                 break;
01043                                 }
01044                         // not found, so shorten string by removing rightmost character
01045                         --inputqueuestrlen;
01046                         if (inputqueuestrlen<=0)
01047                                 {
01048                                 cout << "ERROR: unexpectedly didn't find any match for a string that started out with "<<originputqueuestrlen<<" characters (we hit "<<inputqueuestrlen<<").  last char was "<<(int)inputqueuestr[0]<<"."<<endl;
01049                                 return NULL;
01050                                 }
01051                         // decrease length of Substring currently being searched for and continue
01052                         inputstring=inputstring.substr(0,inputqueuestrlen);
01053                         searchSubstringSymbol.set_value(inputstring);
01054                         }
01055                 }
01056 
01057         // return the pointer
01058         return *symbolset_pos;
01059 }
01060 
01061 
01062 double SubstringParser::CalculateEncodeCost(char *inputqueuestr,int inputqueuestrlen)
01063 {
01064         // try to find the optimal cost of encoding the string, using currently selected methods
01065         //  and return its cost, ASSUMING that we start by parsing longest matching leftmost substr
01066         SubstringSymbol* symbolnodep;
01067         double cost=0;
01068 
01069         // start by greedy parse of biggest leftmost substr
01070         symbolnodep = FindNextSymbolToEncode_Greedy(inputqueuestr,inputqueuestrlen);
01071         if (symbolnodep==NULL)
01072                 {
01073                 // no match found - this should never happen
01074                 cout << "$no match found."<<endl;
01075                 return 99999;
01076                 }
01077 
01078         // add cost of initial Substring
01079         cost+=GetSymbolCost(symbolnodep);
01080 
01081         // advance pointer
01082         inputqueuestr+=symbolnodep->get_valuelen();
01083         inputqueuestrlen-=symbolnodep->get_valuelen();
01084 
01085         // now the remainder of the string could be done smart or greedy
01086         if (Parameter_ParseMode!=Dictionary_ParseMode_GREEDY && inputqueuestrlen>2)
01087                 {
01088                 // SMARTER HEURISTIC PARSE (slower than greedy, but *maybe* better)
01089                 int bestindex=inputqueuestrlen;
01090                 double bestscore=0;
01091                 double encodecost;
01092                 bool firstscore=true;
01093                 for (int index=1;index<=inputqueuestrlen;++index)
01094                         {
01095                         // find cost if we encode based on split at index
01096                         encodecost=CalculateEncodeCost(inputqueuestr,index);
01097                         if (index<inputqueuestrlen)
01098                         encodecost+=CalculateEncodeCost(&inputqueuestr[index],inputqueuestrlen-index);
01099                         // update our tracking of best score
01100                         if (firstscore || encodecost<bestscore)
01101                                 {
01102                                 bestscore=encodecost;
01103                                 bestindex=index;
01104                                 firstscore=false;
01105                                 }
01106                         }
01107                 // now set cost to best cost found
01108                 cost+=bestscore;
01109                 }
01110         else if (inputqueuestrlen>0)
01111                 {
01112                 // GREEDY PARSE cost
01113                 cost+=CalculateEncodeCost(inputqueuestr,inputqueuestrlen);
01114                 }
01115         else
01116                 {
01117                 // nothing left to score
01118                 }
01119 
01120         // now return cost
01121         return cost;
01122 }
01123 //---------------------------------------------------------------------------
01124 
01125 
01126 
01127 
01128 //---------------------------------------------------------------------------
01129 // Internal Pruning functions
01130 
01131 void SubstringParser::PruneSymbolSet_WeightThreshold(TSubStrParserWeight thresholdweight)
01132 {
01133         // remove all weights below some threshold set by parameter
01134         // this is a fast and easy step that will remove almost the entire current symbol set :)
01135         TSymbolSetTYPE::iterator nextsymbolpos;
01136         TSubStrParserWeight weight;
01137         int index=0;
01138 
01139         // First simplest pruning operation - simply remove all symbols which don't have some threshold count
01140         for (symbolset_pos=symbolset.begin();symbolset_pos!=symbolset.end();)
01141                 {
01142                 ++index;
01143                 nextsymbolpos=symbolset_pos;
01144                 ++nextsymbolpos;
01145                 // only show non-zero weights (i.e. symbols used at least once)
01146                 if (!((*symbolset_pos)->dontprune()))
01147                         {
01148                         weight=(*symbolset_pos)->get_weight();
01149                         if (weight<thresholdweight)
01150                                 {
01151                                 // delete this symbol
01152                                 delete (*symbolset_pos);
01153                                 symbolset.erase(symbolset_pos);
01154                                 }
01155                         }
01156                 // advance pointer
01157                 symbolset_pos=nextsymbolpos;
01158                 }
01159 }
01160 
01161 
01162 void SubstringParser::PruneSymbolSet_RemoveRedundantPrefixes()
01163 {
01164         // some Substring symbols are simply prefixes of others, with no independent occurrences
01165         //  for example the Substring "supercomput" is a redundant Substring of "supercomputer"
01166         //  so we can remove those with no cost.
01167         TSymbolSetTYPE::iterator nextsymbolpos;
01168         TSymbolSetTYPE::iterator lastsymbolpos;
01169         TSubStrParserWeight weight,lastweight;
01170         string lastvalue;
01171         int lastvaluelen;
01172 
01173         lastweight=0;
01174         for (symbolset_pos=symbolset.begin();symbolset_pos!=symbolset.end();)
01175                 {
01176                 nextsymbolpos=symbolset_pos;
01177                 ++nextsymbolpos;
01178                 // some symbols (like primitive characters, cannot be pruned)
01179                 if ((*symbolset_pos)->dontprune())
01180                         lastweight=0;
01181                 else
01182                         {
01183                         weight=(*symbolset_pos)->get_weight();
01184                         if (weight==lastweight && weight!=0)
01185                                 {
01186                                 // got a candidate for pruning, IFF our prior string is a proper prefix of us
01187                                 lastvaluelen=(int)(lastvalue.length());
01188                                 if (((*symbolset_pos)->get_value()).compare(0,lastvaluelen,lastvalue.c_str(),lastvaluelen)==0)
01189                                         {
01190                                         // yes it is, so kill our last value
01191                                         delete *lastsymbolpos;
01192                                         symbolset.erase(lastsymbolpos);
01193                                         }
01194                                 }
01195                         // remember last symbol details
01196                         lastweight=weight;
01197                         lastvalue=(*symbolset_pos)->get_value();
01198                         lastsymbolpos=symbolset_pos;
01199                         }
01200                 // advance pointer
01201                 symbolset_pos=nextsymbolpos;
01202                 }
01203 }
01204 
01205 
01206 void SubstringParser::PruneSymbolSet_ReduceSizeTo(unsigned int targetsize)
01207 {
01208         // we have exceeded our maximum # of symbols allowed during build process, so reduce the size
01209         //  this is done quite inefficiently, but we can worry about that later, or not, since its not in a time-critical place
01210         // ATTN: a smarter way would be to sort by weight and pop off smallest weights.
01211         if (symbolset.size()<targetsize)
01212                 return;
01213 
01214         for (int count=2;count<10000;++count)
01215                 {
01216                 // remove all removable symbols with a weight < count and then check set size again
01217                 PruneSymbolSet_WeightThreshold((TSubStrParserWeight)count);
01218                 if (symbolset.size()<=targetsize)
01219                         break;
01220                 }
01221 }
01222 
01223 
01224 void SubstringParser::PruneSymbolSet()
01225 {
01226         // we need to reduce the size of the dictionary SymbolSet by pruning symbols we think are not worth the coding space
01227         //  having silly symbols increases our cost in two main ways:
01228         //   1) cost in memory needed to store the dictionary in memory and time needed to search dictionary
01229         //   2) cost incurred due to the fact that the more words in dictionary, the more bits needed to encode other symbols.
01230         //      this is a very serious issue and suggests the dictionary should be as small as possible.
01231         // the bitreader is passed so that we can reparse the input in order to test our created symbol set.
01232 
01233         // display info
01234         cout << endl << "Total symbols in dictionary before pruning: " << GetSetSymbolCount() << endl;
01235 
01236         // remove weights below a threshold
01237         PruneSymbolSet_WeightThreshold(Parameter_PruneMinimumWeight);
01238         cout << "After Stage 1 Pruning (removed scores below "<<Parameter_PruneMinimumWeight<<"), symbols: " << GetSetSymbolCount() << endl;
01239 
01240         // remove redundant prefixes
01241         PruneSymbolSet_RemoveRedundantPrefixes();
01242         cout << "After Stage 2 Pruning (removed redundant prefixes), symbols: " << GetSetSymbolCount() << endl;
01243 
01244         // pre-capping recompression
01245         if (Parameter_PruneReCalculations)
01246                 {
01247                 // now actually re-compress the file using current symbol set to get better real-world weight counts, and repeat prunings
01248                 ReParseFileReComputeCosts(true);
01249                 PruneSymbolSet_WeightThreshold(Parameter_PruneMinimumWeight);
01250                 PruneSymbolSet_RemoveRedundantPrefixes();
01251                 // some info for user
01252                 cout << "After pre-ceil pre-compression, re-evaluation, and repruning, symbols: " << GetSetSymbolCount() << endl;
01253                 }
01254 
01255         // finally, cap symbol count.  there are smart and dumb ways this can be done
01256         PruneSymbolSet_ReduceSizeTo(Parameter_MaxSymbols_Final);
01257         cout << "After Stage 3 Pruning (killed worst scores to reach target), symbols: " << GetSetSymbolCount() << endl;
01258 
01259         // more recompression loops after capping
01260         if (Parameter_PruneReCalculations)
01261                 {
01262                 // now actually re-compress the file using current symbol set to get better real-world weight counts, and repeat prunings
01263                 ReParseFileReComputeCosts(true);
01264                 PruneSymbolSet_WeightThreshold(Parameter_PruneMinimumWeight);
01265                 PruneSymbolSet_RemoveRedundantPrefixes();
01266                 cout << "After re-compression, re-evaluation, and repruning, symbols: " << GetSetSymbolCount() << endl;
01267                 }
01268 }
01269 //---------------------------------------------------------------------------
01270 
01271 
01272 
01273 
01274 //---------------------------------------------------------------------------
01275 // For reparsing and recomputing scores
01276 
01277 double SubstringParser::ReParseFileReComputeCosts(bool updateweights)
01278 {
01279         // USING the current symbolset we have just created, reparse it and recalculate the 'cost'
01280         //  and frequency of each symbol.
01281         // This is useful because as we create and modify(prune) the symbolset, the symbols interact
01282         //  and effect frequencies, so its helpful to reparse the input and recalculate frequencies.
01283         // return sum cost of parsed symbols IFF updateweights == false
01284 
01285         // First we need to rebuild the symbol vector before we can parse
01286         BuildSymbolVector();
01287 
01288         int symbolnum;
01289         SubstringSymbol *symbolp;
01290         double totalcost=0;
01291 
01292         // reset input bitreader to where we started dictionarizing
01293         current_bitreaderp->seek_bit(start_bitreaderposition);
01294 
01295         // reset weights if we are recalculating them
01296         if (updateweights)
01297                 ResetWeights();
01298 
01299         // loop through input grabbing symbols
01300         // ATTN: are we properly updating weights?
01301         while (ParseNextSymbolFromInput(*current_bitreaderp,symbolnum))
01302                 {
01303                 symbolp=symbolvector[symbolnum];
01304                 if (updateweights)
01305                         {
01306                         // we can be run in a mode to actually update weights while encoding
01307                         if (updateweights)
01308                                 {
01309                                 // note this will not change priority queue, and is therefore somewhat skidding on what is 'allowed' by stl
01310                                 //  but we use this purposefully so that we are modifying the weights while preserving current priority queue.
01311                                 symbolp->increment_weight(1);
01312                                 // we need to also keep track of CRs for EOS's if appropriate
01313                                 if (Parameter_CountCRsAsEOTs && strchr(symbolp->get_valuep()->c_str(),13)!=NULL)
01314                                         {
01315                                         symbolp=FindNextSymbolToEncode_Greedy("",0);
01316                                         if (symbolp!=NULL)
01317                                                 symbolp->increment_weight(1);
01318                                         }
01319                                 }
01320                         }
01321                 else
01322                         totalcost+=symbolp->get_cost();
01323                 }
01324 
01325         // return success
01326         return totalcost;
01327 }
01328 //---------------------------------------------------------------------------
01329 
01330 
01331 //---------------------------------------------------------------------------
01332 // Internal symbol construction
01333 
01334 void SubstringParser::BuildSymbolVector()
01335 {
01336         // build a vector of symbol pointers, for efficient random access
01337         // clear any existing vector
01338         symbolvector.clear();
01339         // add symbols from set
01340         int index=0;
01341         for (symbolset_pos=symbolset.begin();symbolset_pos!=symbolset.end();++symbolset_pos)
01342                 {
01343                 // add symbol to vector
01344                 symbolvector.push_back(*symbolset_pos);
01345                 // tell symbol what number it is
01346                 (*symbolset_pos)->set_symbolvectorpos(index++);
01347                 }
01348         // now we need to update our count of how many symbols we have
01349         symbolcount=(int)(symbolvector.size());
01350         // and record the symbol representing the end-of-stream (EOS is "")
01351         endofstreamsymbolnum=LookupSymbolNumberFromText(string(""));
01352 }
01353 
01354 
01355 int SubstringParser::LookupSymbolNumberFromText(string wordstring)
01356 {
01357         // return the number of a symbol in the parsing symbolset (or -1 if not found)
01358         static SubstringSymbol searchSubstringSymbol("",1);
01359         // now we use this searchSubstringSymbol for searching
01360         searchSubstringSymbol.set_value(wordstring);
01361         symbolset_pos=symbolset.find(&searchSubstringSymbol);
01362         // if its not found, return -1
01363         if (symbolset_pos==symbolset.end())
01364                 return -1;
01365         // otherwise return its index symbol number
01366         return ((*symbolset_pos)->get_symbolvectorpos());
01367 }
01368 //---------------------------------------------------------------------------