00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018 #ifndef Octane_SubstringParserH
00019 #define Octane_SubstringParserH
00020
00021
00022
00023
00024 #include "../parser.hpp"
00025 #include "../../bitio/bitio.hpp"
00026
00027 #include <vector>
00028 #include <set>
00029 #include <assert.h>
00030 using namespace std;
00031
00032
00033
00034
00035
00036
00037
00038
00039
00040 #define Dictionary_ParseMode_GREEDY 0
00041 #define Dictionary_ParseMode_LFF 1
00042 #define Dictionary_ParseMode_DEEP 2
00043
00044
00045
00046
00047
00048 typedef unsigned int TSubStrParserWeight;
00049 #define TSubStrParserWeight_MAX UINTMAX
00050
00051
00052 #define SubStrHuff_BiggestSubStringLen 100
00053
00054
00055
00056
00057
00058
00059
00067 class SubstringSymbol
00068 {
00069 protected:
00071 TSubStrParserWeight weight;
00073 std::string value;
00075 int symbolvectorpos;
00076 public:
00077
00078 SubstringSymbol () {weight=0;value="";}
00079 SubstringSymbol (const std::string &invalue, TSubStrParserWeight inweight=0) {weight=inweight; value=invalue;}
00080 ~SubstringSymbol () {;}
00081 public:
00083 bool operator>( const SubstringSymbol * &a ) const {return weight > (a->get_weight());}
00084 public:
00086 TSubStrParserWeight get_weight() const {return weight;}
00088 void increment_weight(int increment) {weight+=increment;}
00090 void set_weight(TSubStrParserWeight val) {weight=val;}
00092 std::string* get_valuep() {return &value;}
00094 std::string& get_value() {return value;}
00096 int get_valuelen() {return (int)(value.length());}
00098 void set_value(const std::string &invalue) {value=invalue;}
00100 int get_symbolvectorpos() {return symbolvectorpos;};
00102 void set_symbolvectorpos(int inpos) {symbolvectorpos=inpos;};
00103 public:
00106 bool dontprune() {if (value.length()<=1) return true; else return false;}
00108 unsigned int get_memoryused() const {return (unsigned int)(sizeof(this)+value.capacity());}
00112 double get_cost() {return 1.0/(1.0+weight);};
00113 };
00114
00115
00120 template< class T >
00121 class TSubstringSymbolWeightGreater
00122 {
00123 public:
00124 bool operator() (const T& pLeft, const T& pRight) const
00125 { return pLeft->get_weight()>pRight->get_weight(); }
00126 };
00127
00128
00133 template< class T >
00134 class TSubstringSymbolStringGreater
00135 {
00136 public:
00137 bool operator() (const T& pLeft, const T& pRight) const
00138 { return pLeft->get_value()<pRight->get_value(); }
00139 };
00140
00141
00142
00143
00144
00154 class SubstringParser : public OctaneParser
00155 {
00156 protected:
00158 typedef std::set< SubstringSymbol* , TSubstringSymbolStringGreater<SubstringSymbol*> > TSymbolSetTYPE;
00160 typedef std::vector< SubstringSymbol* > TSymbolVectorTYPE;
00162 TSymbolSetTYPE symbolset;
00164 TSymbolVectorTYPE symbolvector;
00166 TSymbolSetTYPE::iterator symbolset_pos;
00168 TSymbolVectorTYPE::iterator symbolvector_pos;
00170 char inputbufferstr[SubStrHuff_BiggestSubStringLen] ;
00172 int inputbufferlen;
00175 bool senteos;
00176 protected:
00177
00180 unsigned int Parameter_MaxSymbols_DuringBuild;
00183 unsigned int Parameter_MaxSymbols_Final;
00185 int Parameter_MaxSubstringSize;
00187 bool Parameter_OnlyCodeWholeWords;
00189 bool Parameter_SpanWordBoundaries;
00190
00191
00192
00193
00194 bool Parameter_CountCRsAsEOTs;
00196 unsigned int Parameter_PruneMinimumWeight;
00198 int Parameter_ParseMode;
00200 bool Parameter_UseSmartLookup;
00203 bool Parameter_PruneReCalculations;
00204 protected:
00206 int symbolcount;
00208 int endofstreamsymbolnum;
00210 bitreader *current_bitreaderp;
00212 size_t start_bitreaderposition;
00213 public:
00214
00216 SubstringParser();
00218 ~SubstringParser();
00219
00220 public:
00221
00222
00223 virtual bool CreateSymbolSetUsingStream(bitreader &from);
00224 virtual bool IsReadyToParse() {if (symbolcount>0) return true; else return false;};
00225 virtual bool RewindAnyBufferedInput(bitreader &from);
00226
00227 public:
00228
00229
00231 virtual void SynchronizeStateForNewStream();
00232 virtual int GetSymbolCount() {return symbolcount;};
00233 virtual bool ParseNextSymbolFromInput(bitreader &from, int &symbolnum);
00234 virtual bool WriteSymbolText(bitwriter &to, int symbolnum,bool &isendofstreamsymbol);
00235 virtual string LookupSymbolText(int symbolnum) {assert(symbolnum<(int)(symbolvector.size()));return symbolvector[symbolnum]->get_value();};
00236
00237 public:
00238
00239
00240 virtual void ShowDebuggingInfo();
00241 virtual unsigned int GetMemoryUsed();
00242 virtual unsigned int GetDiskspaceUsed(bool fortempdecompressiononly);
00243 virtual std::string GetParametersInformation();
00244 virtual void SetDefaultParameters();
00245 virtual bool SetParameter(const std::string ¶metername,const std::string ¶metervalue);
00246 virtual bool SaveState(bitwriter &to,bool fortempdecompressiononly);
00247 virtual bool LoadState(bitreader &from);
00248
00249 public:
00250
00251
00252 int GetSetSymbolCount() {return (int)(symbolset.size());};
00253 int LookupSymbolNumberFromText(string wordstring);
00254
00255 private:
00256
00259 bool SaveSymbols(bitwriter &to);
00262 bool LoadSymbols(bitreader &from);
00263 private:
00264
00266 void AddPrimitiveCharacterSubstringSymbols();
00270 bool UpdateValueStringInSymbolSet(const string &stringvalue,int increment);
00272 void AddSubstringsFromSlidingWindowString(string &slidingwindowstring);
00274 SubstringSymbol* AddSymbol(const std::string &stringvalue, const TSubStrParserWeight weightvalue) {SubstringSymbol* p=new SubstringSymbol(stringvalue,weightvalue);if (p!=NULL) symbolset.insert(p);return p;}
00275 private:
00276
00279 bool SaveParameters(bitwriter &to);
00282 bool LoadParameters(bitreader &from);
00285 bool ReadSubstringSymbol(bitreader &from);
00288 bool WriteSubstringSymbol(SubstringSymbol *SubstringSymbolp,bitwriter &to);
00289 private:
00291 void ResetWeights();
00292 private:
00293
00295 void PruneSymbolSet();
00297 void PruneSymbolSet_ReduceSizeTo(unsigned int targetsize);
00299 void PruneSymbolSet_WeightThreshold(TSubStrParserWeight thresholdweight);
00301 void PruneSymbolSet_RemoveRedundantPrefixes();
00304 double ReParseFileReComputeCosts(bool updateweights);
00305 private:
00308 double GetSymbolCost(SubstringSymbol* symbolnodep) {return symbolnodep->get_cost();};
00309 private:
00310
00313 SubstringSymbol* FindNextSymbolToEncode(char *inputqueuestr,int inputqueuestrlen);
00317 SubstringSymbol* FindNextSymbolToEncode_Greedy(char *inputqueuestr,int inputqueuestrlen);
00319 int SwallowSymbolFromInputQueueStr(SubstringSymbol *symbolp,char *inputqueuestr,int inputqueuestrlen,bool debugmode);
00322 double CalculateEncodeCost(char *inputqueuestr,int inputqueuestrlen);
00323 private:
00325 void BuildSymbolVector();
00326 private:
00328 void FreeData();
00330 void FreeData_Symbols();
00331 };
00332
00333
00334
00335
00336
00337
00338
00339 #endif
00340
00341