Octane v1.01.20 - The Open Compression Toolkit for C++ http://octane.sourceforge.net/
Homepage | Main | Modules | Class Hierarchy | Compound List | File List | Compound Members | Related Pages

parsers/parser_substring/substringparser.hpp

Go to the documentation of this file.
00001 
00002 
00003 
00004 
00005 
00006 
00007 
00008 
00009 
00010 
00011 
00012 
00013 
00014 //---------------------------------------------------------------------------
00015 
00016 //---------------------------------------------------------------------------
00017 // recursive header protection
00018 #ifndef Octane_SubstringParserH
00019 #define Octane_SubstringParserH
00020 //---------------------------------------------------------------------------
00021 
00022 //---------------------------------------------------------------------------
00023 // application includes
00024 #include "../parser.hpp"
00025 #include "../../bitio/bitio.hpp"
00026 // system includes
00027 #include <vector>
00028 #include <set>
00029 #include <assert.h>
00030 using namespace std;
00031 //---------------------------------------------------------------------------
00032 
00033 
00034 
00035 
00036 //---------------------------------------------------------------------------
00037 // Different types of parsing modes (greedy is fastest and usually not
00038 //  significantly worse than LFF (longest substring first).  Deep is super
00039 //  slow and not noticeably better.
00040 #define Dictionary_ParseMode_GREEDY             0
00041 #define Dictionary_ParseMode_LFF                1
00042 #define Dictionary_ParseMode_DEEP               2
00043 //---------------------------------------------------------------------------
00044 
00045 
00046 //---------------------------------------------------------------------------
00047 // What kind of precision should we use for the weights? int should be fine.
00048 typedef unsigned int TSubStrParserWeight;
00049 #define TSubStrParserWeight_MAX UINTMAX
00050 
00051 // Biggest substring we will ever parser
00052 #define SubStrHuff_BiggestSubStringLen 100
00053 //---------------------------------------------------------------------------
00054 
00055 
00056 
00057 
00058 
00059 //---------------------------------------------------------------------------
00067 class SubstringSymbol
00068 {
00069 protected:
00071         TSubStrParserWeight weight;
00073         std::string value;
00075         int symbolvectorpos;
00076 public:
00077         // Constructor and destructor
00078         SubstringSymbol () {weight=0;value="";}
00079         SubstringSymbol (const std::string &invalue, TSubStrParserWeight inweight=0) {weight=inweight; value=invalue;}
00080         ~SubstringSymbol () {;}
00081 public:
00083         bool operator>( const SubstringSymbol * &a ) const {return weight > (a->get_weight());}
00084 public:
00086         TSubStrParserWeight get_weight() const {return weight;}
00088         void increment_weight(int increment) {weight+=increment;}
00090         void set_weight(TSubStrParserWeight val) {weight=val;}
00092         std::string* get_valuep() {return &value;}
00094         std::string& get_value() {return value;}
00096         int get_valuelen() {return (int)(value.length());}
00098         void set_value(const std::string &invalue) {value=invalue;}
00100         int get_symbolvectorpos() {return symbolvectorpos;};
00102         void set_symbolvectorpos(int inpos) {symbolvectorpos=inpos;};
00103 public:
00106         bool dontprune() {if (value.length()<=1) return true; else return false;}
00108         unsigned int get_memoryused() const {return (unsigned int)(sizeof(this)+value.capacity());}
00112         double get_cost() {return 1.0/(1.0+weight);};
00113 };
00114 
00115 
00120 template< class T >
00121 class TSubstringSymbolWeightGreater
00122         {
00123         public:
00124                 bool operator() (const T& pLeft, const T& pRight) const
00125                         { return pLeft->get_weight()>pRight->get_weight(); }
00126         };
00127 
00128 
00133 template< class T >
00134 class TSubstringSymbolStringGreater
00135         {
00136         public:
00137                 bool operator() (const T& pLeft, const T& pRight) const
00138                         { return pLeft->get_value()<pRight->get_value(); }
00139         };
00140 //---------------------------------------------------------------------------
00141 
00142 
00143 
00144 //---------------------------------------------------------------------------
00154 class SubstringParser : public OctaneParser
00155 {
00156 protected:
00158         typedef std::set< SubstringSymbol* , TSubstringSymbolStringGreater<SubstringSymbol*> > TSymbolSetTYPE;
00160         typedef std::vector< SubstringSymbol* > TSymbolVectorTYPE;
00162         TSymbolSetTYPE symbolset;
00164         TSymbolVectorTYPE symbolvector;
00166         TSymbolSetTYPE::iterator symbolset_pos;
00168         TSymbolVectorTYPE::iterator symbolvector_pos;
00170         char inputbufferstr[SubStrHuff_BiggestSubStringLen] ;
00172         int inputbufferlen;
00175         bool senteos;
00176 protected:
00177         // Encoding parameters which govern how dictionary is built (see SetDefaultDictionaryBuildingParameters implementation for descriptions)
00180         unsigned int Parameter_MaxSymbols_DuringBuild;
00183         unsigned int Parameter_MaxSymbols_Final;
00185         int Parameter_MaxSubstringSize;
00187         bool Parameter_OnlyCodeWholeWords;
00189         bool Parameter_SpanWordBoundaries;
00190         // When encoding single lines, each line must end with an end-of-stream (EOS) symbol
00191         //  but if we train on a file, which is large and has only one EOS at end of file
00192         //  then the freq. of EOS will be very low (1), so here we can say to count a pretend EOS
00193         //  whenever we see a CR.
00194         bool Parameter_CountCRsAsEOTs;
00196         unsigned int Parameter_PruneMinimumWeight;
00198         int Parameter_ParseMode;
00200         bool Parameter_UseSmartLookup;
00203         bool Parameter_PruneReCalculations;
00204 protected:
00206         int symbolcount;
00208         int endofstreamsymbolnum;
00210         bitreader *current_bitreaderp;
00212         size_t start_bitreaderposition;
00213 public:
00214         //---------------------------------------------------------------------------
00216         SubstringParser();
00218         ~SubstringParser();
00219         //---------------------------------------------------------------------------
00220 public:
00221         //---------------------------------------------------------------------------
00222         // PARSER PUBLIC API - PREPARATIONS FOR PARSING
00223         virtual bool CreateSymbolSetUsingStream(bitreader &from);
00224         virtual bool IsReadyToParse() {if (symbolcount>0) return true; else return false;};
00225         virtual bool RewindAnyBufferedInput(bitreader &from);
00226         //---------------------------------------------------------------------------
00227 public:
00228         //---------------------------------------------------------------------------
00229         // PARSER PUBLIC API - MAIN PARSING FUNCTIONS
00231         virtual void SynchronizeStateForNewStream();
00232         virtual int GetSymbolCount() {return symbolcount;};
00233         virtual bool ParseNextSymbolFromInput(bitreader &from, int &symbolnum);
00234         virtual bool WriteSymbolText(bitwriter &to, int symbolnum,bool &isendofstreamsymbol);
00235         virtual string LookupSymbolText(int symbolnum) {assert(symbolnum<(int)(symbolvector.size()));return symbolvector[symbolnum]->get_value();};
00236         //---------------------------------------------------------------------------
00237 public:
00238         //---------------------------------------------------------------------------
00239         // OCTANE PUBLIC API - AUXILIARY FUNCTIONS - these are supported by all derived classes
00240         virtual void ShowDebuggingInfo();
00241         virtual unsigned int GetMemoryUsed();
00242         virtual unsigned int GetDiskspaceUsed(bool fortempdecompressiononly);
00243         virtual std::string GetParametersInformation();
00244         virtual void SetDefaultParameters();
00245         virtual bool SetParameter(const std::string &parametername,const std::string &parametervalue);
00246         virtual bool SaveState(bitwriter &to,bool fortempdecompressiononly);
00247         virtual bool LoadState(bitreader &from);
00248         //---------------------------------------------------------------------------
00249 public:
00250         //---------------------------------------------------------------------------
00251         // OCTANE PARSER PUBLIC API - AUXILIARY FUNCTIONS - symbol helpers
00252         int GetSetSymbolCount() {return (int)(symbolset.size());};
00253         int LookupSymbolNumberFromText(string wordstring);
00254         //---------------------------------------------------------------------------
00255 private:
00256         // save and load state information
00259         bool SaveSymbols(bitwriter &to);
00262         bool LoadSymbols(bitreader &from);
00263 private:
00264         // symbol manipulation
00266         void AddPrimitiveCharacterSubstringSymbols();
00270         bool UpdateValueStringInSymbolSet(const string &stringvalue,int increment);
00272         void AddSubstringsFromSlidingWindowString(string &slidingwindowstring);
00274         SubstringSymbol* AddSymbol(const std::string &stringvalue, const TSubStrParserWeight weightvalue) {SubstringSymbol* p=new SubstringSymbol(stringvalue,weightvalue);if (p!=NULL) symbolset.insert(p);return p;}
00275 private:
00276         // internal save/load helper functions
00279         bool SaveParameters(bitwriter &to);
00282         bool LoadParameters(bitreader &from);
00285         bool ReadSubstringSymbol(bitreader &from);
00288         bool WriteSubstringSymbol(SubstringSymbol *SubstringSymbolp,bitwriter &to);
00289 private:
00291         void ResetWeights();
00292 private:
00293         // Prune symbols from the set
00295         void PruneSymbolSet();
00297         void PruneSymbolSet_ReduceSizeTo(unsigned int targetsize);
00299         void PruneSymbolSet_WeightThreshold(TSubStrParserWeight thresholdweight);
00301         void PruneSymbolSet_RemoveRedundantPrefixes();
00304         double ReParseFileReComputeCosts(bool updateweights);
00305 private:
00308         double GetSymbolCost(SubstringSymbol* symbolnodep) {return symbolnodep->get_cost();};
00309 private:
00310         // Internal encoding and finding symbols to encode
00313         SubstringSymbol* FindNextSymbolToEncode(char *inputqueuestr,int inputqueuestrlen);
00317         SubstringSymbol* FindNextSymbolToEncode_Greedy(char *inputqueuestr,int inputqueuestrlen);
00319         int SwallowSymbolFromInputQueueStr(SubstringSymbol *symbolp,char *inputqueuestr,int inputqueuestrlen,bool debugmode);
00322         double CalculateEncodeCost(char *inputqueuestr,int inputqueuestrlen);
00323 private:
00325         void BuildSymbolVector();
00326 private:
00328         void FreeData();
00330         void FreeData_Symbols();
00331 };
00332 //---------------------------------------------------------------------------
00333 
00334 
00335 
00336 
00337 
00338 //---------------------------------------------------------------------------
00339 #endif
00340 //---------------------------------------------------------------------------
00341 
 
Generated on 20 May 2004 by doxygen 1.3.3