Main Page   Namespace List   Class Hierarchy   Compound List   File List   Compound Members   File Members  

G:/NewGrep/pattern.hpp

Go to the documentation of this file.
00001 static char version[] = "$Header: /TDS Product Cycle/0 Development/Software/MiscTools/NewGrep/pattern.hpp 3     00-11-03 10:19 Vwagner $";
00002 
00003 /*
00004 Copyright  2000 Teton Data Systems
00005 
00006 Description:
00007 
00008  */
00009 
00010 //! Needed because VS and VC++ (6) don't do namespaces well
00011 typedef std::vector<unsigned short> usv;
00012 
00013 //!Holds a set of states for the pattern match algorithm
00014 class StateSet: public  usv
00015   {
00016   private:
00017   public:
00018 
00019     //!Put a state into the set if it isn't already there
00020     //!\return true if new_state is 0
00021     bool Put(unsigned short new_state)          //!<new state to put in set
00022       {
00023         //! we only put in 'non-zero' states
00024         if (new_state)
00025           {
00026             //! if find doesn't, it returns end()
00027             if (usv::end() == std::find(usv::begin(), usv::end(), new_state))
00028               usv::push_back(new_state);
00029           }
00030 
00031         //! let caller know if we tried to put a 'zero' in
00032         return !new_state;
00033       }
00034 
00035     //!override the standard one because it gives back the current vector
00036     void clear()
00037       {
00038         erase(begin(), end());
00039       }
00040   };
00041 
00042 //!case_independent "pattern match".
00043 /*!
00044     In usage, two calls are made:
00045     The first is to the constructor to 'set' the pattern
00046     Then (possibly) multiple calls to operator () to check strings against the pattern.
00047     For example:
00048     \code
00049     Pattern magic("#?test#?");                  // look for "test" anywhere in the string
00050     if (magic("this is a test"))                // this would test true
00051       cout << "Yes, a match.\n";
00052     if (magic("a rather contrived example"))    // this would test false
00053       cout << "This is never gonna print\n";
00054     else cout << "No copy of \"test\" in that string\n.";
00055     \endcode
00056 
00057     Of course, both calls may be collapsed into a \b single "call" as follows:
00058     \code
00059     if (Pattern("#?test#?")("This is a test, isn't it?"))
00060       cout << "Yes, and successful at that!\n";
00061     \endcode
00062     \note this is not particularly efficient as the Pattern will be destructed at the end of the statment, but for a "quick and dirty" it's probably OK
00063 
00064     \b Pattern is designed to work with the \b STL algorithms as a functor.  For example,
00065     assume a std::vector of string already filled which we wish to test against a pattern.
00066     The following would be one method:
00067     \code
00068     #include <vector>
00069     #include <algorithm>
00070     ...
00071     std::vector<string> text_stuff;
00072     ...                               // code to fill text_stuff
00073     ...
00074     // get an iterator which "points" to the first(1st) string in the vector
00075     std::vector<string>::const_iterator iter(text_stuff.begin());
00076     // output all strings which contain the pattern "this matches"
00077     while (text_stuff.end() != (iter = find(iter, text_stuff.end(), Pattern("#?this'smatches#?"))))
00078       {
00079         cout << *iter;      // output string which 'matches'
00080         ++iter;             // so we start search after this string
00081       }
00082     \endcode
00083     \note we bounded our pattern with #? so that the match could occur anywhere in the target string
00084     \note we also used 's (whitespace) so that we will match either a space (' ') or tab ('\t') between the two words.
00085 */
00086 class Pattern
00087   {
00088   private:
00089     std::string the_Pattern;          //!<the actual pattern we're searching for
00090     unsigned short patp;              //!<working pointer for the match function
00091     char ch;                          //!<current character from the_Pattern
00092     usv Compiled_Pattern;             //!<holds 'next state' info
00093     enum {endstreamch = 0x0d};
00094 
00095     //!get the next character (or 'endstreamch') from the pattern
00096     //!\return the next character (put it in ch also)
00097     inline char rch()
00098       {
00099         if (patp>=the_Pattern.size())       // if we're past the end
00100           return ch=endstreamch;            //   return a marker
00101         return ch=the_Pattern[patp++];      // otherwise return the character AND bump the pointer
00102       }
00103 
00104     //! get the next 'item' (remove squoted characters)
00105     //!\return then next character after skipping a squoted one if neccessary
00106     inline char nextitem()
00107       {
00108         if (ch=='\'')         // if the last thing was a 'squote'
00109           rch();              //   throw away what was squoted
00110         return rch();         // get the next thing
00111       }
00112 
00113     //! replace each element of a list starting at 'list' with val
00114     void setexits(unsigned short list       //!< the start of the list to replace
00115                  ,unsigned short val)       //!< the value to put in the list
00116       {
00117         while (list != 0)
00118           {
00119             unsigned short next(Compiled_Pattern[list]);
00120             Compiled_Pattern[list] = val;
00121             list = next;
00122           }
00123       }
00124 
00125     //! append a value (or list) to a list
00126     //!\return either val (if the list is empty) or list (if it's not)
00127     unsigned short join(unsigned short list   //!< start of list
00128                        ,unsigned short val)   //!< value to append (note this COULD be the start of a new list)
00129       {
00130         if (list == 0) return(val);
00131         unsigned short save_head(list);
00132 
00133         //! follow the list to the end
00134         while (Compiled_Pattern[list] != 0) list = Compiled_Pattern[list];
00135 
00136         //! add new element to the end
00137         Compiled_Pattern[list] = val;
00138         return(save_head);
00139       }
00140 
00141     //!collect stuff until we get an endstreamch or an unmatched )
00142     unsigned short expand (unsigned short altp)
00143       {
00144         unsigned short exits = 0;
00145         unsigned short a;
00146 
00147         for (;;)
00148           {
00149             a = primative();
00150               if ((ch == '`') || (ch == '|') || (ch == ')') || (ch == endstreamch))
00151               {
00152                 exits = join(exits, a);
00153 
00154                   if ( ( ch != '`' ) && ( ch != '|' ) )
00155                   {
00156                     return(exits);
00157                   }
00158 
00159                 Compiled_Pattern[altp] = patp;
00160                 altp = patp;
00161                 nextitem();
00162               }
00163             else                      // must be just an 'item'
00164               {
00165                 setexits(a, patp);    // 
00166               }
00167           }
00168       }
00169 
00170     //!get a primative from the pattern
00171     /*!
00172     \return pointer (well, patp) of the 1st char of the primative
00173 
00174     a primative is:
00175     \li an item
00176     \li a '(' followed by a primative followed by a ')'
00177     \li a '#' followed by a primative
00178     */
00179     unsigned short primative(void)
00180       {
00181       unsigned short a;
00182       char op;
00183 
00184         a = patp;                 // save where we're starting
00185         op = ch;                  // save current ch for later switching
00186         nextitem();               // get the next item (we're gonna consume this one)
00187         switch (op)
00188           {
00189           case '(':
00190             a = expand(a);
00191             if (ch != ')')
00192               throw std::runtime_error("bad pattern: expected )");
00193             nextitem();           // ch == ')' or we would have thrown
00194             return(a);
00195 
00196           case endstreamch:
00197           case ')':
00198           case '`':
00199           case '|':
00200             throw std::runtime_error("bad pattern: unexpected ), `, |, or endstreamch");
00201 
00202           case '#':                   // 0 or more of
00203             setexits(primative(), a); // loop back to try it again
00204           default:
00205             return(a);
00206           }
00207       }
00208 
00209   protected:
00210   public:
00211 
00212     //!construct a Pattern object to search (case-independently)
00213     /*!\verbatim
00214     The following metacharacters are used in the pattern string
00215         ? = matches any single character except newline                   
00216         # = zero or more occurrances of following item               
00217         % = Matches the null string                   
00218        () = enclose multiple items to be considered a single item
00219        'a = any alphabetic character                                      
00220        'd = any digit                                                     
00221        'n = any alphanumeric                                              
00222        's = any white space character                                     
00223         ' = escape (match following metacharacter literally, '$ = $)     
00224         | = or (a|b) = a or b   
00225         ` = or (a`b) = a or b (added to allow easier input from MS command lines)  
00226 
00227       all other characters must be matched exactly
00228      \endverbatim
00229      \note The pattern much match the \b entire string presented to \b operator()()
00230      \note If you mean to look in only \b part of the string, enclose your pattern thusly:
00231      \note \b #?<your_pattern>#?
00232 
00233     */
00234     explicit Pattern(std::string pat)           //!<the pattern we'll be attempting to match
00235       : the_Pattern(pat), patp(0), Compiled_Pattern(pat.size()+1)
00236       {
00237         rch();                          //! prime the pump
00238         setexits(expand(0), 0);         //! generate Compiled_Pattern
00239       }
00240 
00241     bool operator() (const std::string& String_to_Test)
00242       {
00243         StateSet a_states, b_states;
00244         unsigned short Position_to_Test(0);
00245         bool success(false);                      //!<working variable that hold's success
00246         a_states.reserve(the_Pattern.size());
00247         b_states.reserve(the_Pattern.size());
00248         a_states.Put(1);
00249         a_states.Put(Compiled_Pattern[0]);
00250         for(;;)
00251           {
00252             //!for_all a_states .....
00253             for (StateSet::iterator i = a_states.begin(); i != a_states.end(); ++i)
00254               {
00255                 unsigned short p = *i;
00256                 unsigned short q = Compiled_Pattern[p];
00257                 switch (the_Pattern[p-1])
00258                   {
00259                   case '#':
00260                     a_states.Put(p+1);
00261                   case '%':
00262                     success |= a_states.Put(q);
00263                     break;
00264                   case '(':
00265                   case '`':
00266                   case '|':
00267                     a_states.Put(p+1);
00268                     a_states.Put(q);
00269                     break;
00270                   default:
00271                     break;
00272                   } // end switch (the_Pattern[p-1])
00273               } // end for (StateSet::iterator i = a_states.begin(); i != a_states.end(); ++i)
00274 
00275             //! are we done?  test for ONLY two (2) ways out of here
00276             if (Position_to_Test >= String_to_Test.size()) return success;
00277             if (!a_states.size()) return false;
00278 
00279             //!Check the next character in the "input" string
00280             success = false;
00281             b_states.clear();
00282             a_states.swap(b_states);
00283             ch = tolower(String_to_Test[Position_to_Test++]);     //! to make everything 'case independent'
00284 
00285             //!for_all b_states .....
00286             for (i = b_states.begin(); i != b_states.end(); ++i)
00287               {
00288                 unsigned short p = *i;
00289                 switch (char k = tolower(the_Pattern[p-1]))       //! to make everything 'case independent'
00290                   {
00291                   case '#':
00292                   case '`':
00293                   case '|':
00294                   case '%':
00295                   case '(':
00296                     continue;                   // nothing to check here
00297                   case '\'':                    // AHA!! one of our 'special' characters
00298                     switch (k = tolower(the_Pattern[p]))          //! to make everything 'case independent'
00299                       {
00300                       case 'a':
00301                         if (isalpha(ch))
00302                           k = ch;               // so they match later
00303                         break;
00304                       case 'd':
00305                         if (isdigit(ch))
00306                           k = ch;               // so they match later
00307                         else
00308                           k = '0';              // so they canNOT match
00309                         break;
00310                       case 'n':
00311                         if (isalnum(ch))
00312                           k = ch;               // so they match later
00313                         break;
00314                       case 's':
00315                         if (isspace(ch))
00316                           k = ch;               // so they match later
00317                         else
00318                           k = ' ';              // so they canNOT match
00319                       default:;     //! nothing to do for 'non-special' characters
00320                       } // end switch (k = tolower(the_Pattern[p]))
00321 
00322                   //!This is the important part... does the 'pattern' match the 'input string'
00323                   default:
00324                     if (ch == k)
00325                   case '?':     //! remember "?" matches anything
00326                       success |= a_states.Put(Compiled_Pattern[p]);
00327                     continue;
00328                   } // end switch (char k = tolower(the_Pattern[p-1]))
00329               } // end for (i = b_states.begin(); i != b_states.end(); ++i)
00330           } // end for (;;) ...... no way out here
00331       }
00332   };
00333 

Generated at Wed Dec 6 21:25:04 2000 for NewGrep by doxygen1.2.3 written by Dimitri van Heesch, © 1997-2000