PHP - Tokenizer Class

This class can tokenize a string into user-defined tokens. Supply an array with all the tokens to capture. The parse() method will return all matched tokens with their position and length. It can also capture strings in double quotes.
  1. <?php      
  2.      
  /**
   * Scans a string for user-defined tokens and, optionally, double-quoted strings.
   *
   * Typical use: pass the text to the constructor, load() or loadFile(), register
   * the tokens of interest with tokens(), then call parse().
   *
   * Words are separated by spaces, tabs and line breaks. Reported positions and
   * lengths are byte offsets (they come from strlen()), not character offsets.
   */
  class Tokenizer {

      /** Text to scan; null until something has been loaded. */
      protected $string;

      /** Token strings to look for. Defaults to an empty list so parse() is safe before tokens() is called. */
      protected $tokens = array();

      /** When true, parse() also reports double-quoted strings (quotes included). */
      protected $captureDoubleQuotes = false;

      /** Matches collected by the most recent parse() call. */
      protected $foundTokens = array();

      /**
       * @param string|null $string Text to scan; may be supplied later via load()/loadFile().
       */
      public function __construct($string = null) {
          $this->load($string);
      }

      /**
       * Replaces the text to scan.
       *
       * @param string|null $string
       */
      public function load($string) {
          $this->string = $string;
      }

      /**
       * Replaces the text to scan with the contents of a file.
       *
       * The previously loaded text is kept when the file cannot be read.
       *
       * @param string $fileName
       * @return bool True when the file was read, false otherwise.
       */
      public function loadFile($fileName) {
          if (!is_file($fileName) || !is_readable($fileName)) {
              return false;
          }

          $contents = file_get_contents($fileName);
          if ($contents === false) {
              // Never store the boolean failure value as the text to scan.
              return false;
          }

          $this->string = $contents;
          return true;
      }

      /**
       * Sets the list of tokens to capture.
       *
       * @param array $tokens List of token strings.
       */
      public function tokens(array $tokens) {
          $this->tokens = $tokens;
      }

      /**
       * Enables or disables reporting of double-quoted strings.
       *
       * @param bool $bool
       */
      public function captureDoubleQuotes($bool) {
          $this->captureDoubleQuotes = $bool;
      }

      /**
       * Scans the loaded text and returns every match.
       *
       * A word is built up character by character and reported as soon as it
       * equals one of the registered tokens. Whitespace discards the word built
       * so far. Inside double quotes no token matching takes place; a quote that
       * is immediately followed by another quote does not end the quoted string.
       *
       * @return array List of arrays with the keys 'token', 'position' and 'length'.
       */
      public function parse() {
          // Start clean so that calling parse() twice does not duplicate results.
          $this->foundTokens = array();

          $string = (string) $this->string;
          $length = strlen($string);

          // Compare as strings with strict matching; loose comparison would treat
          // numeric-looking text such as "01" and "1" as the same token.
          $tokens = array_map('strval', (array) $this->tokens);

          $currentToken = '';
          $quotes = false;

          for ($idx = 0; $idx < $length; ++$idx) {
              $char = $string[$idx];

              if ($quotes) {
                  $currentToken .= $char;
                  $nextChar = ($idx + 1 < $length) ? $string[$idx + 1] : '';

                  if ($char === '"' && $nextChar !== '"') {
                      if ($this->captureDoubleQuotes) {
                          $this->addToken($currentToken, $idx - strlen($currentToken) + 1, strlen($currentToken));
                      }
                      $quotes = false;
                      // Drop the quoted text so a token directly after the
                      // closing quote can still be recognised.
                      $currentToken = '';
                  }
                  continue;
              }

              if ($char === '"') {
                  $quotes = true;
                  $currentToken = $char;
                  continue;
              }

              // Line breaks separate words just like spaces and tabs do.
              if ($char === ' ' || $char === "\t" || $char === "\n" || $char === "\r") {
                  $currentToken = '';
                  continue;
              }

              $currentToken .= $char;
              if (in_array($currentToken, $tokens, true)) {
                  $this->addToken($currentToken, $idx - strlen($currentToken) + 1, strlen($currentToken));
                  $currentToken = '';
              }
          }

          return $this->foundTokens;
      }

      /**
       * Records one match.
       *
       * @param string $token  Matched text.
       * @param int    $pos    Byte offset of the first character of the match.
       * @param int    $length Byte length of the match.
       */
      protected function addToken($token, $pos, $length) {
          $this->foundTokens[] = array(
              'token' => $token,
              'position' => $pos,
              'length' => $length
          );
      }
  }
  88.           
  // Usage: copy-paste the Tokenizer class code into a file named "code.txt",
  // then use the Tokenizer class to pull user-defined tokens out of that code.
  // The parse() method returns an array of sub-arrays, each containing the
  // matching token, its position and its length.

  $tokenizer = new Tokenizer();
  $tokenizer->loadFile("code.txt");
  $tokenizer->captureDoubleQuotes(true);
  $tokenizer->tokens(array("public", "function", "{", "}", "while", "file_get_contents"));

  $foundTokens = $tokenizer->parse();
  print_r($foundTokens);
  100. ?>  

No comments:

Post a Comment