- <?php
/**
 * Scans a string for a caller-supplied list of tokens and, optionally,
 * for double-quoted strings.
 *
 * Characters are accumulated into a candidate until the candidate is exactly
 * equal to one of the configured tokens; whitespace discards the candidate.
 * Every match is reported as array('token' => ..., 'position' => ...,
 * 'length' => ...), where position is the zero-based byte offset of the match
 * in the loaded string and length is its size in bytes.
 */
class Tokenizer {
    /** @var string|null|false Text to scan; set by load() or loadFile(). */
    protected $string;

    /** @var array Token strings to look for; empty until tokens() is called. */
    protected $tokens = array();

    /** @var bool Whether double-quoted strings are reported as tokens. */
    protected $captureDoubleQuotes = false;

    /** @var array Matches collected by the most recent parse() call. */
    protected $foundTokens = array();

    /**
     * @param string|null $string Optional text to scan.
     */
    public function __construct($string = null) {
        $this->load($string);
    }

    /**
     * Sets the text to scan.
     *
     * @param string|null $string
     */
    public function load($string) {
        $this->string = $string;
    }

    /**
     * Loads the text to scan from a file.
     *
     * If $fileName is not a regular file the previously loaded text is left
     * untouched and no error is raised.
     *
     * @param string $fileName
     */
    public function loadFile($fileName) {
        if (is_file($fileName)) {
            $this->string = file_get_contents($fileName);
        }
    }

    /**
     * Sets the list of tokens to look for.
     *
     * @param array $tokens Token strings, e.g. array("public", "{", "}").
     */
    public function tokens(array $tokens) {
        $this->tokens = $tokens;
    }

    /**
     * Enables or disables reporting of double-quoted strings as tokens.
     *
     * @param bool $bool
     */
    public function captureDoubleQuotes($bool) {
        $this->captureDoubleQuotes = $bool;
    }

    /**
     * Scans the loaded text and returns every match found.
     *
     * Inside a double-quoted string a doubled quote ("") is treated as an
     * escaped quote and does not end the string. A quoted string that is
     * never closed is not reported.
     *
     * @return array List of array('token' => string, 'position' => int, 'length' => int).
     */
    public function parse() {
        // Start from a clean slate so repeated calls do not accumulate duplicates.
        $this->foundTokens = array();

        // A missing or unreadable input is scanned as an empty string.
        $text = (string) $this->string;
        $length = strlen($text);

        // Compare as strings, strictly: a loose in_array() would treat
        // numeric-looking text such as "1e1" as equal to the token "10".
        $tokens = array_map('strval', (array) $this->tokens);

        $currentToken = '';
        $quotes = false;
        $idx = 0;
        while ($idx < $length) {
            $char = $text[$idx];
            if ($quotes) {
                $currentToken .= $char;
                if ($char === '"') {
                    if ($idx + 1 < $length && $text[$idx + 1] === '"') {
                        // Escaped quote: consume both characters so the second
                        // one is not mistaken for the closing quote.
                        $currentToken .= '"';
                        ++$idx;
                    } else {
                        if ($this->captureDoubleQuotes) {
                            $tokenLength = strlen($currentToken);
                            $this->addToken($currentToken, $idx - $tokenLength + 1, $tokenLength);
                        }
                        // Drop the quoted text so a token that directly
                        // follows the closing quote can still be matched.
                        $currentToken = '';
                        $quotes = false;
                    }
                }
            } elseif ($char === '"') {
                // An opening quote discards any partial candidate.
                $quotes = true;
                $currentToken = $char;
            } elseif ($char === ' ' || $char === "\t" || $char === "\n" || $char === "\r") {
                // Whitespace, including line breaks, separates candidates.
                $currentToken = '';
            } else {
                $currentToken .= $char;
                if (in_array($currentToken, $tokens, true)) {
                    $tokenLength = strlen($currentToken);
                    $this->addToken($currentToken, $idx - $tokenLength + 1, $tokenLength);
                    $currentToken = '';
                }
            }
            ++$idx;
        }
        return $this->foundTokens;
    }

    /**
     * Records one match.
     *
     * @param string $token  Matched text.
     * @param int    $pos    Zero-based byte offset of the match.
     * @param int    $length Length of the match in bytes.
     */
    protected function addToken($token, $pos, $length) {
        $this->foundTokens[] = array(
            'token' => $token,
            'position' => $pos,
            'length' => $length
        );
    }
}
// Usage: save the source of the Tokenizer class to a file named "code.txt",
// then let the Tokenizer extract a set of user-defined tokens from that code.
// parse() returns an array of sub-arrays, each holding the matched token,
// its position and its length.
$wantedTokens = array("public", "function", "{", "}", "while", "file_get_contents");

$tokenizer = new Tokenizer();
$tokenizer->loadFile("code.txt");
$tokenizer->tokens($wantedTokens);
$tokenizer->captureDoubleQuotes(true);

$matches = $tokenizer->parse();
print_r($matches);
- ?>
PHP - Tokenizer Class
This class tokenizes a string into user-defined tokens. Supply an array with all the tokens to capture; the parse() method returns every matched token together with its position and length. It can also capture strings enclosed in double quotes.
No comments:
Post a Comment