* @copyright 2006-2011 Squiz Pty Ltd (ABN 77 084 670 600)
 * @license   http://matrix.squiz.net/developer/tools/php_cs/licence BSD Licence
 * @link      http://pear.php.net/package/PHP_CodeSniffer
 */

/**
 * Tokenizes PHP code.
 *
 * @category  PHP
 * @package   PHP_CodeSniffer
 * @author    Greg Sherwood
 * @copyright 2006-2011 Squiz Pty Ltd (ABN 77 084 670 600)
 * @license   http://matrix.squiz.net/developer/tools/php_cs/licence BSD Licence
 * @version   Release: 1.3.3
 * @link      http://pear.php.net/package/PHP_CodeSniffer
 */
class PHP_CodeSniffer_Tokenizers_PHP
{

    /**
     * A list of tokens that are allowed to open a scope.
     *
     * This array also contains information about what kind of token the scope
     * opener uses to open and close the scope, if the token strictly requires
     * an opener, if the token can share a scope closer, and who it can be shared
     * with. An example of a token that shares a scope closer is a CASE scope.
     *
     * @var array
     */
    public $scopeOpeners = array(
                            T_IF            => array(
                                                'start'  => array(T_OPEN_CURLY_BRACKET),
                                                'end'    => array(T_CLOSE_CURLY_BRACKET),
                                                'strict' => false,
                                                'shared' => false,
                                                'with'   => array(),
                                               ),
                            T_TRY           => array(
                                                'start'  => array(T_OPEN_CURLY_BRACKET),
                                                'end'    => array(T_CLOSE_CURLY_BRACKET),
                                                'strict' => true,
                                                'shared' => false,
                                                'with'   => array(),
                                               ),
                            T_CATCH         => array(
                                                'start'  => array(T_OPEN_CURLY_BRACKET),
                                                'end'    => array(T_CLOSE_CURLY_BRACKET),
                                                'strict' => true,
                                                'shared' => false,
                                                'with'   => array(),
                                               ),
                            T_ELSE          => array(
                                                'start'  => array(T_OPEN_CURLY_BRACKET),
                                                'end'    => array(T_CLOSE_CURLY_BRACKET),
                                                'strict' => false,
                                                'shared' => false,
                                                'with'   => array(),
                                               ),
                            T_ELSEIF        => array(
                                                'start'  => array(T_OPEN_CURLY_BRACKET),
                                                'end'    => array(T_CLOSE_CURLY_BRACKET),
                                                'strict' => false,
                                                'shared' => false,
                                                'with'   => array(),
                                               ),
                            T_FOR           => array(
                                                'start'  => array(T_OPEN_CURLY_BRACKET),
                                                'end'    => array(T_CLOSE_CURLY_BRACKET),
                                                'strict' => false,
                                                'shared' => false,
                                                'with'   => array(),
                                               ),
                            T_FOREACH       => array(
                                                'start'  => array(T_OPEN_CURLY_BRACKET),
                                                'end'    => array(T_CLOSE_CURLY_BRACKET),
                                                'strict' => false,
                                                'shared' => false,
                                                'with'   => array(),
                                               ),
                            T_INTERFACE     => array(
                                                'start'  => array(T_OPEN_CURLY_BRACKET),
                                                'end'    => array(T_CLOSE_CURLY_BRACKET),
                                                'strict' => true,
                                                'shared' => false,
                                                'with'   => array(),
                                               ),
                            T_FUNCTION      => array(
                                                'start'  => array(T_OPEN_CURLY_BRACKET),
                                                'end'    => array(T_CLOSE_CURLY_BRACKET),
                                                'strict' => true,
                                                'shared' => false,
                                                'with'   => array(),
                                               ),
                            T_CLASS         => array(
                                                'start'  => array(T_OPEN_CURLY_BRACKET),
                                                'end'    => array(T_CLOSE_CURLY_BRACKET),
                                                'strict' => true,
                                                'shared' => false,
                                                'with'   => array(),
                                               ),
                            T_NAMESPACE     => array(
                                                'start'  => array(T_OPEN_CURLY_BRACKET),
                                                'end'    => array(T_CLOSE_CURLY_BRACKET),
                                                'strict' => false,
                                                'shared' => false,
                                                'with'   => array(),
                                               ),
                            T_WHILE         => array(
                                                'start'  => array(T_OPEN_CURLY_BRACKET),
                                                'end'    => array(T_CLOSE_CURLY_BRACKET),
                                                'strict' => false,
                                                'shared' => false,
                                                'with'   => array(),
                                               ),
                            T_DO            => array(
                                                'start'  => array(T_OPEN_CURLY_BRACKET),
                                                'end'    => array(T_CLOSE_CURLY_BRACKET),
                                                'strict' => true,
                                                'shared' => false,
                                                'with'   => array(),
                                               ),
                            T_SWITCH        => array(
                                                'start'  => array(T_OPEN_CURLY_BRACKET),
                                                'end'    => array(T_CLOSE_CURLY_BRACKET),
                                                'strict' => true,
                                                'shared' => false,
                                                'with'   => array(),
                                               ),
                            T_CASE          => array(
                                                'start'  => array(
                                                             T_COLON,
                                                             T_SEMICOLON,
                                                            ),
                                                'end'    => array(T_BREAK),
                                                'strict' => true,
                                                'shared' => true,
                                                'with'   => array(
                                                             T_DEFAULT,
                                                             T_CASE,
                                                             T_SWITCH,
                                                            ),
                                               ),
                            T_DEFAULT       => array(
                                                'start'  => array(T_COLON),
                                                'end'    => array(T_BREAK),
                                                'strict' => true,
                                                'shared' => true,
                                                'with'   => array(
                                                             T_CASE,
                                                             T_SWITCH,
                                                            ),
                                               ),
                            T_START_HEREDOC => array(
                                                'start'  => array(T_START_HEREDOC),
                                                'end'    => array(T_END_HEREDOC),
                                                'strict' => true,
                                                'shared' => false,
                                                'with'   => array(),
                                               ),
                           );

    /**
     * A list of tokens that end the scope.
     *
     * This array is just a unique collection of the end tokens
     * from the $scopeOpeners array. The data is duplicated here to
     * save time during parsing of the file.
     *
     * @var array
     */
    public $endScopeTokens = array(
                              T_CLOSE_CURLY_BRACKET,
                              T_BREAK,
                              T_END_HEREDOC,
                             );


    /**
     * Creates an array of tokens when given some PHP code.
     *
     * Starts by using token_get_all() but does a lot of extra processing
     * to insert information about the context of the token:
     *  - a "\r" and "\n" split over two tokens are merged back together,
     *  - double quoted strings are merged into T_DOUBLE_QUOTED_STRING tokens,
     *  - heredoc/nowdoc bodies are merged into T_HEREDOC/T_NOWDOC tokens,
     *  - every multi-line token is split into one token per line,
     *  - T_ARRAY tokens followed by a variable before any opening
     *    parenthesis are re-tagged as T_ARRAY_HINT.
     *
     * @param string $string  The string to tokenize.
     * @param string $eolChar The EOL character to use for splitting strings.
     *                        Defaults to a real newline character.
     *
     * @return array
     */
    public function tokenizeString($string, $eolChar="\n")
    {
        // Errors are suppressed because token_get_all() can emit warnings
        // for code it cannot fully parse; we still want whatever it returns.
        $tokens      = @token_get_all($string);
        $finalTokens = array();

        $newStackPtr = 0;
        $numTokens   = count($tokens);
        for ($stackPtr = 0; $stackPtr < $numTokens; $stackPtr++) {
            $token        = $tokens[$stackPtr];
            $tokenIsArray = is_array($token);

            /*
                If we are using \r\n newline characters, the \r and \n are sometimes
                split over two tokens. This normally occurs after comments. We need
                to merge these two characters together so that our line endings are
                consistent for all lines.
            */

            if ($tokenIsArray === true && substr($token[1], -1) === "\r") {
                if (isset($tokens[($stackPtr + 1)]) === true
                    && is_array($tokens[($stackPtr + 1)]) === true
                    && $tokens[($stackPtr + 1)][1][0] === "\n"
                ) {
                    $token[1] .= "\n";

                    if ($tokens[($stackPtr + 1)][1] === "\n") {
                        // The next token's content has been merged into this token,
                        // so we can skip it.
                        $stackPtr++;
                    } else {
                        // Only the leading newline moved; keep the rest of the
                        // next token's content for its own iteration.
                        $tokens[($stackPtr + 1)][1] = substr($tokens[($stackPtr + 1)][1], 1);
                    }
                }
            }//end if

            /*
                If this is a double quoted string, PHP will tokenise the whole
                thing which causes problems with the scope map when braces are
                within the string. So we need to merge the tokens together to
                provide a single string.
            */

            if ($tokenIsArray === false && $token === '"') {
                $tokenContent = '"';
                $nestedVars   = array();
                for ($i = ($stackPtr + 1); $i < $numTokens; $i++) {
                    $subTokenIsArray = is_array($tokens[$i]);

                    if ($subTokenIsArray === true) {
                        $tokenContent .= $tokens[$i][1];
                        if ($tokens[$i][1] === '{'
                            && $tokens[$i][0] !== T_ENCAPSED_AND_WHITESPACE
                        ) {
                            // Start of a complex variable, e.g. "{$var}".
                            $nestedVars[] = $i;
                        }
                    } else {
                        $tokenContent .= $tokens[$i];
                        if ($tokens[$i] === '}') {
                            array_pop($nestedVars);
                        }
                    }

                    if ($subTokenIsArray === false
                        && $tokens[$i] === '"'
                        && empty($nestedVars) === true
                    ) {
                        // We found the other end of the double quoted string.
                        break;
                    }
                }//end for

                $stackPtr = $i;

                // Convert each line within the double quoted string to a
                // new token, so it conforms with other multiple line tokens.
                $lineTokens = $this->_createLineTokens(
                    $tokenContent,
                    $eolChar,
                    T_DOUBLE_QUOTED_STRING,
                    'T_DOUBLE_QUOTED_STRING'
                );

                foreach ($lineTokens as $lineToken) {
                    $finalTokens[$newStackPtr] = $lineToken;
                    $newStackPtr++;
                }

                // Continue, as we're done with this token.
                continue;
            }//end if

            /*
                If this is a heredoc, PHP will tokenise the whole thing, which
                causes problems because heredocs almost never contain real PHP
                code. We want to leave the start and end heredoc tokens
                alone though.
            */

            if ($tokenIsArray === true && $token[0] === T_START_HEREDOC) {
                // Add the start heredoc token to the final array.
                $finalTokens[$newStackPtr] = PHP_CodeSniffer::standardiseToken($token);

                // Check if this is actually a nowdoc and use a different token
                // to help the sniffs. PHP allows spaces and tabs between "<<<"
                // and the label, so look for the quote anywhere in the opener
                // instead of at a fixed offset.
                $nowdoc = (strpos($token[1], "'") !== false);
                if ($nowdoc === true) {
                    $finalTokens[$newStackPtr]['code'] = T_START_NOWDOC;
                    $finalTokens[$newStackPtr]['type'] = 'T_START_NOWDOC';
                }

                $newStackPtr++;

                $tokenContent = '';
                for ($i = ($stackPtr + 1); $i < $numTokens; $i++) {
                    $subTokenIsArray = is_array($tokens[$i]);
                    if ($subTokenIsArray === true
                        && $tokens[$i][0] === T_END_HEREDOC
                    ) {
                        // We found the other end of the heredoc.
                        break;
                    }

                    if ($subTokenIsArray === true) {
                        $tokenContent .= $tokens[$i][1];
                    } else {
                        $tokenContent .= $tokens[$i];
                    }
                }

                $stackPtr = $i;

                // Convert each line within the heredoc to a
                // new token, so it conforms with other multiple line tokens.
                if ($nowdoc === true) {
                    $lineTokens = $this->_createLineTokens($tokenContent, $eolChar, T_NOWDOC, 'T_NOWDOC');
                } else {
                    $lineTokens = $this->_createLineTokens($tokenContent, $eolChar, T_HEREDOC, 'T_HEREDOC');
                }

                foreach ($lineTokens as $lineToken) {
                    $finalTokens[$newStackPtr] = $lineToken;
                    $newStackPtr++;
                }

                // Add the end heredoc token to the final array. If the heredoc
                // is unterminated, the search above ran off the end of the token
                // list and there is no closing token to add.
                if (isset($tokens[$stackPtr]) === true) {
                    $finalTokens[$newStackPtr] = PHP_CodeSniffer::standardiseToken($tokens[$stackPtr]);

                    if ($nowdoc === true) {
                        $finalTokens[$newStackPtr]['code'] = T_END_NOWDOC;
                        $finalTokens[$newStackPtr]['type'] = 'T_END_NOWDOC';
                    }

                    $newStackPtr++;
                }

                // Continue, as we're done with this token.
                continue;
            }//end if

            /*
                If this token has newlines in its content, split each line up
                and create a new token for each line. We do this so it's easier
                to ascertain where errors occur on a line.
                Note that $token[1] is the token's content.
            */

            if ($tokenIsArray === true && strpos($token[1], $eolChar) !== false) {
                $lineTokens = $this->_createLineTokens(
                    $token[1],
                    $eolChar,
                    $token[0],
                    token_name($token[0])
                );

                foreach ($lineTokens as $lineToken) {
                    $finalTokens[$newStackPtr] = $lineToken;
                    $newStackPtr++;
                }
            } else {
                $newToken = PHP_CodeSniffer::standardiseToken($token);

                // This is a special condition for T_ARRAY tokens used to
                // type hint function arguments as being arrays. We want to keep
                // the parenthesis map clean, so let's tag these tokens as
                // T_ARRAY_HINT.
                if ($newToken['code'] === T_ARRAY) {
                    for ($i = $stackPtr; $i < $numTokens; $i++) {
                        if (is_array($tokens[$i]) === false) {
                            if ($tokens[$i] === '(') {
                                // A real array definition: array(...).
                                break;
                            }
                        } else if ($tokens[$i][0] === T_VARIABLE) {
                            // A variable came before any opening parenthesis.
                            $newToken['code'] = T_ARRAY_HINT;
                            $newToken['type'] = 'T_ARRAY_HINT';
                            break;
                        }
                    }
                }

                $finalTokens[$newStackPtr] = $newToken;
                $newStackPtr++;
            }//end if
        }//end for

        return $finalTokens;

    }//end tokenizeString()


    /**
     * Splits some content into one token per line.
     *
     * Every line except the last has the EOL character re-appended to its
     * content. If the content ends with the EOL character, the trailing
     * empty line does not produce a token.
     *
     * @param string $content The content to split.
     * @param string $eolChar The EOL character to split the content on.
     * @param mixed  $code    The token code to give each new token.
     * @param string $type    The token type name to give each new token.
     *
     * @return array A list of tokens with content, code and type indexes.
     */
    private function _createLineTokens($content, $eolChar, $code, $type)
    {
        $lineTokens = array();
        $lines      = explode($eolChar, $content);
        $lastLine   = (count($lines) - 1);

        foreach ($lines as $lineNum => $line) {
            if ($lineNum === $lastLine) {
                if ($line === '') {
                    // The content ended with an EOL character, so there is
                    // nothing left to tokenize.
                    break;
                }
            } else {
                $line .= $eolChar;
            }

            $lineTokens[] = array(
                             'content' => $line,
                             'code'    => $code,
                             'type'    => $type,
                            );
        }

        return $lineTokens;

    }//end _createLineTokens()


    /**
     * Performs additional processing after main tokenizing.
     *
     * This additional processing checks for CASE statements
     * that are using curly braces for scope openers and closers. It
     * also turns some T_FUNCTION tokens into T_CLOSURE when they
     * are not standard function definitions.
     *
     * The token array is walked backwards, so any CASE statement found
     * after the one being processed has already been fixed.
     *
     * @param array  &$tokens The array of tokens to process.
     * @param string $eolChar The EOL character to use for splitting strings.
     *                        Not used by this tokenizer.
     *
     * @return void
     */
    public function processAdditional(&$tokens, $eolChar)
    {
        if (PHP_CODESNIFFER_VERBOSITY > 1) {
            echo "\t*** START ADDITIONAL PHP PROCESSING ***".PHP_EOL;
        }

        $numTokens = count($tokens);
        for ($i = ($numTokens - 1); $i >= 0; $i--) {
            // Looking for functions that are actually closures.
            if ($tokens[$i]['code'] === T_FUNCTION
                && isset($tokens[$i]['scope_opener']) === true
            ) {
                // Find the first non-empty token after the function keyword.
                for ($x = ($i + 1); $x < $numTokens; $x++) {
                    if (in_array($tokens[$x]['code'], PHP_CodeSniffer_Tokens::$emptyTokens) === false) {
                        break;
                    }
                }

                // A function keyword followed directly by an opening
                // parenthesis has no name, so it is a closure.
                if (isset($tokens[$x]) === true
                    && $tokens[$x]['code'] === T_OPEN_PARENTHESIS
                ) {
                    $tokens[$i]['code'] = T_CLOSURE;
                    $tokens[$i]['type'] = 'T_CLOSURE';
                    if (PHP_CODESNIFFER_VERBOSITY > 1) {
                        $line = $tokens[$i]['line'];
                        echo "\t* token $i on line $line changed from T_FUNCTION to T_CLOSURE".PHP_EOL;
                    }

                    // Update the conditions of every token inside the closure
                    // so they refer to T_CLOSURE rather than T_FUNCTION.
                    for ($x = ($tokens[$i]['scope_opener'] + 1); $x < $tokens[$i]['scope_closer']; $x++) {
                        if (isset($tokens[$x]['conditions'][$i]) === false) {
                            continue;
                        }

                        $tokens[$x]['conditions'][$i] = T_CLOSURE;
                        if (PHP_CODESNIFFER_VERBOSITY > 1) {
                            $type = $tokens[$x]['type'];
                            echo "\t\t* cleaned $x ($type) *".PHP_EOL;
                        }
                    }
                }//end if

                continue;
            }//end if

            if (($tokens[$i]['code'] !== T_CASE && $tokens[$i]['code'] !== T_DEFAULT)
                || isset($tokens[$i]['scope_opener']) === false
            ) {
                // Only interested in CASE and DEFAULT statements
                // from here on in.
                continue;
            }

            $scopeOpener = $tokens[$i]['scope_opener'];
            $scopeCloser = $tokens[$i]['scope_closer'];

            // If the first char after the opener is a curly brace
            // and that brace has been ignored, it is actually
            // opening this case statement and the opener and closer are
            // probably set incorrectly.
            for ($x = ($scopeOpener + 1); $x < $numTokens; $x++) {
                if (in_array($tokens[$x]['code'], PHP_CodeSniffer_Tokens::$emptyTokens) === false) {
                    // Non-whitespace content.
                    break;
                }
            }

            if (isset($tokens[$x]) === false) {
                // Nothing but empty tokens after the opener, so there
                // is nothing to fix.
                continue;
            }

            if ($tokens[$x]['code'] === T_CASE) {
                // Special case for multiple CASE statements that
                // share the same closer. Because we are going
                // backwards through the file, this next CASE
                // statement is already fixed, so just use its
                // closer and don't worry about fixing anything.
                if (isset($tokens[$x]['scope_closer']) === false) {
                    // The next CASE has no known closer to share, so
                    // keep the closer that was already set.
                    continue;
                }

                $newCloser = $tokens[$x]['scope_closer'];
                $tokens[$i]['scope_closer'] = $newCloser;
                if (PHP_CODESNIFFER_VERBOSITY > 1) {
                    $oldType   = $tokens[$scopeCloser]['type'];
                    $newType   = $tokens[$newCloser]['type'];
                    $line      = $tokens[$i]['line'];
                    $tokenType = $tokens[$i]['type'];
                    echo "\t* token $i ($tokenType) on line $line closer changed from $scopeCloser ($oldType) to $newCloser ($newType)".PHP_EOL;
                }

                continue;
            }//end if

            if ($tokens[$x]['code'] !== T_OPEN_CURLY_BRACKET
                || isset($tokens[$x]['scope_condition']) === true
                || isset($tokens[$x]['bracket_closer']) === false
            ) {
                // Not a CASE with a curly brace opener, or the brace has
                // no matching closer to use.
                continue;
            }

            // The closer for this CASE/DEFAULT should be the closing
            // curly brace and not whatever it already is. The opener needs
            // to be the opening curly brace so everything matches up.
            $newCloser = $tokens[$x]['bracket_closer'];
            $tokens[$i]['scope_closer'] = $newCloser;
            $tokens[$x]['scope_closer'] = $newCloser;
            $tokens[$i]['scope_opener'] = $x;
            $tokens[$x]['scope_condition']         = $i;
            $tokens[$newCloser]['scope_condition'] = $i;
            $tokens[$newCloser]['scope_opener']    = $x;
            if (PHP_CODESNIFFER_VERBOSITY > 1) {
                $line      = $tokens[$i]['line'];
                $tokenType = $tokens[$i]['type'];

                $oldType = $tokens[$scopeOpener]['type'];
                $newType = $tokens[$x]['type'];
                echo "\t* token $i ($tokenType) on line $line opener changed from $scopeOpener ($oldType) to $x ($newType)".PHP_EOL;

                $oldType = $tokens[$scopeCloser]['type'];
                $newType = $tokens[$newCloser]['type'];
                echo "\t* token $i ($tokenType) on line $line closer changed from $scopeCloser ($oldType) to $newCloser ($newType)".PHP_EOL;
            }

            // Now fix up all the tokens that think they are
            // inside the CASE/DEFAULT statement when they are really outside.
            for ($x = $newCloser; $x < $scopeCloser; $x++) {
                foreach ($tokens[$x]['conditions'] as $num => $oldCond) {
                    if ($oldCond === $tokens[$i]['code']) {
                        $oldConditions = $tokens[$x]['conditions'];
                        unset($tokens[$x]['conditions'][$num]);

                        if (PHP_CODESNIFFER_VERBOSITY > 1) {
                            $type     = $tokens[$x]['type'];
                            $oldConds = implode(',', array_map('token_name', $oldConditions));
                            $newConds = implode(',', array_map('token_name', $tokens[$x]['conditions']));

                            echo "\t\t* cleaned $x ($type) *".PHP_EOL;
                            echo "\t\t\t=> conditions changed from $oldConds to $newConds".PHP_EOL;
                        }

                        // Only the first matching condition is removed.
                        break;
                    }
                }
            }//end for
        }//end for

        if (PHP_CODESNIFFER_VERBOSITY > 1) {
            echo "\t*** END ADDITIONAL PHP PROCESSING ***".PHP_EOL;
        }

    }//end processAdditional()


}//end class

?>