Skip to content

Commit d53a5da

Browse files
committed
Tokenize comment contents
This allows context-dependent handling of special characters (such as '%' in URLs) and parsing of verbatim environments, without resorting to search/replace preprocessing.
1 parent bbdf03e commit d53a5da

File tree

4 files changed

+53
-66
lines changed

4 files changed

+53
-66
lines changed

library/PhpLatex/Lexer.php

+3-35
Original file line numberDiff line numberDiff line change
@@ -8,13 +8,14 @@ class PhpLatex_Lexer
88
const STATE_BSLASH = 1;
99
const STATE_CONTROL = 2;
1010
const STATE_SPACE = 3;
11-
const STATE_COMMENT = 4;
1211

1312
const TYPE_TEXT = 'text';
1413
const TYPE_SPACE = 'space';
1514
const TYPE_CWORD = 'cword';
1615
const TYPE_CSYMBOL = 'csymbol';
1716
const TYPE_SPECIAL = 'special';
17+
18+
/** @deprecated */
1819
const TYPE_COMMENT = 'comment';
1920

2021
protected $_str;
@@ -96,30 +97,6 @@ public function next()
9697
$buf = '';
9798

9899
do {
99-
// special handling for comments - if we're in the comment state parse everything up to first newline
100-
// no need to match it char by char
101-
if ($this->_state === self::STATE_COMMENT) {
102-
// at this point $this->_pos points to first char after '%' which started the comment.
103-
// _line and _column still point to position of '%'
104-
// The \G assertion is true only when the current matching position is at the start
105-
// point of the match, as specified by the offset argument.
106-
// https://www.php.net/manual/en/regexp.reference.escape.php
107-
preg_match('#\G(?<comment>.*)#', $this->_str, $matches, 0, $this->_pos);
108-
109-
if (strlen($matches['comment'])) {
110-
$this->_column++; // normally column would be incremented in _getChar()
111-
$this->storeTokenPosition();
112-
113-
// adjust counters, so that call to _getChar()
114-
$this->_pos += strlen($matches['comment']);
115-
$this->_column += strlen($matches['comment']) - 1;
116-
117-
return $this->_setToken(self::TYPE_COMMENT, $matches['comment']);
118-
} else {
119-
$this->_state = self::STATE_DEFAULT;
120-
}
121-
}
122-
123100
$c = $this->_getChar();
124101

125102
switch ($c) {
@@ -246,9 +223,7 @@ public function next()
246223

247224
$this->storeTokenPosition();
248225

249-
$token = $this->_setToken(self::TYPE_SPECIAL, '%');
250-
$this->_state = self::STATE_COMMENT;
251-
return $token;
226+
return $this->_setToken(self::TYPE_SPECIAL, '%');
252227

253228
case self::STATE_BSLASH:
254229
return $this->_setToken(self::TYPE_CSYMBOL, '\\%');
@@ -261,13 +236,6 @@ public function next()
261236
case self::STATE_SPACE:
262237
$this->_ungetChar();
263238
return $this->_setSpaceToken($buf);
264-
265-
case self::STATE_COMMENT:
266-
if (!strlen($buf)) {
267-
$this->storeTokenPosition();
268-
}
269-
$buf .= $c;
270-
break;
271239
}
272240
break;
273241

library/PhpLatex/Parser.php

+28-26
Original file line numberDiff line numberDiff line change
@@ -276,7 +276,7 @@ protected function _parseExpr($state, $environ = null) // {{{
276276
return $this->_parseText($token, $state);
277277

278278
case PhpLatex_Lexer::TYPE_COMMENT:
279-
$this->_skipSpaces();
279+
$this->_skipSpacesAndComments();
280280
break;
281281

282282
default:
@@ -513,7 +513,7 @@ protected function _parseControl($token, $mode, $environ = null) // {{{
513513
// Skip all spaces and comments occurring after this token, if this
514514
// token is a control word.
515515
if ($token['type'] === PhpLatex_Lexer::TYPE_CWORD) {
516-
$this->_skipSpaces();
516+
$this->_skipSpacesAndComments();
517517
}
518518

519519
$mathWrapper = null;
@@ -624,29 +624,32 @@ protected function _parseControl($token, $mode, $environ = null) // {{{
624624
*
625625
* After this function has run current token, if exists, is neither space
626626
* nor comment.
627-
*
628-
* @return array skipped SPACE and COMMENT tokens
629627
*/
630-
protected function _skipSpaces()
628+
protected function _skipSpacesAndComments($inComment = false)
631629
{
632-
$skipped = array();
633630
while ($next = $this->_peek()) {
634-
if ($next['type'] === PhpLatex_Lexer::TYPE_SPACE ||
635-
$next['type'] === PhpLatex_Lexer::TYPE_COMMENT ||
636-
($next['type'] === PhpLatex_Lexer::TYPE_SPECIAL && $next['value'] === '%')
637-
) {
638-
$skipped[] = $next;
639-
$this->_next();
631+
if ($inComment) {
632+
if (isset($next['raw']) && strpos($next['raw'], "\n") !== false) {
633+
$inComment = false;
634+
} else {
635+
$this->_next();
636+
}
640637
} else {
641-
break;
638+
if ($next['type'] === PhpLatex_Lexer::TYPE_SPECIAL && $next['value'] === '%') {
639+
$inComment = true;
640+
$this->_next();
641+
} else if ($next['type'] === PhpLatex_Lexer::TYPE_SPACE) {
642+
$this->_next();
643+
} else {
644+
break;
645+
}
642646
}
643647
}
644-
return $skipped;
645648
}
646649

647650
protected function _parseArg($mode, $environ, $parseArgs = true) // {{{
648651
{
649-
$this->_skipSpaces();
652+
$this->_skipSpacesAndComments();
650653

651654
if ($next = $this->_peek()) {
652655
switch ($next['type']) {
@@ -761,7 +764,7 @@ protected function _parseArg($mode, $environ, $parseArgs = true) // {{{
761764
*/
762765
protected function _parseOptArg($state, $environ) // {{{
763766
{
764-
$this->_skipSpaces();
767+
$this->_skipSpacesAndComments();
765768

766769
if (($next = $this->_peek()) &&
767770
($next['type'] === PhpLatex_Lexer::TYPE_SPECIAL) &&
@@ -789,16 +792,11 @@ protected function _parseOptArg($state, $environ) // {{{
789792
*/
790793
protected function _parseEnvName() // {{{
791794
{
792-
while (false !== ($next = $this->_peek())) {
793-
if ($next['type'] === PhpLatex_Lexer::TYPE_SPACE ||
794-
$next['type'] === PhpLatex_Lexer::TYPE_COMMENT ||
795-
($next['type'] === PhpLatex_Lexer::TYPE_SPECIAL && $next['value'] === '%')
796-
) {
797-
// 1. skip spaces and comments
798-
$this->_next();
799-
continue;
795+
// 1. Skip spaces and comments
796+
$this->_skipSpacesAndComments();
800797

801-
} elseif ($next['value'] !== '{') {
798+
while (false !== ($next = $this->_peek())) {
799+
if ($next['value'] !== '{') {
802800
// 2A. first encountered non-space token is not a curly bracket
803801
// Since start of group was expected, this token breaks opening
804802
// of an environment. Give it back and report failure.
@@ -959,6 +957,10 @@ protected function _parseSpecial($token, $state, $environ) // {{{
959957
$node->value = $value;
960958
return $node;
961959

960+
case '%':
961+
$this->_skipSpacesAndComments(true);
962+
break;
963+
962964
case '#':
963965
// currently not supported
964966
break;
@@ -981,7 +983,7 @@ protected function _parseLeftRight($token, $mode, $environs)
981983

982984
$environs = (array) $environs;
983985

984-
$this->_skipSpaces();
986+
$this->_skipSpacesAndComments();
985987
$next = $this->_peek();
986988
if (!$next) {
987989
return false;

tests/PhpLatex/Test/LexerTest.php

+21-4
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,8 @@ public function testComment()
1010
array('type' => PhpLatex_Lexer::TYPE_TEXT, 'value' => 'A', 'line' => 1, 'column' => 1),
1111
array('type' => PhpLatex_Lexer::TYPE_SPACE, 'value' => ' ', 'raw' => ' ', 'line' => 1, 'column' => 2),
1212
array('type' => PhpLatex_Lexer::TYPE_SPECIAL, 'value' => '%', 'line' => 1, 'column' => 3),
13-
array('type' => PhpLatex_Lexer::TYPE_COMMENT, 'value' => ' comment', 'line' => 1, 'column' => 4),
13+
array('type' => PhpLatex_Lexer::TYPE_SPACE, 'value' => ' ', 'raw' => ' ', 'line' => 1, 'column' => 4),
14+
array('type' => PhpLatex_Lexer::TYPE_TEXT, 'value' => 'comment', 'line' => 1, 'column' => 5),
1415
array('type' => PhpLatex_Lexer::TYPE_SPACE, 'value' => ' ', 'raw' => "\n", 'line' => 1, 'column' => 12),
1516
array('type' => PhpLatex_Lexer::TYPE_TEXT, 'value' => 'B', 'line' => 2, 'column' => 1),
1617
)
@@ -49,7 +50,8 @@ public function testCommentOnly()
4950
}
5051
$this->assertEquals(array(
5152
array('type' => PhpLatex_Lexer::TYPE_SPECIAL, 'value' => '%', 'line' => 1, 'column' => 1),
52-
array('type' => PhpLatex_Lexer::TYPE_COMMENT, 'value' => ' A', 'line' => 1, 'column' => 2),
53+
array('type' => PhpLatex_Lexer::TYPE_SPACE, 'value' => ' ', 'raw' => ' ', 'line' => 1, 'column' => 2),
54+
array('type' => PhpLatex_Lexer::TYPE_TEXT, 'value' => 'A', 'line' => 1, 'column' => 3),
5355
), $tokens);
5456
}
5557

@@ -96,12 +98,27 @@ public function testTokens()
9698
array('type' => PhpLatex_Lexer::TYPE_SPECIAL, 'value' => '}', 'line' => 5, 'column' => 15),
9799
array('type' => PhpLatex_Lexer::TYPE_SPACE, 'value' => ' ', 'raw' => ' ', 'line' => 5, 'column' => 16),
98100
array('type' => PhpLatex_Lexer::TYPE_SPECIAL, 'value' => '%', 'line' => 5, 'column' => 17),
99-
array('type' => PhpLatex_Lexer::TYPE_COMMENT, 'value' => ' comment in math mode', 'line' => 5, 'column' => 18),
101+
array('type' => PhpLatex_Lexer::TYPE_SPACE, 'value' => ' ', 'raw' => ' ', 'line' => 5, 'column' => 18),
102+
array('type' => PhpLatex_Lexer::TYPE_TEXT, 'value' => 'comment', 'line' => 5, 'column' => 19),
103+
array('type' => PhpLatex_Lexer::TYPE_SPACE, 'value' => ' ', 'raw' => ' ', 'line' => 5, 'column' => 26),
104+
array('type' => PhpLatex_Lexer::TYPE_TEXT, 'value' => 'in', 'line' => 5, 'column' => 27),
105+
array('type' => PhpLatex_Lexer::TYPE_SPACE, 'value' => ' ', 'raw' => ' ', 'line' => 5, 'column' => 29),
106+
array('type' => PhpLatex_Lexer::TYPE_TEXT, 'value' => 'math', 'line' => 5, 'column' => 30),
107+
array('type' => PhpLatex_Lexer::TYPE_SPACE, 'value' => ' ', 'raw' => ' ', 'line' => 5, 'column' => 34),
108+
array('type' => PhpLatex_Lexer::TYPE_TEXT, 'value' => 'mode', 'line' => 5, 'column' => 35),
100109
array('type' => PhpLatex_Lexer::TYPE_SPACE, 'value' => ' ', 'raw' => "\n", 'line' => 5, 'column' => 39),
101110
array('type' => PhpLatex_Lexer::TYPE_CSYMBOL, 'value' => '\]', 'line' => 6, 'column' => 1),
102111
array('type' => PhpLatex_Lexer::TYPE_CWORD, 'value' => '\par', 'raw' => "\n\n", 'line' => 6, 'column' => 3),
103112
array('type' => PhpLatex_Lexer::TYPE_SPECIAL, 'value' => '%', 'line' => 8, 'column' => 1),
104-
array('type' => PhpLatex_Lexer::TYPE_COMMENT, 'value' => '% Comment in text mode', 'line' => 8, 'column' => 2),
113+
array('type' => PhpLatex_Lexer::TYPE_SPECIAL, 'value' => '%', 'line' => 8, 'column' => 2),
114+
array('type' => PhpLatex_Lexer::TYPE_SPACE, 'value' => ' ', 'raw' => ' ', 'line' => 8, 'column' => 3),
115+
array('type' => PhpLatex_Lexer::TYPE_TEXT, 'value' => 'Comment', 'line' => 8, 'column' => 4),
116+
array('type' => PhpLatex_Lexer::TYPE_SPACE, 'value' => ' ', 'raw' => ' ', 'line' => 8, 'column' => 11),
117+
array('type' => PhpLatex_Lexer::TYPE_TEXT, 'value' => 'in', 'line' => 8, 'column' => 12),
118+
array('type' => PhpLatex_Lexer::TYPE_SPACE, 'value' => ' ', 'raw' => ' ', 'line' => 8, 'column' => 14),
119+
array('type' => PhpLatex_Lexer::TYPE_TEXT, 'value' => 'text', 'line' => 8, 'column' => 15),
120+
array('type' => PhpLatex_Lexer::TYPE_SPACE, 'value' => ' ', 'raw' => ' ', 'line' => 8, 'column' => 19),
121+
array('type' => PhpLatex_Lexer::TYPE_TEXT, 'value' => 'mode', 'line' => 8, 'column' => 20),
105122
array('type' => PhpLatex_Lexer::TYPE_SPACE, 'value' => ' ', 'raw' => "\n", 'line' => 8, 'column' => 24),
106123
array('type' => PhpLatex_Lexer::TYPE_TEXT, 'value' => 'End.', 'line' => 9, 'column' => 1),
107124
)

tests/PhpLatex/Test/Renderer/AbstractTest.php

+1-1
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ public function testIssue6()
4141
\eta_{12} \\\\
4242
\eta_{21} \\\\
4343
\eta_2
44-
\end{array}
44+
\end{array} % comment is here
4545
\]';
4646

4747
$parser = new PhpLatex_Parser();

0 commit comments

Comments
 (0)