Inja 3.5.0
A Template Engine for Modern C++
lexer.hpp
1#ifndef INCLUDE_INJA_LEXER_HPP_
2#define INCLUDE_INJA_LEXER_HPP_
3
4#include <cctype>
5#include <cstddef>
6#include <string_view>
7
8#include "config.hpp"
9#include "exceptions.hpp"
10#include "token.hpp"
11#include "utils.hpp"
12
13namespace inja {
14
15/*!
16 * \brief Class for lexing an inja Template.
17 */
18class Lexer {
19 enum class State {
20 Text,
21 ExpressionStart,
22 ExpressionStartForceLstrip,
23 ExpressionBody,
24 LineStart,
25 LineBody,
26 StatementStart,
27 StatementStartNoLstrip,
28 StatementStartForceLstrip,
29 StatementBody,
30 CommentStart,
31 CommentStartForceLstrip,
32 CommentBody,
33 };
34
35 enum class MinusState {
36 Operator,
37 Number,
38 };
39
40 const LexerConfig& config;
41
42 State state;
43 MinusState minus_state;
44 std::string_view m_in;
45 size_t tok_start;
46 size_t pos;
47
48 Token scan_body(std::string_view close, Token::Kind closeKind, std::string_view close_trim = std::string_view(), bool trim = false) {
49 again:
50 // skip whitespace (except for \n as it might be a close)
51 if (tok_start >= m_in.size()) {
52 return make_token(Token::Kind::Eof);
53 }
54 const char ch = m_in[tok_start];
55 if (ch == ' ' || ch == '\t' || ch == '\r') {
56 tok_start += 1;
57 goto again;
58 }
59
60 // check for close
61 if (!close_trim.empty() && inja::string_view::starts_with(m_in.substr(tok_start), close_trim)) {
62 state = State::Text;
63 pos = tok_start + close_trim.size();
64 const Token tok = make_token(closeKind);
65 skip_whitespaces_and_newlines();
66 return tok;
67 }
68
69 if (inja::string_view::starts_with(m_in.substr(tok_start), close)) {
70 state = State::Text;
71 pos = tok_start + close.size();
72 const Token tok = make_token(closeKind);
73 if (trim) {
74 skip_whitespaces_and_first_newline();
75 }
76 return tok;
77 }
78
79 // skip \n
80 if (ch == '\n') {
81 tok_start += 1;
82 goto again;
83 }
84
85 pos = tok_start + 1;
86 if (std::isalpha(ch)) {
87 minus_state = MinusState::Operator;
88 return scan_id();
89 }
90
91 const MinusState current_minus_state = minus_state;
92 if (minus_state == MinusState::Operator) {
93 minus_state = MinusState::Number;
94 }
95
96 switch (ch) {
97 case '+':
98 return make_token(Token::Kind::Plus);
99 case '-':
100 if (current_minus_state == MinusState::Operator) {
101 return make_token(Token::Kind::Minus);
102 }
103 return scan_number();
104 case '*':
105 return make_token(Token::Kind::Times);
106 case '/':
107 return make_token(Token::Kind::Slash);
108 case '^':
109 return make_token(Token::Kind::Power);
110 case '%':
111 return make_token(Token::Kind::Percent);
112 case '.':
113 return make_token(Token::Kind::Dot);
114 case ',':
115 return make_token(Token::Kind::Comma);
116 case ':':
117 return make_token(Token::Kind::Colon);
118 case '|':
119 return make_token(Token::Kind::Pipe);
120 case '(':
121 return make_token(Token::Kind::LeftParen);
122 case ')':
123 minus_state = MinusState::Operator;
124 return make_token(Token::Kind::RightParen);
125 case '[':
126 return make_token(Token::Kind::LeftBracket);
127 case ']':
128 minus_state = MinusState::Operator;
129 return make_token(Token::Kind::RightBracket);
130 case '{':
131 return make_token(Token::Kind::LeftBrace);
132 case '}':
133 minus_state = MinusState::Operator;
134 return make_token(Token::Kind::RightBrace);
135 case '>':
136 if (pos < m_in.size() && m_in[pos] == '=') {
137 pos += 1;
138 return make_token(Token::Kind::GreaterEqual);
139 }
140 return make_token(Token::Kind::GreaterThan);
141 case '<':
142 if (pos < m_in.size() && m_in[pos] == '=') {
143 pos += 1;
144 return make_token(Token::Kind::LessEqual);
145 }
146 return make_token(Token::Kind::LessThan);
147 case '=':
148 if (pos < m_in.size() && m_in[pos] == '=') {
149 pos += 1;
150 return make_token(Token::Kind::Equal);
151 }
152 return make_token(Token::Kind::Unknown);
153 case '!':
154 if (pos < m_in.size() && m_in[pos] == '=') {
155 pos += 1;
156 return make_token(Token::Kind::NotEqual);
157 }
158 return make_token(Token::Kind::Unknown);
159 case '\"':
160 return scan_string();
161 case '0':
162 case '1':
163 case '2':
164 case '3':
165 case '4':
166 case '5':
167 case '6':
168 case '7':
169 case '8':
170 case '9':
171 minus_state = MinusState::Operator;
172 return scan_number();
173 case '_':
174 case '@':
175 case '$':
176 minus_state = MinusState::Operator;
177 return scan_id();
178 default:
179 return make_token(Token::Kind::Unknown);
180 }
181 }
182
183 Token scan_id() {
184 for (;;) {
185 if (pos >= m_in.size()) {
186 break;
187 }
188 const char ch = m_in[pos];
189 if (!std::isalnum(ch) && ch != '.' && ch != '/' && ch != '_' && ch != '-') {
190 break;
191 }
192 pos += 1;
193 }
194 return make_token(Token::Kind::Id);
195 }
196
197 Token scan_number() {
198 for (;;) {
199 if (pos >= m_in.size()) {
200 break;
201 }
202 const char ch = m_in[pos];
203 // be very permissive in lexer (we'll catch errors when conversion happens)
204 if (!(std::isdigit(ch) || ch == '.' || ch == 'e' || ch == 'E' || (ch == '+' && (pos == 0 || m_in[pos-1] == 'e' || m_in[pos-1] == 'E')) || (ch == '-' && (pos == 0 || m_in[pos-1] == 'e' || m_in[pos-1] == 'E')))) {
205 break;
206 }
207 pos += 1;
208 }
209 return make_token(Token::Kind::Number);
210 }
211
212 Token scan_string() {
213 bool escape {false};
214 for (;;) {
215 if (pos >= m_in.size()) {
216 break;
217 }
218 const char ch = m_in[pos++];
219 if (ch == '\\') {
220 escape = !escape;
221 } else if (!escape && ch == m_in[tok_start]) {
222 break;
223 } else {
224 escape = false;
225 }
226 }
227 return make_token(Token::Kind::String);
228 }
229
230 Token make_token(Token::Kind kind) const {
231 return Token(kind, string_view::slice(m_in, tok_start, pos));
232 }
233
234 void skip_whitespaces_and_newlines() {
235 if (pos < m_in.size()) {
236 while (pos < m_in.size() && (m_in[pos] == ' ' || m_in[pos] == '\t' || m_in[pos] == '\n' || m_in[pos] == '\r')) {
237 pos += 1;
238 }
239 }
240 }
241
242 void skip_whitespaces_and_first_newline() {
243 if (pos < m_in.size()) {
244 while (pos < m_in.size() && (m_in[pos] == ' ' || m_in[pos] == '\t')) {
245 pos += 1;
246 }
247 }
248
249 if (pos < m_in.size()) {
250 const char ch = m_in[pos];
251 if (ch == '\n') {
252 pos += 1;
253 } else if (ch == '\r') {
254 pos += 1;
255 if (pos < m_in.size() && m_in[pos] == '\n') {
256 pos += 1;
257 }
258 }
259 }
260 }
261
262 static std::string_view clear_final_line_if_whitespace(std::string_view text) {
263 std::string_view result = text;
264 while (!result.empty()) {
265 const char ch = result.back();
266 if (ch == ' ' || ch == '\t') {
267 result.remove_suffix(1);
268 } else if (ch == '\n' || ch == '\r') {
269 break;
270 } else {
271 return text;
272 }
273 }
274 return result;
275 }
276
277public:
278 explicit Lexer(const LexerConfig& config): config(config), state(State::Text), minus_state(MinusState::Number), tok_start(0), pos(0) {}
279
280 SourceLocation current_position() const {
281 return get_source_location(m_in, tok_start);
282 }
283
284 void start(std::string_view input) {
285 m_in = input;
286 tok_start = 0;
287 pos = 0;
288 state = State::Text;
289 minus_state = MinusState::Number;
290
291 // Consume byte order mark (BOM) for UTF-8
292 if (inja::string_view::starts_with(m_in, "\xEF\xBB\xBF")) {
293 m_in = m_in.substr(3);
294 }
295 }
296
297 Token scan() {
298 tok_start = pos;
299
300 again:
301 if (tok_start >= m_in.size()) {
302 return make_token(Token::Kind::Eof);
303 }
304
305 switch (state) {
306 default:
307 case State::Text: {
308 // fast-scan to first open character
309 const size_t open_start = m_in.substr(pos).find_first_of(config.open_chars);
310 if (open_start == std::string_view::npos) {
311 // didn't find open, return remaining text as text token
312 pos = m_in.size();
313 return make_token(Token::Kind::Text);
314 }
315 pos += open_start;
316
317 // try to match one of the opening sequences, and get the close
318 const std::string_view open_str = m_in.substr(pos);
319 bool must_lstrip = false;
320 if (inja::string_view::starts_with(open_str, config.expression_open)) {
321 if (inja::string_view::starts_with(open_str, config.expression_open_force_lstrip)) {
322 state = State::ExpressionStartForceLstrip;
323 must_lstrip = true;
324 } else {
325 state = State::ExpressionStart;
326 }
327 } else if (inja::string_view::starts_with(open_str, config.statement_open)) {
328 if (inja::string_view::starts_with(open_str, config.statement_open_no_lstrip)) {
329 state = State::StatementStartNoLstrip;
330 } else if (inja::string_view::starts_with(open_str, config.statement_open_force_lstrip)) {
331 state = State::StatementStartForceLstrip;
332 must_lstrip = true;
333 } else {
334 state = State::StatementStart;
335 must_lstrip = config.lstrip_blocks;
336 }
337 } else if (inja::string_view::starts_with(open_str, config.comment_open)) {
338 if (inja::string_view::starts_with(open_str, config.comment_open_force_lstrip)) {
339 state = State::CommentStartForceLstrip;
340 must_lstrip = true;
341 } else {
342 state = State::CommentStart;
343 must_lstrip = config.lstrip_blocks;
344 }
345 } else if ((pos == 0 || m_in[pos - 1] == '\n') && inja::string_view::starts_with(open_str, config.line_statement)) {
346 state = State::LineStart;
347 } else {
348 pos += 1; // wasn't actually an opening sequence
349 goto again;
350 }
351
352 std::string_view text = string_view::slice(m_in, tok_start, pos);
353 if (must_lstrip) {
354 text = clear_final_line_if_whitespace(text);
355 }
356
357 if (text.empty()) {
358 goto again; // don't generate empty token
359 }
360 return Token(Token::Kind::Text, text);
361 }
362 case State::ExpressionStart: {
363 state = State::ExpressionBody;
364 pos += config.expression_open.size();
365 return make_token(Token::Kind::ExpressionOpen);
366 }
367 case State::ExpressionStartForceLstrip: {
368 state = State::ExpressionBody;
369 pos += config.expression_open_force_lstrip.size();
370 return make_token(Token::Kind::ExpressionOpen);
371 }
372 case State::LineStart: {
373 state = State::LineBody;
374 pos += config.line_statement.size();
375 return make_token(Token::Kind::LineStatementOpen);
376 }
377 case State::StatementStart: {
378 state = State::StatementBody;
379 pos += config.statement_open.size();
380 return make_token(Token::Kind::StatementOpen);
381 }
382 case State::StatementStartNoLstrip: {
383 state = State::StatementBody;
384 pos += config.statement_open_no_lstrip.size();
385 return make_token(Token::Kind::StatementOpen);
386 }
387 case State::StatementStartForceLstrip: {
388 state = State::StatementBody;
389 pos += config.statement_open_force_lstrip.size();
390 return make_token(Token::Kind::StatementOpen);
391 }
392 case State::CommentStart: {
393 state = State::CommentBody;
394 pos += config.comment_open.size();
395 return make_token(Token::Kind::CommentOpen);
396 }
397 case State::CommentStartForceLstrip: {
398 state = State::CommentBody;
399 pos += config.comment_open_force_lstrip.size();
400 return make_token(Token::Kind::CommentOpen);
401 }
402 case State::ExpressionBody:
403 return scan_body(config.expression_close, Token::Kind::ExpressionClose, config.expression_close_force_rstrip);
404 case State::LineBody:
405 return scan_body("\n", Token::Kind::LineStatementClose);
406 case State::StatementBody:
407 return scan_body(config.statement_close, Token::Kind::StatementClose, config.statement_close_force_rstrip, config.trim_blocks);
408 case State::CommentBody: {
409 // fast-scan to comment close
410 const size_t end = m_in.substr(pos).find(config.comment_close);
411 if (end == std::string_view::npos) {
412 pos = m_in.size();
413 return make_token(Token::Kind::Eof);
414 }
415
416 // Check for trim pattern
417 const bool must_rstrip = inja::string_view::starts_with(m_in.substr(pos + end - 1), config.comment_close_force_rstrip);
418
419 // return the entire comment in the close token
420 state = State::Text;
421 pos += end + config.comment_close.size();
422 Token tok = make_token(Token::Kind::CommentClose);
423
424 if (must_rstrip || config.trim_blocks) {
425 skip_whitespaces_and_first_newline();
426 }
427 return tok;
428 }
429 }
430 }
431
432 const LexerConfig& get_config() const {
433 return config;
434 }
435};
436
437} // namespace inja
438
439#endif // INCLUDE_INJA_LEXER_HPP_
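
For orientation, here is a minimal sketch of driving the Lexer by hand. It assumes that a default-constructed LexerConfig carries the usual inja delimiters ({{ }}, {% %}, {# #}) and that Token exposes public kind and text members; both are assumptions about config.hpp and token.hpp, which are not shown on this page.

// Minimal usage sketch; see the assumptions noted above.
#include <iostream>

#include "lexer.hpp"

int main() {
  inja::LexerConfig config;          // assumed: default-constructs with the standard delimiters
  inja::Lexer lexer(config);

  lexer.start("Hello {{ name }}!");  // a UTF-8 BOM, if present, would be skipped here

  for (;;) {
    const inja::Token token = lexer.scan();
    if (token.kind == inja::Token::Kind::Eof) {  // `kind` is an assumed public member
      break;
    }
    std::cout << token.text << '\n';             // `text` is an assumed public member
  }
  // Expected token texts, in order: "Hello ", "{{", "name", "}}", "!"
}

The loop mirrors how a parser would consume the lexer: start() resets the state machine and scan() hands back one Token at a time until Kind::Eof.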
inja::Lexer: Class for lexing an inja Template. Definition: lexer.hpp:18
inja::LexerConfig: Class for lexer configuration. Definition: config.hpp:15
inja::Token: Helper-class for the inja Lexer. Definition: token.hpp:12
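
Continuing the usage sketch above, the whitespace-control paths (clear_final_line_if_whitespace and skip_whitespaces_and_newlines) can be seen with a statement block that uses the trim markers. The marker strings "{%-" and "-%}" are assumed to be the defaults for statement_open_force_lstrip and statement_close_force_rstrip in LexerConfig, which is not shown here.

// Whitespace-control sketch; the trim-marker strings are assumed defaults.
lexer.start("line one\n    {%- if done -%}   \nline two");

// What the lexer emits for this input:
//  * "{%-" matches statement_open_force_lstrip, so clear_final_line_if_whitespace()
//    drops the indentation after the last newline: the first Text token is "line one\n".
//  * "-%}" matches the close_trim argument of scan_body(), so after the StatementClose
//    token skip_whitespaces_and_newlines() consumes the trailing spaces and the newline.
//  * The following Text token is therefore "line two".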