Inja 3.4.0
A Template Engine for Modern C++
lexer.hpp
#ifndef INCLUDE_INJA_LEXER_HPP_
#define INCLUDE_INJA_LEXER_HPP_

#include <cctype>
#include <locale>

#include "config.hpp"
#include "token.hpp"
#include "utils.hpp"

namespace inja {

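/*!
 * \brief Class for lexing an inja Template.
 */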
class Lexer {
  enum class State {
    Text,
    ExpressionStart,
    ExpressionStartForceLstrip,
    ExpressionBody,
    LineStart,
    LineBody,
    StatementStart,
    StatementStartNoLstrip,
    StatementStartForceLstrip,
    StatementBody,
    CommentStart,
    CommentStartForceLstrip,
    CommentBody,
  };

  enum class MinusState {
    Operator,
    Number,
  };

  const LexerConfig& config;

  State state;
  MinusState minus_state;
  std::string_view m_in;
  size_t tok_start;
  size_t pos;

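  // Scan one token inside an expression, statement or line-statement body: skip
  // whitespace, emit the close token when the closing (or rstrip) sequence is found,
  // otherwise dispatch to the identifier, number, operator and string scanners.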
  Token scan_body(std::string_view close, Token::Kind closeKind, std::string_view close_trim = std::string_view(), bool trim = false) {
  again:
    // skip whitespace (except for \n as it might be a close)
    if (tok_start >= m_in.size()) {
      return make_token(Token::Kind::Eof);
    }
    const char ch = m_in[tok_start];
    if (ch == ' ' || ch == '\t' || ch == '\r') {
      tok_start += 1;
      goto again;
    }

    // check for close
    if (!close_trim.empty() && inja::string_view::starts_with(m_in.substr(tok_start), close_trim)) {
      state = State::Text;
      pos = tok_start + close_trim.size();
      const Token tok = make_token(closeKind);
      skip_whitespaces_and_newlines();
      return tok;
    }

    if (inja::string_view::starts_with(m_in.substr(tok_start), close)) {
      state = State::Text;
      pos = tok_start + close.size();
      const Token tok = make_token(closeKind);
      if (trim) {
        skip_whitespaces_and_first_newline();
      }
      return tok;
    }

    // skip \n
    if (ch == '\n') {
      tok_start += 1;
      goto again;
    }

    pos = tok_start + 1;
    if (std::isalpha(ch)) {
      minus_state = MinusState::Operator;
      return scan_id();
    }

    const MinusState current_minus_state = minus_state;
    if (minus_state == MinusState::Operator) {
      minus_state = MinusState::Number;
    }

    switch (ch) {
    case '+':
      return make_token(Token::Kind::Plus);
    case '-':
      if (current_minus_state == MinusState::Operator) {
        return make_token(Token::Kind::Minus);
      }
      return scan_number();
    case '*':
      return make_token(Token::Kind::Times);
    case '/':
      return make_token(Token::Kind::Slash);
    case '^':
      return make_token(Token::Kind::Power);
    case '%':
      return make_token(Token::Kind::Percent);
    case '.':
      return make_token(Token::Kind::Dot);
    case ',':
      return make_token(Token::Kind::Comma);
    case ':':
      return make_token(Token::Kind::Colon);
    case '(':
      return make_token(Token::Kind::LeftParen);
    case ')':
      minus_state = MinusState::Operator;
      return make_token(Token::Kind::RightParen);
    case '[':
      return make_token(Token::Kind::LeftBracket);
    case ']':
      minus_state = MinusState::Operator;
      return make_token(Token::Kind::RightBracket);
    case '{':
      return make_token(Token::Kind::LeftBrace);
    case '}':
      minus_state = MinusState::Operator;
      return make_token(Token::Kind::RightBrace);
    case '>':
      if (pos < m_in.size() && m_in[pos] == '=') {
        pos += 1;
        return make_token(Token::Kind::GreaterEqual);
      }
      return make_token(Token::Kind::GreaterThan);
    case '<':
      if (pos < m_in.size() && m_in[pos] == '=') {
        pos += 1;
        return make_token(Token::Kind::LessEqual);
      }
      return make_token(Token::Kind::LessThan);
    case '=':
      if (pos < m_in.size() && m_in[pos] == '=') {
        pos += 1;
        return make_token(Token::Kind::Equal);
      }
      return make_token(Token::Kind::Unknown);
    case '!':
      if (pos < m_in.size() && m_in[pos] == '=') {
        pos += 1;
        return make_token(Token::Kind::NotEqual);
      }
      return make_token(Token::Kind::Unknown);
    case '\"':
      return scan_string();
    case '0':
    case '1':
    case '2':
    case '3':
    case '4':
    case '5':
    case '6':
    case '7':
    case '8':
    case '9':
      minus_state = MinusState::Operator;
      return scan_number();
    case '_':
    case '@':
    case '$':
      minus_state = MinusState::Operator;
      return scan_id();
    default:
      return make_token(Token::Kind::Unknown);
    }
  }

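  // Consume an identifier; '.', '/', '_' and '-' are accepted so that dotted and
  // path-like names form a single token.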
  Token scan_id() {
    for (;;) {
      if (pos >= m_in.size()) {
        break;
      }
      const char ch = m_in[pos];
      if (!std::isalnum(ch) && ch != '.' && ch != '/' && ch != '_' && ch != '-') {
        break;
      }
      pos += 1;
    }
    return make_token(Token::Kind::Id);
  }

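  // Consume a number literal, including '.' and signed exponents; malformed literals
  // are only rejected later, when the value is converted.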
  Token scan_number() {
    for (;;) {
      if (pos >= m_in.size()) {
        break;
      }
      const char ch = m_in[pos];
      // be very permissive in lexer (we'll catch errors when conversion happens)
      if (!(std::isdigit(ch) || ch == '.' || ch == 'e' || ch == 'E' || (ch == '+' && (pos == 0 || m_in[pos-1] == 'e' || m_in[pos-1] == 'E')) || (ch == '-' && (pos == 0 || m_in[pos-1] == 'e' || m_in[pos-1] == 'E')))) {
        break;
      }
      pos += 1;
    }
    return make_token(Token::Kind::Number);
  }

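  // Consume a quoted string, honoring backslash escapes, up to the closing quote.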
  Token scan_string() {
    bool escape {false};
    for (;;) {
      if (pos >= m_in.size()) {
        break;
      }
      const char ch = m_in[pos++];
      if (ch == '\\') {
        escape = !escape;
      } else if (!escape && ch == m_in[tok_start]) {
        break;
      } else {
        escape = false;
      }
    }
    return make_token(Token::Kind::String);
  }

  Token make_token(Token::Kind kind) const {
    return Token(kind, string_view::slice(m_in, tok_start, pos));
  }

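  // Advance past any run of spaces, tabs and newlines (used after an rstrip close).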
  void skip_whitespaces_and_newlines() {
    if (pos < m_in.size()) {
      while (pos < m_in.size() && (m_in[pos] == ' ' || m_in[pos] == '\t' || m_in[pos] == '\n' || m_in[pos] == '\r')) {
        pos += 1;
      }
    }
  }

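  // Advance past trailing spaces and tabs plus at most one newline ('\n' or '\r\n'),
  // as required by trim_blocks and the trimmed close markers.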
  void skip_whitespaces_and_first_newline() {
    if (pos < m_in.size()) {
      while (pos < m_in.size() && (m_in[pos] == ' ' || m_in[pos] == '\t')) {
        pos += 1;
      }
    }

    if (pos < m_in.size()) {
      const char ch = m_in[pos];
      if (ch == '\n') {
        pos += 1;
      } else if (ch == '\r') {
        pos += 1;
        if (pos < m_in.size() && m_in[pos] == '\n') {
          pos += 1;
        }
      }
    }
  }

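  // If the last line of the given text consists only of spaces and tabs, return the
  // text with that line removed; otherwise return it unchanged (lstrip handling).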
  static std::string_view clear_final_line_if_whitespace(std::string_view text) {
    std::string_view result = text;
    while (!result.empty()) {
      const char ch = result.back();
      if (ch == ' ' || ch == '\t') {
        result.remove_suffix(1);
      } else if (ch == '\n' || ch == '\r') {
        break;
      } else {
        return text;
      }
    }
    return result;
  }

public:
  explicit Lexer(const LexerConfig& config): config(config), state(State::Text), minus_state(MinusState::Number) {}

  SourceLocation current_position() const {
    return get_source_location(m_in, tok_start);
  }

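  // Reset the lexer to the beginning of a new input, dropping a UTF-8 BOM if present.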
  void start(std::string_view input) {
    m_in = input;
    tok_start = 0;
    pos = 0;
    state = State::Text;
    minus_state = MinusState::Number;

    // Consume byte order mark (BOM) for UTF-8
    if (inja::string_view::starts_with(m_in, "\xEF\xBB\xBF")) {
      m_in = m_in.substr(3);
    }
  }

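  // Produce the next token. In the Text state this fast-scans to the next opening
  // sequence and applies the lstrip rules; inside a block it delegates to scan_body,
  // and comments are consumed as a whole up to their closing sequence.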
  Token scan() {
    tok_start = pos;

  again:
    if (tok_start >= m_in.size()) {
      return make_token(Token::Kind::Eof);
    }

    switch (state) {
    default:
    case State::Text: {
      // fast-scan to first open character
      const size_t open_start = m_in.substr(pos).find_first_of(config.open_chars);
      if (open_start == std::string_view::npos) {
        // didn't find open, return remaining text as text token
        pos = m_in.size();
        return make_token(Token::Kind::Text);
      }
      pos += open_start;

      // try to match one of the opening sequences, and get the close
      std::string_view open_str = m_in.substr(pos);
      bool must_lstrip = false;
      if (inja::string_view::starts_with(open_str, config.expression_open)) {
        if (inja::string_view::starts_with(open_str, config.expression_open_force_lstrip)) {
          state = State::ExpressionStartForceLstrip;
          must_lstrip = true;
        } else {
          state = State::ExpressionStart;
        }
      } else if (inja::string_view::starts_with(open_str, config.statement_open)) {
        if (inja::string_view::starts_with(open_str, config.statement_open_no_lstrip)) {
          state = State::StatementStartNoLstrip;
        } else if (inja::string_view::starts_with(open_str, config.statement_open_force_lstrip)) {
          state = State::StatementStartForceLstrip;
          must_lstrip = true;
        } else {
          state = State::StatementStart;
          must_lstrip = config.lstrip_blocks;
        }
      } else if (inja::string_view::starts_with(open_str, config.comment_open)) {
        if (inja::string_view::starts_with(open_str, config.comment_open_force_lstrip)) {
          state = State::CommentStartForceLstrip;
          must_lstrip = true;
        } else {
          state = State::CommentStart;
          must_lstrip = config.lstrip_blocks;
        }
      } else if ((pos == 0 || m_in[pos - 1] == '\n') && inja::string_view::starts_with(open_str, config.line_statement)) {
        state = State::LineStart;
      } else {
        pos += 1; // wasn't actually an opening sequence
        goto again;
      }

      std::string_view text = string_view::slice(m_in, tok_start, pos);
      if (must_lstrip) {
        text = clear_final_line_if_whitespace(text);
      }

      if (text.empty()) {
        goto again; // don't generate empty token
      }
      return Token(Token::Kind::Text, text);
    }
    case State::ExpressionStart: {
      state = State::ExpressionBody;
      pos += config.expression_open.size();
      return make_token(Token::Kind::ExpressionOpen);
    }
    case State::ExpressionStartForceLstrip: {
      state = State::ExpressionBody;
      pos += config.expression_open_force_lstrip.size();
      return make_token(Token::Kind::ExpressionOpen);
    }
    case State::LineStart: {
      state = State::LineBody;
      pos += config.line_statement.size();
      return make_token(Token::Kind::LineStatementOpen);
    }
    case State::StatementStart: {
      state = State::StatementBody;
      pos += config.statement_open.size();
      return make_token(Token::Kind::StatementOpen);
    }
    case State::StatementStartNoLstrip: {
      state = State::StatementBody;
      pos += config.statement_open_no_lstrip.size();
      return make_token(Token::Kind::StatementOpen);
    }
    case State::StatementStartForceLstrip: {
      state = State::StatementBody;
      pos += config.statement_open_force_lstrip.size();
      return make_token(Token::Kind::StatementOpen);
    }
    case State::CommentStart: {
      state = State::CommentBody;
      pos += config.comment_open.size();
      return make_token(Token::Kind::CommentOpen);
    }
    case State::CommentStartForceLstrip: {
      state = State::CommentBody;
      pos += config.comment_open_force_lstrip.size();
      return make_token(Token::Kind::CommentOpen);
    }
    case State::ExpressionBody:
      return scan_body(config.expression_close, Token::Kind::ExpressionClose, config.expression_close_force_rstrip);
    case State::LineBody:
      return scan_body("\n", Token::Kind::LineStatementClose);
    case State::StatementBody:
      return scan_body(config.statement_close, Token::Kind::StatementClose, config.statement_close_force_rstrip, config.trim_blocks);
    case State::CommentBody: {
      // fast-scan to comment close
      const size_t end = m_in.substr(pos).find(config.comment_close);
      if (end == std::string_view::npos) {
        pos = m_in.size();
        return make_token(Token::Kind::Eof);
      }

      // Check for trim pattern
      const bool must_rstrip = inja::string_view::starts_with(m_in.substr(pos + end - 1), config.comment_close_force_rstrip);

      // return the entire comment in the close token
      state = State::Text;
      pos += end + config.comment_close.size();
      Token tok = make_token(Token::Kind::CommentClose);

      if (must_rstrip || config.trim_blocks) {
        skip_whitespaces_and_first_newline();
      }
      return tok;
    }
    }
  }

  const LexerConfig& get_config() const {
    return config;
  }
};

} // namespace inja

#endif // INCLUDE_INJA_LEXER_HPP_
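The snippet below is a minimal usage sketch, not part of lexer.hpp: it assumes the header-only distribution is reachable as <inja/inja.hpp> (which pulls in this lexer together with config.hpp and token.hpp), a default-constructed LexerConfig, and the kind and text members of Token as declared in token.hpp. The template string "Hello {{ name }}!" is just an illustrative input.

#include <iostream>

#include <inja/inja.hpp>

int main() {
  inja::LexerConfig config; // default delimiters: {{ }}, {% %}, {# #} and ## line statements
  inja::Lexer lexer(config);

  lexer.start("Hello {{ name }}!"); // hand the template source to the lexer
  for (;;) {
    const inja::Token token = lexer.scan(); // one token per call
    if (token.kind == inja::Token::Kind::Eof) {
      break;
    }
    std::cout << "token: '" << token.text << "'\n";
  }
  return 0;
}

For the sample input this prints the text "Hello ", the expression open marker, the identifier name, the expression close marker and the trailing "!" before scan() returns Eof.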