I have two problems with the following SQL grammar:
#define BOOST_SPIRIT_QI_DEBUG
#include <boost/spirit/include/qi.hpp>
#include <boost/spirit/include/lex_lexertl.hpp>
#include <boost/spirit/include/phoenix.hpp>
#include <boost/spirit/include/karma.hpp>
#include <boost/fusion/include/adapt_struct.hpp>
#include <boost/fusion/include/std_pair.hpp>
#include <boost/algorithm/string.hpp>
#include <boost/shared_ptr.hpp>
#include <boost/make_shared.hpp>
#include <boost/lexical_cast.hpp>
#include <iostream>
#include <fstream>
#include <sstream>
#include <string>
#include <set>
#include <utility>
namespace bs = boost::spirit;
namespace lex = boost::spirit::lex;
namespace qi = boost::spirit::qi;
namespace phx = boost::phoenix;
// Token definition base, defines all tokens for the base grammar below
template <typename Lexer>
struct sql_tokens : lex::lexer<Lexer>
{
public:
    // Tokens with no attributes.
    lex::token_def<lex::omit> type_smallint, type_int, type_varchar, type_text, type_date;
    lex::token_def<lex::omit> kw_not_null, kw_auto_increment, kw_unique, kw_default, kw_create,
                              kw_table, kw_constraint, kw_primary_key;
    // Attributed tokens. (If you add a new type, don't forget to add it to the
    // lex::lexertl::token definition too.)
    lex::token_def<int>         signed_digit;
    lex::token_def<std::size_t> unsigned_digit;
    lex::token_def<std::string> identifier;
    lex::token_def<std::string> quoted_string;
    sql_tokens()
    {
        // Column data types.
        type_smallint = "(?i:smallint)";
        type_int      = "(?i:int)";
        type_varchar  = "(?i:varchar)";
        type_text     = "(?i:text)";
        type_date     = "(?i:date)";

        // Keywords.
        kw_not_null       = "(?i:not +null)";
        kw_auto_increment = "(?i:auto_increment)";
        kw_unique         = "(?i:unique)";
        kw_default        = "(?i:default)";
        kw_create         = "(?i:create)";
        kw_table          = "(?i:table)";
        kw_constraint     = "(?i:constraint)";
        kw_primary_key    = "(?i:primary +key)";

        // Values.
        signed_digit   = "[+-]?[0-9]+";
        unsigned_digit = "[0-9]+";
        quoted_string  = "\\\"(\\\\.|[^\\\"])*\\\""; // \"(\\.|[^\"])*\"

        // Identifier.
        identifier = "[a-zA-Z][a-zA-Z0-9_]*";

        // Tokens must be added in priority order.
        this->self += lex::token_def<>('(') | ')' | ',' | ';';
        this->self += type_smallint | type_int | type_varchar | type_text | type_date;
        this->self += kw_not_null | kw_auto_increment | kw_unique | kw_default |
                      kw_create | kw_table | kw_constraint | kw_primary_key;
        this->self += identifier | unsigned_digit | signed_digit | quoted_string;

        // Define the whitespace and comments to ignore, in a separate "WS" lexer state.
        this->self("WS")
            = lex::token_def<>("[ \\t\\n]+")
            | "--[^\\n]*\\n"                        // single-line comments with --
            | "\\/\\*[^*]*\\*+([^/*][^*]*\\*+)*\\/" // C-style comments
            ;
    }
};
// Grammar definition; defines a small part of the SQL language.
template <typename Iterator, typename Lexer>
struct sql_grammar
    : qi::grammar<Iterator, qi::in_state_skipper<Lexer> >
{
    template <typename TokenDef>
    sql_grammar(TokenDef const& tok)
        : sql_grammar::base_type(program, "program")
    {
        program
            = (statement % ';') >> *qi::lit(';')
            ;
        statement
            = create_statement.alias()
            ;
        create_statement
            = tok.kw_create >> create_table
            ;
        create_table
            = tok.kw_table >> tok.identifier
                >> '(' >> create_table_columns >> -(',' >> table_constraints) >> ')'
            ;
        table_constraints
            = constraint_definition % ','
            ;
        constraint_definition
            = tok.kw_constraint >> tok.identifier >> primary_key_constraint
            ;
        primary_key_constraint
            = tok.kw_primary_key >> '(' >> (tok.identifier % ',') >> ')'
            ;
        create_table_columns
            = column_definition % ','
            ;
        column_definition
            = tok.identifier >> column_type >> *type_constraint
            ;
        type_constraint
            = tok.kw_not_null
            | tok.kw_auto_increment
            | tok.kw_unique
            | default_value
            ;
        default_value
            = tok.kw_default > tok.quoted_string
            ;
        column_type
            = tok.type_smallint
            | tok.type_int
            | (tok.type_varchar > '(' > tok.unsigned_digit > ')')
            | tok.type_text
            | tok.type_date
            ;

        program.name("program");
        statement.name("statement");
        create_statement.name("create statement");
        create_table.name("create table");
        create_table_columns.name("create table columns");
        column_definition.name("column definition");
        column_type.name("column type");
        default_value.name("default value");
        type_constraint.name("type constraint");
        table_constraints.name("table constraints");
        constraint_definition.name("constraint definition");
        primary_key_constraint.name("primary key constraint");

        BOOST_SPIRIT_DEBUG_NODE(program);
        BOOST_SPIRIT_DEBUG_NODE(statement);
        BOOST_SPIRIT_DEBUG_NODE(create_statement);
        BOOST_SPIRIT_DEBUG_NODE(create_table);
        BOOST_SPIRIT_DEBUG_NODE(create_table_columns);
        BOOST_SPIRIT_DEBUG_NODE(column_definition);
        BOOST_SPIRIT_DEBUG_NODE(column_type);
        BOOST_SPIRIT_DEBUG_NODE(default_value);
        BOOST_SPIRIT_DEBUG_NODE(type_constraint);
        BOOST_SPIRIT_DEBUG_NODE(table_constraints);
        BOOST_SPIRIT_DEBUG_NODE(constraint_definition);
        BOOST_SPIRIT_DEBUG_NODE(primary_key_constraint);

        using namespace qi::labels;
        qi::on_error<qi::fail>
        (
            program,
            std::cout
                << phx::val("Error! Expecting ")
                << bs::_4                                      // what failed?
                << phx::val(" here: \"")
                << phx::construct<std::string>(bs::_3, bs::_2) // iterators to error-pos, end
                << phx::val("\"")
                << std::endl
        );
    }

private:
    typedef qi::in_state_skipper<Lexer> skipper_type;
    typedef qi::rule<Iterator, skipper_type> simple_rule;

    simple_rule program, statement, create_statement, create_table,
                table_constraints, constraint_definition;
    simple_rule primary_key_constraint, create_table_columns, column_definition,
                type_constraint, default_value, column_type;
};
std::string file2string(const std::string& filename)
{
    std::ifstream s(filename.c_str(), std::ios_base::binary);
    std::stringstream ss;
    ss << s.rdbuf();
    return ss.str();
}
int main(int argc, char* argv[])
{
    if (argc != 2)
    {
        std::cerr << "usage: " << argv[0] << " schema_filename\n";
        return 1;
    }

    // Iterator type used to expose the underlying input stream.
    typedef std::string::iterator base_iterator_type;

    // This is the lexer token type to use.
    typedef lex::lexertl::token<
        base_iterator_type, boost::mpl::vector<int, std::size_t, std::string>
    > token_type;

    // Here we use the lexertl-based lexer engine.
    typedef lex::lexertl::lexer<token_type> lexer_type;

    // This is the token definition type (derived from the given lexer type).
    typedef sql_tokens<lexer_type> sql_tokens;

    // This is the iterator type exposed by the lexer.
    typedef sql_tokens::iterator_type iterator_type;

    // This is the type of the grammar to parse.
    typedef sql_grammar<iterator_type, sql_tokens::lexer_def> sql_grammar;

    // Now we use the types defined above to create the lexer and grammar
    // object instances needed to invoke the parsing process.
    sql_tokens tokens;       // Our lexer
    sql_grammar sql(tokens); // Our parser

    std::string str(file2string(argv[1]));

    // At this point we generate the iterator pair used to expose the
    // tokenized input stream.
    base_iterator_type it = str.begin();
    iterator_type iter = tokens.begin(it, str.end());
    iterator_type end = tokens.end();

    // Parsing is done based on the token stream, not the character
    // stream read from the input.
    // Note how we use the lexer defined above as the skip parser. It must
    // be explicitly wrapped inside a state directive, switching the lexer
    // state for the duration of skipping whitespace.
    std::string ws("WS");
    bool r = qi::phrase_parse(iter, end, sql, qi::in_state(ws)[tokens.self]);

    if (r && iter == end)
    {
        std::cout << "-------------------------\n";
        std::cout << "Parsing succeeded\n";
        std::cout << "-------------------------\n";
    }
    else
    {
        std::cout << "-------------------------\n";
        std::cout << "Parsing failed\n";
        std::cout << "-------------------------\n";
    }
    return 0;
}
Problem 1: Starting with comments
When the file starts with a comment, parsing immediately fails:
/* bouh */
CREATE TABLE mytable (
    id int NOT NULL AUTO_INCREMENT
);
with this failing debug tree:
<program>
  <try>[/]</try>
  <statement>
    <try>[/]</try>
    <create_statement>
      <try>[/]</try>
      <fail/>
    </create_statement>
    <fail/>
  </statement>
  <fail/>
</program>
But if I add a newline just before it, it works. Both types of comments ("--" and "/* */") fail.
Problem 2: Keyword unique not recognized
Parsing fails under one very specific condition involving the keyword unique: it does not work when UNIQUE is in upper case and directly followed by a comma.
All the following cases succeed:
-- Success
CREATE TABLE Addon (
    id int NOT NULL AUTO_INCREMENT,
    u smallint NOT NULL UNIQUE
);
-- Success
CREATE TABLE Addon (
    id int NOT NULL AUTO_INCREMENT,
    u smallint NOT NULL unique,
    s int NOT NULL UNIQUE
);
-- Success
CREATE TABLE Addon (
    id int NOT NULL AUTO_INCREMENT,
    u smallint NOT NULL UNIQUE ,
    s int NOT NULL UNIQUE
);
-- Success
CREATE TABLE Addon (
    id int NOT NULL AUTO_INCREMENT,
    u smallint UNIQUE NOT NULL,
    s int NOT NULL UNIQUE
);
But this one doesn't:
-- Fail
CREATE TABLE Addon (
    id int NOT NULL AUTO_INCREMENT,
    u smallint NOT NULL UNIQUE,
    s int NOT NULL
);
Do you have any idea what is wrong? Thanks!
Regarding the whitespace skipping, I can only conclude that pre-skipping is not being done initially (perhaps the lexer state is not switched correctly).
Of course, you could try to remedy this using the lex::tokenize_and_parse API, passing the initial state as "WS". (Correction: I misremembered the API; you can only do this with manual tokenization, which precludes the state switching by Qi in the first place.)
However, what I tend to do is make skipping the responsibility of the lexer:
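A minimal sketch of that idea, assuming the lexer base is switched to lex::lexertl::actor_lexer<token_type> (token semantic actions require an actor lexer; the skipper token name is mine): the whitespace and comment patterns move into the default lexer state and each match is discarded with pass_ignore, so it never reaches the parser.

// In sql_tokens: one token for everything to skip, added to the default state.
lex::token_def<lex::omit> skipper;

// In the constructor, after the other tokens:
skipper = "[ \\t\\n]+|--[^\\n]*\\n|\\/\\*[^*]*\\*+([^/*][^*]*\\*+)*\\/";
this->self += skipper [ lex::_pass = lex::pass_flags::pass_ignore ];

The grammar and its rules then drop the in_state_skipper template argument (plain qi::grammar<Iterator> and qi::rule<Iterator>).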
Now there is no need to use a skipper at all, and this succeeds in parsing the first problem (starting with a comment).
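Since there is no skipper, main() would then call qi::parse rather than qi::phrase_parse (again a sketch under the same actor_lexer assumption):

// No skipper argument; whitespace and comments were already discarded
// by the lexer via pass_ignore.
bool r = qi::parse(iter, end, sql);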
Full code: Live On Coliru
Look for #ifdef STATE_WS in that code.
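The STATE_WS toggle presumably selects between the two skipping strategies, roughly like this (a hypothetical reconstruction, not the verbatim Coliru source):

#ifdef STATE_WS
    // Original approach: skip tokens live in the "WS" lexer state and are
    // consumed via qi::in_state("WS")[tokens.self] passed to phrase_parse.
    this->self("WS")
        = lex::token_def<>("[ \\t\\n]+")
        | "--[^\\n]*\\n"
        | "\\/\\*[^*]*\\*+([^/*][^*]*\\*+)*\\/"
        ;
#else
    // Lexer-side skipping: same patterns in the default state, marked
    // pass_ignore so the parser never sees them.
    this->self += skipper [ lex::_pass = lex::pass_flags::pass_ignore ];
#endif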