/********************************************************************** * xpath.c - A simple XPath parser * Version: 0.3.3 * Date: 2008/1/11 * Author: Kazuhito Hagio * Syntax: Path = Step | Path Step * Step = Axis NodeTest | Axis NodeTest "[" Predicate "]" * Axis = "/" AXIS_NAME "::" | "/" | "//" | "/@" | "//@" * NodeTest = NODE_LABEL | "*" * Predicate = Expr | Expr "and" Predicate * Expr = Path | Path Operator Literal * Operator = "=" | "!=" | "<" | "<=" | ">" | ">=" * Literal = '"' STRING '"' | "'" STRING "'" * WARNING: (1) Do not write any spaces on both sides of the operators * except for "and". (e.g. /root[hoge="literal"]) * (2) "/" is unnecessary after "[". (e.g. /root[@attr]) * (3) NODE_LABEL consists of "[0-9A-Za-z_-]", but must * starts with "[0-9A-Za-z]" only. * TODO: Support the other operators and functions... * Support "." and ".." abbreviated expressions **********************************************************************/ #include "xpath.h" #include #include #include #include #define S_PUSH(c) (par_stack[++par_pos] = (c)) #define S_POP() if (--par_pos < 0) return NULL #define S_REPLACE(c) (par_stack[par_pos] = (c)) #define S_TOP() (par_stack[par_pos]) #define S_PARENT() (par_stack[par_pos-1]) #define QBUFSIZE 64 #define INIT_BUF() memset(buffer, 0, QBUFSIZE); bptr = buffer #define OPHEAD(c) ((c)=='='||(c)=='!'||(c)=='<'||(c)=='>') //||(c)=='+'||(c)=='-'||(c)=='*'||(c)=='|' #define TAGHEAD(c) (isalnum(c)||(c)=='*'||(c)=='_') static int Number_of_QNodes; static int ErrPos; // for debug #define AXISNUM 13 const char *AxisTable[AXISNUM] = { "ancestor", "ancestor-or-self", "attribute", "child", "descendant", "descendant-or-self", "following", "following-sibling", "namespace", "parent", "preceding", "preceding-sibling", "self" }; enum axes check_axis(const char *axis) { int i; for (i = 0; i < AXISNUM; i++) { if (strcmp(axis, AxisTable[i]) == 0) { return i; } } return -1; // Not found } #define OPNUM 7 const char *Operators[OPNUM] = { "None", //"and", "or", "mod", "div", "=", "!=", "<", "<=", ">", ">=" // 1-6 //"+", "-", "*", "|" }; // TODO: Support the other operators... int check_operator(const char *op) { int i; for (i = 1; i < OPNUM; i++) { if (strcmp(Operators[i], op) == 0) { return i; } } return 0; // Not found } QNode *make_node(QNode *parent) { QNode *new; new = (QNode *) malloc(sizeof(QNode)); new->id = Number_of_QNodes++; new->parent = parent; new->axis = CHILD; new->label = NULL; new->num = 0; new->result = 0; memset(new->children, 0, MAX_CHILDREN); new->op = 0; new->literal = NULL; // connect from the parent if (parent != NULL) { parent->children[parent->num] = new; parent->num++; } return new; } QNode *parse_xpath(const char *query) { char c = 0; char b = 0; int state = 0; const char *qptr = query; QNode *new = NULL; QNode *root = NULL; char quote = 0; //enum axes temp_axis = 0; char *bptr, buffer[QBUFSIZE]; // parents stack QNode *par_stack[MAX_QNODES]; int par_pos = 0; // initialize ErrPos = 0; Number_of_QNodes = 0; INIT_BUF(); par_stack[par_pos] = NULL; while ((c = *qptr) != '\0') { switch (state) { case 0: // initial state if (c == '/') { // MUST new = make_node(S_TOP()); S_REPLACE(new); root = new; state = 1; } else { return NULL; } break; case 1: // have read '/' if (TAGHEAD(c)) { *bptr++ = c; state = 3; } else if (c == '/') { new->axis = DESCENDANT; state = 2; } else if (c == '@') { //new->axis = ATTRIBUTE; *bptr++ = c; state = 2; } else { return NULL; } break; case 2: // have read "//", "/@" or "@" if (TAGHEAD(c)) { *bptr++ = c; state = 3; } else if (c == '@' && b == '/') { *bptr++ = c; state = 2; } else { return NULL; } break; case 3: // have read alnum(s) or one '*' if (TAGHEAD(c) || c == '-') { if (b == '*') return NULL; *bptr++ = c; } else if (c == ':') { if (new->axis == CHILD || buffer[0] != '@') { if (check_axis(buffer) == ATTRIBUTE) { INIT_BUF(); *bptr++ = '@'; state = 9; } else { return NULL; } } else { return NULL; } } else { // set node label new->label = strdup(buffer); INIT_BUF(); if (c == '/') { new = make_node(S_TOP()); S_REPLACE(new); state = 1; } else if (c == '[') { // make a new node and push stack new = make_node(S_TOP()); S_PUSH(new); state = 4; } else if (c == ']') { // pop stack S_POP(); state = 5; } else if (par_pos > 0) { // in predicates if (isspace(c)) { state = 6; } else if (OPHEAD(c)) { *bptr++ = c; state = 11; } else { return NULL; } } else { return NULL; } } break; case 4: // have read '[' if (TAGHEAD(c)) { *bptr++ = c; state = 3; } else if (c == '/') { state = 1; } else if (c == '@') { //new->axis = ATTRIBUTE; *bptr++ = c; state = 2; } else { return NULL; } break; case 5: // have read ']' if (c == '/') { new = make_node(S_TOP()); S_REPLACE(new); state = 1; } else if (c == ']') { S_POP(); state = 5; } else if (c == '[') { // consecutive predicates new = make_node(S_TOP()); S_PUSH(new); state = 4; } else if (isspace(c) && par_pos > 0) { state = 6; } else { return NULL; } break; case 6: // have read one space or more if (isspace(c)) { ; // do nothing } else { *bptr++ = c; state = 7; } break; case 7: // reading the operator "and" if (isspace(c)) { if (strcmp(buffer, "and") == 0) { INIT_BUF(); state = 8; } else { return NULL; } } else { *bptr++ = c; } break; case 8: // have read "and" and space(s) if (isspace(c)) { ; } else { new = make_node(S_PARENT()); S_REPLACE(new); if (TAGHEAD(c)) { *bptr++ = c; state = 3; } else if (c == '/') { state = 1; } else if (c == '@') { //new->axis = ATTRIBUTE; *bptr++ = c; state = 2; } else { return NULL; } } break; case 9: // have read one ':' if (c == ':') { state = 10; } else { return NULL; } break; case 10: // have read "::" if (TAGHEAD(c)) { *bptr++ = c; state = 3; } else { return NULL; } break; case 11: // reading non-spaced operators if (c == '"' || c == '\'') { if ((new->op = check_operator(buffer)) > 0) { INIT_BUF(); quote = c; state = 12; } else { return NULL; } } else { *bptr++ = c; } break; case 12: // reading a literal if (c == quote) { new->literal = strdup(buffer); INIT_BUF(); state = 13; } else if (c == '\\') { state = 14; } else { *bptr++ = c; } break; case 13: // have read a literal if (isspace(c)) { state = 6; } else if (c == ']') { S_POP(); state = 5; } else { return NULL; } break; case 14: // reading a escape sequence *bptr++ = c; state = 12; break; default: return NULL; } // previous char b = c; qptr++; // for debug ErrPos++; } /* final state */ if (state == 3) { // set node label and result flag new->label = strdup(buffer); new->result = 1; return root; } else if (state == 5 && par_pos == 0) { // set result flag S_TOP()->result = 1; return root; } return NULL; } void free_tree(QNode *node) { int i; if (node != NULL) { for (i = 0; i < node->num; i++) { free_tree(node->children[i]); } if (node->label != NULL) free(node->label); if (node->literal != NULL) free(node->literal); free(node); } } void print_tree_recur(QNode *node, int depth) { int i; if (node != NULL) { for (i = 0; i < depth * 4; i++) putchar(' '); printf("id:%d label:%s axis:%s num:%d op:%s liter:%s", \ node->id, node->label, AxisTable[node->axis], \ node->num, Operators[node->op], node->literal); if (node->result) printf(" ***RESULT***"); printf("\n"); for (i = 0; i < node->num; i++) { print_tree_recur(node->children[i], depth+1); } } } void print_tree(QNode *node) { print_tree_recur(node, 0); } const char *enum2axis(enum axes axis) { return AxisTable[axis]; } #ifdef _STAND_ALONE int main(int argc, char *argv[]) { QNode *root; int i; if (argc < 2) { fprintf(stderr, "Usage: xpath 'XPath_Query'\n"); return 1; } printf("%s\n", argv[1]); if ((root = parse_xpath(argv[1])) != NULL) { printf("Syntax OK\n"); print_tree(root); } else { for (i = 0; i < ErrPos; i++) putchar(' '); printf("^ here\n"); printf("Syntax Error\n"); } free_tree(root); return 0; } #endif