;;; esxml-query.el --- select esxml nodes jQuery-style
;; Copyright (C) 2017 Vasilij Schneidermann <mail@vasilij.de>
;; Author: Vasilij Schneidermann <mail@vasilij.de>
;; Maintainer: Vasilij Schneidermann
;; Version: 0.1.1
;; Keywords: data, lisp
;; Package-Requires: ((cl-lib "0.1"))
;;
;; This program is free software; you can redistribute it and/or
;; modify it under the terms of the GNU General Public License as
;; published by the Free Software Foundation, either version 3 of the
;; License, or (at your option) any later version.
;; This program is distributed in the hope that it will be useful,
;; but WITHOUT ANY WARRANTY; without even the implied warranty of
;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;; GNU General Public License for more details.
;; You should have received a copy of the GNU General Public License
;; along with this program. If not, see <http://www.gnu.org/licenses/>.
;;; Commentary:
;; Traditionally people pick one of the following options when faced
;; with the task of extracting data from XML in Emacs Lisp:
;;
;; - Using regular expressions on the unparsed document
;; - Manual tree traversal with `assoc', `car' and `cdr'
;;
;; Browsers faced a similar problem until jQuery happened; shortly
;; afterwards they started providing the `node.querySelector' and
;; `node.querySelectorAll' API for retrieving one or all nodes
;; matching a given CSS selector. This code implements the same API
;; with the `esxml-query' and `esxml-query-all' functions. The
;; following table summarizes the currently supported modifiers and
;; combinators:
;;
;; | Name                               | Supported? | Syntax      |
;; |------------------------------------+------------+-------------|
;; | Namespaces                         | No         | foo|bar     |
;; | Commas                             | Yes        | foo, bar    |
;; | Descendant combinator              | Yes        | foo bar     |
;; | Child combinator                   | Yes        | foo>bar     |
;; | Adjacent sibling combinator        | No         | foo+bar     |
;; | General sibling combinator         | No         | foo~bar     |
;; | Universal selector                 | Yes        | *           |
;; | Type selector                      | Yes        | tag         |
;; | ID selector                        | Yes        | #foo        |
;; | Class selector                     | Yes        | .foo        |
;; | Attribute selector                 | Yes        | [foo]       |
;; | Exact match attribute selector     | Yes        | [foo=bar]   |
;; | Prefix match attribute selector    | Yes        | [foo^=bar]  |
;; | Suffix match attribute selector    | Yes        | [foo$=bar]  |
;; | Substring match attribute selector | Yes        | [foo*=bar]  |
;; | Include match attribute selector   | Yes        | [foo~=bar]  |
;; | Dash match attribute selector      | Yes        | [foo|=bar]  |
;; | Attribute selector modifiers       | No         | [foo=bar i] |
;; | Pseudo elements                    | No         | ::foo       |
;; | Pseudo classes                     | No         | :foo        |
;;; Code:
;;; CSS selector parsing
;; https://www.w3.org/TR/selectors/#w3cselgrammar
;; https://www.w3.org/TR/selectors4/#grammar
;; https://www.w3.org/TR/2003/WD-css3-syntax-20030813/#detailed-grammar
;; https://www.w3.org/TR/2003/WD-css3-syntax-20030813/#tokenization
;; You might be wondering why I'm using both the level 3 and level 4
;; standards: the level 3 one has a buggy lexer section, whereas level
;; 4 omits crucial parser definitions, so both have to be used...
;; TODO: support :not
;; The alternative is creating a mutable object with peek/next methods
;; and passing it around, so I chose the option requiring less typing:
;; a dynamically bound variable :<
;; TODO: support :not
;; css-selector:
;; css-selector-list;
;; css-selector-list:
;; complex-css-selector [ comma whitespace* complex-css-selector ]*;
;; complex-css-selector:
;; compound-css-selector [ css-combinator compound-css-selector ]* whitespace*;
;; css-combinator:
;; whitespace+ | whitespace* [ '>' | '+' | '~' ] whitespace*;
;; compound-css-selector:
;; css-type-selector css-modifier* | css-modifier+;
;; css-type-selector:
;; IDENT | '*';
;; css-modifier:
;; css-id | css-class | css-attrib | css-pseudo;
;; css-id:
;; HASH;
;; css-class:
;; '.' IDENT;
;; css-attrib:
;; '[' whitespace* css-attrib-name ']'
;; | '[' whitespace* css-attrib-name css-attrib-match css-attrib-value whitespace* ']';
;; css-attrib-name:
;; IDENT whitespace*;
;; css-attrib-match:
;; [ '=' | PREFIX-MATCH | SUFFIX-MATCH | SUBSTRING-MATCH | INCLUDE-MATCH | DASH-MATCH ] whitespace*;
;; css-attrib-value:
;; IDENT | STRING;
;; css-pseudo:
;; ':' ':'? [ IDENT | css-functional-pseudo ];
;; css-functional-pseudo:
;; FUNCTION whitespace* [ css-expression whitespace* ]+ ')';
;; css-expression:
;; '+' | '-' | DIMENSION | NUMBER | STRING | IDENT;
;;; tree traversal
;; TODO: these helpers should be part of esxml.el
;;; querying
;; NOTE: supporting structural pseudo functions, direct siblings and
;; indirect siblings requires breadth-first instead of depth-first
;; traversal, something that could be emulated without zippers if you
;; had the parent of the node (and the position of the child)...
;;; esxml-query.el ends here