;;; -*- lexical-binding: t -*- ;;; published under CC0 into the public domain ;;; author: philip k. [https://zge.us.to], 2019 (require 'cl-lib) (defun earley-tokenize (tokenizer) "Process buffer by sequentially collecting all string that match the regular expression `tokenizer'. The regular expression MUST contain at at least one group, since only what matches the first group will be added to the list, while the rest is ignored. There has to exist at least one possibility to cover the ENTIRE possibility with this regular expression, otherwise an error is signalled. All properties from the buffer will be removed." (let (tokens) (save-excursion (goto-char (point-min)) (while (not (eobp)) (save-match-data (unless (looking-at tokenizer) (error "Found unexpected token at %d:%d" (line-number-at-pos) (- (point) (line-beginning-position)))) (when (match-beginning 1) (push (match-string-no-properties 1) tokens)) (goto-char (match-end 0))))) (nreverse tokens))) (defvar earley--grammar nil) (defvar earley--state nil) (cl-defstruct earley--item (position 0) (progress 0) (rule (error "no rule") :read-only t) completing parse-tree) (defun earley--equiv (it1 it2) (and (eq (earley--item-position it1) (earley--item-position it2)) (eq (earley--item-progress it1) (earley--item-progress it2)) (eq (earley--item-rule it1) (earley--item-rule it2)))) (defun earley--finished-p (item) (<= (length (earley--item-rule item)) (earley--item-progress item))) (defun earley--at-item (item) (unless (earley--finished-p item) (aref (earley--item-rule item) (earley--item-progress item)))) (defun earley--at-non-terminal-p (item) (not (stringp (earley--at-item item)))) (defun earley--create-parse-tree (item tokens) (cons (earley--item-completing item) (if (earley--item-parse-tree item) (mapcar (lambda (i) (earley--create-parse-tree i tokens)) (reverse (earley--item-parse-tree item))) (cl-subseq tokens (earley--item-position item) (+ (earley--item-position item) (earley--item-progress item)))))) (defun earley--completor (old-item) (cl-loop with rule-name = (earley--item-completing old-item) for item in (aref earley--state (earley--item-position old-item)) when (eq (earley--at-item item) rule-name) collect (let ((new-item (copy-earley--item item))) (cl-incf (earley--item-progress new-item)) (push old-item (earley--item-parse-tree new-item)) new-item))) (defun earley--predictor (item k) (cl-loop with st-name = (earley--at-item item) for rule across earley--grammar when (eq (car rule) st-name) collect (make-earley--item :position k :completing st-name :rule (cdr rule)))) (defun earley--scanner (k tok) (cl-loop for item in (aref earley--state k) for at = (earley--at-item item) when (stringp at) when (string-match-p (concat "^\\(" at "\\)$") tok) collect (let ((new-item (copy-earley--item item))) (cl-incf (earley--item-progress new-item)) new-item))) (defun earley-parse (tokens grammar &optional start) "Process list of string-tokens `tokens' by using context-free grammar `grammar' with the start-symbol `start'. `Grammar' has to be a vector of rules. Each rule is a cons cell of a symbol as a non-terminal identifier and a vector describing the rule. This vector may consist of other symbols, standing in for non-terminals and strings that stand in for terminals. Each string is taken to be a regular expression, to make things easier. If `start' is not passed, it is assumed to be the symbol 'start itself. `earley-parse' returns a parse tree, consisting of cons cells, where the car is a non-terminal symbol, and the cdr a list of children it took to resolve the non-terminal. If a rule had no children, it instead returns the a list of the strings of the tokens it resolved." (let ((init (make-symbol "*init*")) (earley--grammar grammar) (earley--state (make-vector (1+ (length tokens)) nil))) (aset earley--state 0 (list (make-earley--item :rule (vector (or start 'start)) :completing init))) (cl-loop with item = nil for tok in (append tokens '(nil)) for k upfrom 0 for current-state = (copy-sequence (aref earley--state k)) always current-state do (while (setq item (pop current-state)) (cond ((earley--finished-p item) (dolist (new (earley--completor item)) (unless (cl-member new (aref earley--state k) :test #'earley--equiv) (push new (aref earley--state k)) (push new current-state)))) ((earley--at-non-terminal-p item) (dolist (new (earley--predictor item k)) (unless (cl-member new (aref earley--state k) :test #'earley--equiv) (push new (aref earley--state k)) (push new current-state)))) (tok (dolist (new (earley--scanner k tok)) (cl-pushnew new (aref earley--state (1+ k)) :test #'earley--equiv)))))) (cl-loop for item in (aref earley--state (1- (length earley--state))) when (eq (earley--item-completing item) init) when (= (earley--item-position item) 0) when (= (earley--item-progress item) (length (earley--item-rule item))) return (cadr (earley--create-parse-tree item tokens))))) (provide 'earley)