;;;
;;; This code was written by:
;;;
;;;    Lawrence E. Freil <lef@freil.com>
;;;    National Science Center Foundation
;;;    Augusta, Georgia 30909
;;;
;;; This program was released into the public domain on 2005-08-31.
;;; (See the slime-devel mailing list archive for details.)
;;;
;;; nregex.lisp - My 4/8/92 attempt at a Lisp based regular expression
;;;               parser. 
;;;
;;;               This regular expression parser operates by taking a
;;;               regular expression and breaking it down into a list
;;;               consisting of lisp expressions and flags.  The list
;;;               of lisp expressions is then taken in turned into a
;;;               lambda expression that can be later applied to a
;;;               string argument for parsing.
;;;;
;;;; Modifications made 6 March 2001 By Chris Double (chris@double.co.nz)
;;;; to get working with Corman Lisp 1.42, add package statement and export
;;;; relevant functions.
;;;;

(in-package :cl-user)

;; Renamed to slime-nregex avoid name clashes with other versions of
;; this file. -- he

;;;; CND - 6/3/2001
(defpackage slime-nregex
  (:use #:common-lisp)
  (:export 
   #:regex
   #:regex-compile
  ))

;;;; CND - 6/3/2001
(in-package :slime-nregex)

;;;
;;; First we create a copy of macros to help debug the beast
(eval-when (:compile-toplevel :load-toplevel :execute)
(defvar *regex-debug* nil)		; Set to nil for no debugging code
)

(defmacro info (message &rest args)
  (if *regex-debug*
      `(format *standard-output* ,message ,@args)))

;;;
;;; Declare the global variables for storing the paren index list.
;;;
(defvar *regex-groups* (make-array 10))
(defvar *regex-groupings* 0)

;;;
;;; Declare a simple interface for testing.  You probably wouldn't want
;;; to use this interface unless you were just calling this once.
;;;
(defun regex (expression string)
  "Usage: (regex <expression> <string)
   This function will call regex-compile on the expression and then apply
   the string to the returned lambda list."
  (let ((findit (cond ((stringp expression)
		       (regex-compile expression))
		      ((listp expression)
		       expression)))
	(result nil))
    (if (not (funcall (if (functionp findit)
			  findit
			(eval `(function ,findit))) string))
	(return-from regex nil))
    (if (= *regex-groupings* 0)
	(return-from regex t))
    (dotimes (i *regex-groupings*)
      (push (funcall 'subseq 
		     string 
		     (car (aref *regex-groups* i))
		     (cadr (aref *regex-groups* i)))
	    result))
    (reverse result)))

;;;
;;; Declare some simple macros to make the code more readable.
;;;
(defvar *regex-special-chars* "?*+.()[]\\${}")

(defmacro add-exp (list)
  "Add an item to the end of expression"
  `(setf expression (append expression ,list)))

;;;
;;; Define a function that will take a quoted character and return
;;; what the real character should be plus how much of the source
;;; string was used.  If the result is a set of characters, return an
;;; array of bits indicating which characters should be set.  If the
;;; expression is one of the sub-group matches return a
;;; list-expression that will provide the match.  
;;;
(defun regex-quoted (char-string &optional (invert nil))
  "Usage: (regex-quoted <char-string> &optional invert)
       Returns either the quoted character or a simple bit vector of bits set for
       the matching values"
  (let ((first (char char-string 0))
	(result (char char-string 0))
	(used-length 1))
    (cond ((eql first #\n)
	   (setf result #\NewLine))
	  ((eql first #\c)
	   (setf result #\Return))
	  ((eql first #\t)
	   (setf result #\Tab))
	  ((eql first #\d)
	   (setf result #*0000000000000000000000000000000000000000000000001111111111000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000))
	  ((eql first #\D)
	   (setf result #*1111111111111111111111111111111111111111111111110000000000111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111))
	  ((eql first #\w)
	   (setf result #*0000000000000000000000000000000000000000000000001111111111000000011111111111111111111111111000010111111111111111111111111110000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000))
	  ((eql first #\W)
	   (setf result #*1111111111111111111111111111111111111111111111110000000000111111100000000000000000000000000111101000000000000000000000000001111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111))
	  ((eql first #\b)
	   (setf result #*0000000001000000000000000000000011000000000010100000000000100000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000))
	  ((eql first #\B)
	   (setf result #*1111111110111111111111111111111100111111111101011111111111011111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111))
	  ((eql first #\s)
	   (setf result #*0000000001100000000000000000000010000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000))
	  ((eql first #\S)
	   (setf result #*1111111110011111111111111111111101111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111))
	  ((and (>= (char-code first) (char-code #\0))
		(<= (char-code first) (char-code #\9)))
	   (if (and (> (length char-string) 2)
		    (and (>= (char-code (char char-string 1)) (char-code #\0))
			 (<= (char-code (char char-string 1)) (char-code #\9))
			 (>= (char-code (char char-string 2)) (char-code #\0))
			 (<= (char-code (char char-string 2)) (char-code #\9))))
	       ;;
	       ;; It is a single character specified in octal
	       ;;
	       (progn 
		 (setf result (do ((x 0 (1+ x))
				   (return 0))
				  ((= x 2) return)
				(setf return (+ (* return 8)
						(- (char-code (char char-string x))
						   (char-code #\0))))))
		 (setf used-length 3))
	     ;;
	     ;; We have a group number replacement.
	     ;;
	     (let ((group (- (char-code first) (char-code #\0))))
	       (setf result `((let ((nstring (subseq string (car (aref *regex-groups* ,group))
						     (cadr (aref *regex-groups* ,group)))))
				(if (< length (+ index (length nstring)))
				    (return-from compare nil))
				(if (not (string= string nstring
						  :start1 index
						  :end1 (+ index (length nstring))))
				    (return-from compare nil)
				  (incf index (length nstring)))))))))
	  (t 
	   (setf result first)))
    (if (and (vectorp result) invert)
	(bit-xor result #*1111111110011111111111111111111101111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111 t))
    (values result used-length)))

;;;
;;; Now for the main regex compiler routine.
;;;
(defun regex-compile (source &key (anchored nil))
  "Usage: (regex-compile <expression> [ :anchored (t/nil) ])
       This function take a regular expression (supplied as source) and
       compiles this into a lambda list that a string argument can then
       be applied to.  It is also possible to compile this lambda list
       for better performance or to save it as a named function for later
       use"
  (info "Now entering regex-compile with \"~A\"~%" source)
  ;;
  ;; This routine works in two parts.
  ;; The first pass take the regular expression and produces a list of 
  ;; operators and lisp expressions for the entire regular expression.  
  ;; The second pass takes this list and produces the lambda expression.
  (let ((expression '())		; holder for expressions
	(group 1)			; Current group index
	(group-stack nil)		; Stack of current group endings
	(result nil)			; holder for built expression.
	(fast-first nil))		; holder for quick unanchored scan
    ;;
    ;; If the expression was an empty string then it alway
    ;; matches (so lets leave early)
    ;;
    (if (= (length source) 0)
	(return-from regex-compile
		     '(lambda (&rest args)
			(declare (ignore args))
			t)))
    ;;
    ;; If the first character is a caret then set the anchored
    ;; flags and remove if from the expression string.
    ;;
    (cond ((eql (char source 0) #\^)
	   (setf source (subseq source 1))
	   (setf anchored t)))
    ;;
    ;; If the first sequence is .* then also set the anchored flags.
    ;; (This is purely for optimization, it will work without this).
    ;;
    (if (>= (length source) 2)
	(if (string= source ".*" :start1 0 :end1 2)
	    (setf anchored t)))
    ;;
    ;; Also, If this is not an anchored search and the first character is
    ;; a literal, then do a quick scan to see if it is even in the string.
    ;; If not then we can issue a quick nil, 
    ;; otherwise we can start the search at the matching character to skip
    ;; the checks of the non-matching characters anyway.
    ;;
    ;; If I really wanted to speed up this section of code it would be 
    ;; easy to recognize the case of a fairly long multi-character literal
    ;; and generate a Boyer-Moore search for the entire literal. 
    ;;
    ;; I generate the code to do a loop because on CMU Lisp this is about
    ;; twice as fast a calling position.
    ;;
    (if (and (not anchored)
	     (not (position (char source 0) *regex-special-chars*))
	     (not (and (> (length source) 1)
		       (position (char source 1) *regex-special-chars*))))
	(setf fast-first `((if (not (dotimes (i length nil)
				     (if (eql (char string i)
					      ,(char source 0))
					 (return (setf start i)))))
			      (return-from final-return nil)))))
    ;;
    ;; Generate the very first expression to save the starting index
    ;; so that group 0 will be the entire string matched always
    ;;
    (add-exp '((setf (aref *regex-groups* 0)
		     (list index nil))))
    ;;
    ;; Loop over each character in the regular expression building the
    ;; expression list as we go.
    ;;
    (do ((eindex 0 (1+ eindex)))
	((= eindex (length source)))
      (let ((current (char source eindex)))
	(info "Now processing character ~A index = ~A~%" current eindex)
	(case current
	  ((#\.)
	   ;;
	   ;; Generate code for a single wild character
	   ;;
	   (add-exp '((if (>= index length)
			  (return-from compare nil)
			(incf index)))))
	  ((#\$)
	   ;;
	   ;; If this is the last character of the expression then
	   ;; anchor the end of the expression, otherwise let it slide
	   ;; as a standard character (even though it should be quoted).
	   ;;
	   (if (= eindex (1- (length source)))
	       (add-exp '((if (not (= index length))
			      (return-from compare nil))))
	     (add-exp '((if (not (and (< index length)
				      (eql (char string index) #\$)))
			    (return-from compare nil)
			  (incf index))))))
	  ((#\*)
	   (add-exp '(ASTRISK)))

	  ((#\+)
	   (add-exp '(PLUS)))

	  ((#\?)
	   (add-exp '(QUESTION)))

	  ((#\()
	   ;;
	   ;; Start a grouping.
	   ;;
	   (incf group)
	   (push group group-stack)
	   (add-exp `((setf (aref *regex-groups* ,(1- group)) 
			    (list index nil))))
	   (add-exp `(,group)))
	  ((#\))
	   ;;
	   ;; End a grouping
	   ;;
	   (let ((group (pop group-stack)))
	     (add-exp `((setf (cadr (aref *regex-groups* ,(1- group)))
			      index)))
	     (add-exp `(,(- group)))))
	  ((#\[)
	   ;;
	   ;; Start of a range operation.
	   ;; Generate a bit-vector that has one bit per possible character
	   ;; and then on each character or range, set the possible bits.
	   ;;
	   ;; If the first character is carat then invert the set.
	   (let* ((invert (eql (char source (1+ eindex)) #\^))
		  (bitstring (make-array 256 :element-type 'bit
					     :initial-element
					        (if invert 1 0)))
		  (set-char (if invert 0 1)))
	     (if invert (incf eindex))
	     (do ((x (1+ eindex) (1+ x)))
		 ((eql (char source x) #\]) (setf eindex x))
	       (info "Building range with character ~A~%" (char source x))
	       (cond ((and (eql (char source (1+ x)) #\-)
			   (not (eql (char source (+ x 2)) #\])))
		      (if (>= (char-code (char source x))
			     (char-code (char source (+ 2 x))))
			  (error "Invalid range \"~A-~A\".  Ranges must be in acending order"
				 (char source x) (char source (+ 2 x))))
		      (do ((j (char-code (char source x)) (1+ j)))
		       ((> j (char-code (char source (+ 2 x))))
			(incf x 2))
		     (info "Setting bit for char ~A code ~A~%" (code-char j) j)
		     (setf (sbit bitstring j) set-char)))
		     (t
		      (cond ((not (eql (char source x) #\]))
			     (let ((char (char source x)))
			       ;;
			       ;; If the character is quoted then find out what
			       ;; it should have been
			       ;;
			       (if (eql (char source x) #\\ )
				   (let ((length))
				     (multiple-value-setq (char length)
					 (regex-quoted (subseq source x) invert))
				     (incf x length)))
			       (info "Setting bit for char ~A code ~A~%" char (char-code char))
			       (if (not (vectorp char))
				   (setf (sbit bitstring (char-code (char source x))) set-char)
				 (bit-ior bitstring char t))))))))
	     (add-exp `((let ((range ,bitstring))
			  (if (>= index length)
			      (return-from compare nil))
			  (if (= 1 (sbit range (char-code (char string index))))
			      (incf index)
			    (return-from compare nil)))))))
	  ((#\\ )
	   ;;
	   ;; Intreprete the next character as a special, range, octal, group or 
           ;; just the character itself.
	   ;;
	   (let ((length)
		 (value))
	     (multiple-value-setq (value length)
		 (regex-quoted (subseq source (1+ eindex)) nil))
	     (cond ((listp value)
		    (add-exp value))
		   ((characterp value)
		    (add-exp `((if (not (and (< index length)
					     (eql (char string index) 
						  ,value)))
				   (return-from compare nil)
				 (incf index)))))
		   ((vectorp value)
		    (add-exp `((let ((range ,value))
				 (if (>= index length)
				     (return-from compare nil))
				 (if (= 1 (sbit range (char-code (char string index))))
				     (incf index)
				   (return-from compare nil)))))))
	     (incf eindex length)))
	  (t
	   ;;
	   ;; We have a literal character.  
	   ;; Scan to see how many we have and if it is more than one
	   ;; generate a string= verses as single eql.
	   ;;
	   (let* ((lit "")
		  (term (dotimes (litindex (- (length source) eindex) nil)
			  (let ((litchar (char source (+ eindex litindex))))
			    (if (position litchar *regex-special-chars*)
				(return litchar)
			      (progn
				(info "Now adding ~A index ~A to lit~%" litchar 
				      litindex)
				(setf lit (concatenate 'string lit 
						       (string litchar)))))))))
	     (if (= (length lit) 1)
		 (add-exp `((if (not (and (< index length)
					  (eql (char string index) ,current)))
				(return-from compare nil)
			      (incf index))))
	       ;;
	       ;; If we have a multi-character literal then we must
	       ;; check to see if the next character (if there is one)
	       ;; is an astrisk or a plus or a question mark.  If so then we must not use this
	       ;; character in the big literal.
	       (progn 
		 (if (or (eql term #\*)
                         (eql term #\+)
                         (eql term #\?))
		     (setf lit (subseq lit 0 (1- (length lit)))))
		 (add-exp `((if (< length (+ index ,(length lit)))
				(return-from compare nil))
			    (if (not (string= string ,lit :start1 index
					      :end1 (+ index ,(length lit))))
				(return-from compare nil)
			      (incf index ,(length lit)))))))
	     (incf eindex (1- (length lit))))))))
    ;;
    ;; Plug end of list to return t.  If we made it this far then
    ;; We have matched!
    (add-exp '((setf (cadr (aref *regex-groups* 0))
		     index)))
    (add-exp '((return-from final-return t)))
    ;;
;;;    (print expression)
    ;;
    ;; Now take the expression list and turn it into a lambda expression
    ;; replacing the special flags with lisp code.
    ;; For example:  A BEGIN needs to be replace by an expression that
    ;; saves the current index, then evaluates everything till it gets to
    ;; the END then save the new index if it didn't fail.
    ;; On an ASTRISK I need to take the previous expression and wrap
    ;; it in a do that will evaluate the expression till an error
    ;; occurs and then another do that encompases the remainder of the
    ;; regular expression and iterates decrementing the index by one
    ;; of the matched expression sizes and then returns nil.  After
    ;; the last expression insert a form that does a return t so that
    ;; if the entire nested sub-expression succeeds then the loop
    ;; is broken manually.
    ;; 
    (setf result (copy-tree nil))
    ;;
    ;; Reversing the current expression makes building up the 
    ;; lambda list easier due to the nexting of expressions when 
    ;; and astrisk has been encountered.
    (setf expression (reverse expression))
    (do ((elt 0 (1+ elt)))
	((>= elt (length expression)))
      (let ((piece (nth elt expression)))
	;;
	;; Now check for PLUS, if so then ditto the expression and then let the
	;; ASTRISK below handle the rest.
	;;
	(cond ((eql piece 'PLUS)
	       (cond ((listp (nth (1+ elt) expression))
		      (setf result (append (list (nth (1+ elt) expression))
					   result)))
		     ;;
		     ;; duplicate the entire group
		     ;; NOTE: This hasn't been implemented yet!!
		     (t
		      (error "GROUP repeat hasn't been implemented yet~%")))))
	(cond ((listp piece)		;Just append the list
	       (setf result (append (list piece) result)))
	      ((eql piece 'QUESTION)	; Wrap it in a block that won't fail
	       (cond ((listp (nth (1+ elt) expression))
		      (setf result 
			    (append `((progn (block compare
						    ,(nth (1+ elt) 
							  expression))
					     t))
				    result))
		      (incf elt))
		     ;;
		     ;; This is a QUESTION on an entire group which
		     ;; hasn't been implemented yet!!!
		     ;;
		     (t
		      (error "Optional groups not implemented yet~%"))))
	      ((or (eql piece 'ASTRISK) ; Do the wild thing!
		   (eql piece 'PLUS))
	       (cond ((listp (nth (1+ elt) expression))
		      ;;
		      ;; This is a single character wild card so
		      ;; do the simple form.
		      ;;
		      (setf result 
			    `((let ((oindex index))
				(block compare
				       (do ()
					   (nil)
					 ,(nth (1+ elt) expression)))
				(do ((start index (1- start)))
				    ((< start oindex) nil)
				  (let ((index start))
				    (block compare
					   ,@result))))))
		      (incf elt))
		     (t
		      ;;
		      ;; This is a subgroup repeated so I must build
		      ;; the loop using several values.
		      ;;
		      ))
	       )
	      (t t))))			; Just ignore everything else.
    ;;
    ;; Now wrap the result in a lambda list that can then be 
    ;; invoked or compiled, however the user wishes.
    ;;
    (if anchored
	(setf result
	      `(lambda (string &key (start 0) (end (length string)))
		 (setf *regex-groupings* ,group)
		 (block final-return
			(block compare
			       (let ((index start)
				     (length end))
				 ,@result)))))
      (setf result
	    `(lambda (string &key (start 0) (end (length string)))
	       (setf *regex-groupings* ,group)
	       (block final-return
		      (let ((length end))
			,@fast-first
			(do ((marker start (1+ marker)))
			    ((> marker end) nil)
			  (let ((index marker))
			    (if (block compare
				       ,@result)
				(return t)))))))))))

;; (provide 'nregex)