From: plugd Date: Sun, 5 Nov 2023 23:09:41 +0000 (+0100) Subject: Work on tokenizing, but outa energy. X-Git-Url: https://thelambdalab.xyz/gitweb/index.cgi?a=commitdiff_plain;h=6e55cb573c7ac085cddca49a5834af4f71aea4a7;p=ez.git Work on tokenizing, but outa energy. --- diff --git a/ez.el b/ez.el index f4f698e..a9d63e3 100644 --- a/ez.el +++ b/ez.el @@ -96,6 +96,10 @@ (defun ez-mem-set-byte (addr val) (aset ez-memory addr val)) +(defun ez-mem-set-bytes (addr vals) + (dotimes (i (length vals)) + (ez-mem-set-byte (+ addr i) (elt vals i)))) + (defun ez-mem-ref-word (addr) (+ (* 256 (aref ez-memory addr)) (aref ez-memory (+ addr 1)))) @@ -106,6 +110,9 @@ (aset ez-memory addr byte-high) (aset ez-memory (+ addr 1) byte-low))) +(defun ez-mem-ref-string (addr1 addr2) + (substring ez-memory addr1 addr2)) + (defun ez-parse-header () (setq ez-version (ez-mem-ref-byte #x0)) (setq ez-himem-addr (ez-mem-ref-word #x4)) @@ -348,7 +355,7 @@ Used for matching input with dictionary entries when tokenizing." ;; Dictionary -(defun ez-parse-dictionary () +(defun ez-parse-dictionary-header () (let* ((nseps (ez-mem-ref-byte ez-dict-base)) (separators (mapcar (lambda (i) (ez-mem-ref-byte (+ ez-dict-base i))) @@ -357,14 +364,41 @@ Used for matching input with dictionary entries when tokenizing." (nentries (ez-mem-ref-word (+ ez-dict-base 2 nseps))) (entries-base (+ ez-dict-base nseps 4)) (entries nil)) - (dotimes (i nentries) - (let ((this-base (+ entries-base (* 7 i)))) - (setq entries (cons (cons (ez-get-zstring this-base) - this-base) - entries)))) - (setq ez-dict-entries (reverse entries)) + ;; (dotimes (i nentries) + ;; (let ((this-base (+ entries-base (* bytes-per-entry i)))) + ;; (setq entries (cons (cons (ez-get-zstring this-base) + ;; this-base) + ;; entries)))) + ;; (setq ez-dict-entries (reverse entries)) (setq ez-dict-separators separators))) +(defun ez-is-separator (char) + (let* ((nseps (ez-mem-ref-byte ez-dict-base))) + (while (and (> nseps 0) + (not (= (ez-mem-ref-byte (+ ez-dict-base nseps)) + char))) + (setq nseps (- nseps 1))) + (> nseps 0))) + +(defun ez-lookup-dictionary (text) + (let ((encoded-text (ez-encode text)) + (nseps (ez-mem-ref-byte ez-dict-base)) + (bytes-per-entry (ez-mem-ref-byte (+ ez-dict-base 1 nseps))) + (nentries (ez-mem-ref-word (+ ez-dict-base 2 nseps))) + (entries-seen 0) + (this-entry (+ ez-dict-base nseps 4))) + + (while (and (< entries-seen nentries) + (not (equal + (ez-mem-ref-bytes this-entry 4) + encoded-text))) + (setq entries-seen (+ entries-seen 1)) + (setq this-entry (+ this-entry bytes-per-entry))) + + (if (< entries-seen nentries) + this-entry + 0))) + ;; Call stack (defvar ez-call-stack nil) @@ -895,31 +929,46 @@ Used for matching input with dictionary entries when tokenizing." (defun ez-op-read2 (input-string) (let* ((baddr1 (car ez--next-read-args)) - (baddr2 (cadr ez--next-read-args)) - (dict (ez-get-dictionary)) - (separators (car dict)) - (wordlist (cdr dict)) - (token-start 0)) + (baddr2 (cadr ez--next-read-args))) (dotimes (i (length input-string)) (let ((char (elt input-string i))) - (ez-mem-set-byte (+ baddr1 1 i) char) + (ez-mem-set-byte (+ baddr1 1 i) char))) (ez-mem-set-byte (+ baddr1 1 (length input-string)) 0) - (ez--tokenize baddr1 baddr2))))) + (ez--tokenize baddr1 baddr2))) -(defun ez--tokenize (taddr baddr) +(defun ez--tokenize (tb-baddr pb-baddr) (let ((unfinished t) (token-start 0) (token-end 0) - (token-string "")) + (token-count 0)) + (while unfinished - (let ((char (ez-mem-ref-byte (+ taddr 1 token-end)))) + (let ((next-char (ez-mem-ref-byte (+ tb-baddr 1 token-end)))) (cond - ((eq char ?\s)) - ((memq char ez-dict-separators)) - ) + ((eq char ?\s) + ;; Add token + (setq token-end (- token-end 1)) + (let* ((text (ez-mem-ref-string (+ tb-baddr 1 token-start) + (+ tb-baddr 1 token-end))) + (dict-entry (ez-lookup-dictionary text))) + (setq token-count (+ token-count 1)) + (ez-mem-set-word (+ pb-baddr 2 (* token-count 4)) + dict-entry) + (ez-mem-set-bytes (+ pb-baddr 2 (* token-count 4) 2) + (length text) + token-start)) + (setq token-start (+ token-end 1)) + (setq token-end token-start)) + + ((ez-is-separator char) + ;; Add token and separator token + ) + ((eq char 0) + (setq unfinished nil)) + (setq token-end (+ token-end 1))) )))) ;; Execution loop