;;; dom.el --- XML/HTML (etc.) DOM manipulation and searching functions -*- lexical-binding: t -*-
;; Copyright (C) 2014-2025 Free Software Foundation, Inc.
;; Author: Lars Magne Ingebrigtsen
;; Keywords: xml, html
;; This file is part of GNU Emacs.
;; GNU Emacs is free software: you can redistribute it and/or modify
;; it under the terms of the GNU General Public License as published by
;; the Free Software Foundation, either version 3 of the License, or
;; (at your option) any later version.
;; GNU Emacs is distributed in the hope that it will be useful,
;; but WITHOUT ANY WARRANTY; without even the implied warranty of
;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;; GNU General Public License for more details.
;; You should have received a copy of the GNU General Public License
;; along with GNU Emacs. If not, see .
;;; Commentary:
;;; Code:
(require 'cl-lib)
(eval-when-compile (require 'subr-x))
(defsubst dom-tag (node)
"Return the NODE tag."
;; Called on a list of nodes. Use the first.
(car (if (consp (car node)) (car node) node)))
(defsubst dom-attributes (node)
"Return the NODE attributes."
;; Called on a list of nodes. Use the first.
(cadr (if (consp (car node)) (car node) node)))
(defsubst dom-children (node)
"Return the NODE children."
;; Called on a list of nodes. Use the first.
(cddr (if (consp (car node)) (car node) node)))
(defun dom-non-text-children (node)
"Return all non-text-node children of NODE."
(cl-loop for child in (dom-children node)
unless (stringp child)
collect child))
(defun dom-set-attributes (node attributes)
"Set the attributes of NODE to ATTRIBUTES."
(setq node (dom-ensure-node node))
(setcar (cdr node) attributes))
(defun dom-set-attribute (node attribute value)
"Set ATTRIBUTE in NODE to VALUE."
(setq node (dom-ensure-node node))
(let* ((attributes (cadr node))
(old (assoc attribute attributes)))
(if old
(setcdr old value)
(setcar (cdr node) (cons (cons attribute value) attributes)))))
(defun dom-remove-attribute (node attribute)
"Remove ATTRIBUTE from NODE."
(setq node (dom-ensure-node node))
(when-let* ((old (assoc attribute (cadr node))))
(setcar (cdr node) (delq old (cadr node)))))
(defmacro dom-attr (node attr)
"Return the attribute ATTR from NODE.
A typical attribute is `href'."
`(cdr (assq ,attr (dom-attributes ,node))))
(defun dom-text (node)
"Return all the text bits in the current node concatenated."
(mapconcat #'identity (cl-remove-if-not #'stringp (dom-children node)) " "))
(defun dom-texts (node &optional separator)
"Return all textual data under NODE concatenated with SEPARATOR in-between."
(if (eq (dom-tag node) 'script)
""
(mapconcat
(lambda (elem)
(cond
((stringp elem)
elem)
((eq (dom-tag elem) 'script)
"")
(t
(dom-texts elem separator))))
(dom-children node)
(or separator " "))))
(defun dom-child-by-tag (dom tag)
"Return the first child of DOM that is of type TAG."
(assoc tag (dom-children dom)))
(defun dom-by-tag (dom tag)
"Return elements in DOM that is of type TAG.
A name is a symbol like `td'."
(let ((matches (cl-loop for child in (dom-children dom)
for matches = (and (not (stringp child))
(dom-by-tag child tag))
when matches
append matches)))
(if (equal (dom-tag dom) tag)
(cons dom matches)
matches)))
(defun dom-search (dom predicate)
"Return elements in DOM where PREDICATE is non-nil.
PREDICATE is called with the node as its only parameter."
(let ((matches (cl-loop for child in (dom-children dom)
for matches = (and (not (stringp child))
(dom-search child predicate))
when matches
append matches)))
(if (funcall predicate dom)
(cons dom matches)
matches)))
(defun dom-strings (dom)
"Return elements in DOM that are strings."
(cl-loop for child in (dom-children dom)
if (stringp child)
collect child
else
append (dom-strings child)))
(defun dom-by-class (dom match)
"Return elements in DOM that have a class name that matches regexp MATCH."
(dom-elements dom 'class match))
(defun dom-by-style (dom match)
"Return elements in DOM that have a style that matches regexp MATCH."
(dom-elements dom 'style match))
(defun dom-by-id (dom match)
"Return elements in DOM that have an ID that matches regexp MATCH."
(dom-elements dom 'id match))
(defun dom-elements (dom attribute match)
"Find elements matching MATCH (a regexp) in ATTRIBUTE.
ATTRIBUTE would typically be `class', `id' or the like."
(let ((matches (cl-loop for child in (dom-children dom)
for matches = (and (not (stringp child))
(dom-elements child attribute
match))
when matches
append matches))
(attr (dom-attr dom attribute)))
(if (and attr
(string-match match attr))
(cons dom matches)
matches)))
(defun dom-remove-node (dom node)
"Remove NODE from DOM."
;; If we're removing the top level node, just return nil.
(dolist (child (dom-children dom))
(cond
((eq node child)
(delq node dom))
((not (stringp child))
(dom-remove-node child node)))))
(defun dom-parent (dom node)
"Return the parent of NODE in DOM."
(if (memq node (dom-children dom))
dom
(let ((result nil))
(dolist (elem (dom-children dom))
(when (and (not result)
(not (stringp elem)))
(setq result (dom-parent elem node))))
result)))
(defun dom-previous-sibling (dom node)
"Return the previous sibling of NODE in DOM."
(when-let* ((parent (dom-parent dom node)))
(let ((siblings (dom-children parent))
(previous nil))
(while siblings
(when (eq (cadr siblings) node)
(setq previous (car siblings)))
(pop siblings))
previous)))
(defun dom-node (tag &optional attributes &rest children)
"Return a DOM node with TAG and ATTRIBUTES."
`(,tag ,attributes ,@children))
(defun dom-append-child (node child)
"Append CHILD to the end of NODE's children."
(setq node (dom-ensure-node node))
(nconc node (list child)))
(defun dom-add-child-before (node child &optional before)
"Add CHILD to NODE's children before child BEFORE.
If BEFORE is nil, make CHILD NODE's first child."
(setq node (dom-ensure-node node))
(let ((children (dom-children node)))
(when (and before
(not (memq before children)))
(error "%s does not exist as a child" before))
(let ((pos (if before
(cl-position before children)
0)))
(push child (nthcdr (+ 2 pos) node))))
node)
(defun dom-ensure-node (node)
"Ensure that NODE is a proper DOM node."
;; Add empty attributes, if none.
(when (consp (car node))
(setq node (car node)))
(when (= (length node) 1)
(setcdr node (list nil)))
node)
(defun dom-pp (dom &optional remove-empty)
"Pretty-print DOM at point.
If REMOVE-EMPTY, ignore textual nodes that contain just
white-space."
(let ((column (current-column)))
(insert (format "(%S " (dom-tag dom)))
(let* ((attr (dom-attributes dom))
(times (length attr))
(column (1+ (current-column))))
(if (null attr)
(insert "nil")
(insert "(")
(dolist (elem attr)
(insert (format "(%S . %S)" (car elem) (cdr elem)))
(if (zerop (decf times))
(insert ")")
(insert "\n" (make-string column ?\s))))))
(let* ((children (if remove-empty
(cl-remove-if
(lambda (child)
(and (stringp child)
(string-match "\\`[\n\r\t ]*\\'" child)))
(dom-children dom))
(dom-children dom)))
(times (length children)))
(if (null children)
(insert ")")
(insert "\n" (make-string (1+ column) ?\s))
(dolist (child children)
(if (stringp child)
(if (not (and remove-empty
(string-match "\\`[\n\r\t ]*\\'" child)))
(insert (format "%S" child)))
(dom-pp child remove-empty))
(if (zerop (decf times))
(insert ")")
(insert "\n" (make-string (1+ column) ?\s))))))))
(define-inline dom--html-boolean-attribute-p (attr)
"Return non-nil if ATTR is an HTML boolean attribute."
(inline-quote
(memq ,attr
;; Extracted from the HTML Living Standard list of attributes
;; at .
'( allowfullscreen alpha async autofocus autoplay checked
controls default defer disabled formnovalidate inert ismap
itemscope loop multiple muted nomodule novalidate open
playsinline readonly required reversed selected
shadowrootclonable shadowrootdelegatesfocus
shadowrootserializable))))
(defun dom-print (dom &optional pretty xml)
"Print DOM at point as HTML/XML.
If PRETTY, indent the HTML/XML logically.
If XML, generate XML instead of HTML."
(let ((column (current-column))
(indent-tabs-mode nil)) ;; Indent with spaces
(insert (format "<%s" (dom-tag dom)))
(pcase-dolist (`(,attr . ,value) (dom-attributes dom))
;; Don't print attributes without a value.
(when value
(insert
;; HTML boolean attributes should not have an = value. The
;; presence of a boolean attribute on an element represents
;; the true value, and the absence of the attribute
;; represents the false value.
(if (and (not xml) (dom--html-boolean-attribute-p attr))
(format " %s" attr)
(format " %s=%S" attr (url-insert-entities-in-string
(format "%s" value)))))))
(let* ((children (dom-children dom))
(non-text nil)
(indent (+ column 2)))
(if (null children)
(insert " />")
(insert ">")
(dolist (child children)
(if (stringp child)
(insert (url-insert-entities-in-string child))
(setq non-text t)
(when pretty
(insert "\n")
(indent-line-to indent))
(dom-print child pretty xml)))
;; If we inserted non-text child nodes, or a text node that
;; ends with a newline, then we indent the end tag.
(when (and pretty (or (bolp) non-text))
(or (bolp) (insert "\n"))
(indent-line-to column))
(insert (format "%s>" (dom-tag dom)))))))
(provide 'dom)
;;; dom.el ends here