From 3d49ad73e5a93625629c96b6c0b921bb019ea9da Mon Sep 17 00:00:00 2001
From: Harald Jörg <haj@posteo.de>
Date: Tue, 14 Sep 2021 17:53:52 +0200
Subject: cperl-mode.el: Allow non-ASCII Perl identifiers

Replace all "A-Z" regexp literals with unicode-aware rx constructs
wherever Perl allows non-ASCII identifiers.
* lisp/progmodes/cperl-mode.el (cperl-after-sub-regexp)
(cperl-after-label. cperl-sniff-for-indent)
(cperl-find-pods-heres, cperl-indent-exp)
(cperl-fix-line-spacing, cperl-imenu--create-perl-index)
(cperl-init-faces, cperl-find-tags):
Replace ASCII regex literals by unicode-aware rx constructs.
(cperl-init-faces): Eliminate unused lexical `font-lock-anchored'.
(cperl-have-help-regexp, cperl-word-at-point-hard): Allow non-ASCII
word characters.

* test/lisp/progmodes/cperl-mode-tests.el
(cperl-test-fontify-special-variables): New test for $^T
and $^{VARNAME}.
(cperl-test-ws-rx cperl-test-ws+-rx),
(cperl-test-version-regexp, cperl-test-package-regexp): Skip
for perl-mode.
(cperl-test-identifier-rx, cperl--test-unicode-setup)
(cperl-test-unicode-labels, cperl-test-unicode-sub)
(cperl-test-unicode-varname)
(cperl-test-unicode-varname-list, cperl-test-unicode-arrays)
(cperl-test-unicode-hashes, cperl-test-unicode-hashref)
(cperl-test-unicode-proto, cperl-test-unicode-fhs)
(cperl-test-unicode-hashkeys, cperl-test-word-at-point):
New tests for unicode identifiers.
 (cperl-test-imenu-index): Add a unicode identifier to the test.

* test/lisp/progmodes/cperl-mode-resources/grammar.pl: Add a
function with non-ASCII name for imenu tests.
---
 lisp/progmodes/cperl-mode.el | 330 ++++++++++++++++++++++++++++++-------------
 1 file changed, 231 insertions(+), 99 deletions(-)

(limited to 'lisp/progmodes/cperl-mode.el')

diff --git a/lisp/progmodes/cperl-mode.el b/lisp/progmodes/cperl-mode.el
index 76c82f8c73e..1147889969b 100644
--- a/lisp/progmodes/cperl-mode.el
+++ b/lisp/progmodes/cperl-mode.el
@@ -1407,7 +1407,7 @@ the last)."
   (concat				; Assume n groups before this...
    "\\("				; n+1=name-group
      cperl-white-and-comment-rex	; n+2=pre-name
-     "\\(::[a-zA-Z_0-9:']+\\|[a-zA-Z_'][a-zA-Z_0-9:']*\\)" ; n+3=name
+     (rx-to-string `(group ,cperl--normal-identifier-rx))
    "\\)"				; END n+1=name-group
    (if named "" "?")
    "\\("				; n+4=proto-group
@@ -2573,7 +2573,8 @@ Return the amount the indentation changed by."
 	     '(?w ?_))
        (progn
 	 (backward-sexp)
-	 (looking-at "[a-zA-Z_][a-zA-Z0-9_]*:[^:]"))))
+         (looking-at (rx (sequence (eval cperl--label-rx)
+                                   (not (in ":"))))))))
 
 (defun cperl-get-state (&optional parse-start start-state)
   "Return list (START STATE DEPTH PRESTART),
@@ -2740,7 +2741,9 @@ Will not look before LIM."
 				  (progn
 				    (forward-sexp -1)
 				    (skip-chars-backward " \t")
-				    (looking-at "[ \t]*[a-zA-Z_][a-zA-Z_0-9]*[ \t]*:")))
+				    (looking-at
+                                     (rx (sequence (0+ blank)
+                                                   (eval cperl--label-rx))))))
 			     (get-text-property (point) 'first-format-line)))
 
 		   ;; Look at previous line that's at column 0
@@ -3836,7 +3839,8 @@ recursive calls in starting lines of here-documents."
 		"\\<" cperl-sub-regexp "\\>" ;  sub with proto/attr
 		"\\("
 		   cperl-white-and-comment-rex
-		   "\\(::[a-zA-Z_:'0-9]*\\|[a-zA-Z_'][a-zA-Z_:'0-9]*\\)\\)?" ; name
+                   (rx (group (eval cperl--normal-identifier-rx)))
+                "\\)"
 		"\\("
 		   cperl-maybe-white-and-comment-rex
 		   "\\(([^()]*)\\|:[^:]\\)\\)" ; prototype or attribute start
@@ -4111,10 +4115,12 @@ recursive calls in starting lines of here-documents."
 					 (t t))))
 			   ;; <file> or <$file>
 			   (and (eq c ?\<)
-				;; Do not stringify <FH>, <$fh> :
+                                ;; Stringify what looks like a glob, but
+				;; do not stringify file handles <FH>, <$fh> :
 				(save-match-data
 				  (looking-at
-				   "\\$?\\([_a-zA-Z:][_a-zA-Z0-9:]*\\)?>"))))
+                                   (rx (sequence (opt "$")
+                                                 (eval cperl--normal-identifier-rx)))))))
 		      tb (match-beginning 0))
 		(goto-char (match-beginning b1))
 		(cperl-backward-to-noncomment (point-min))
@@ -4184,7 +4190,16 @@ recursive calls in starting lines of here-documents."
 				 (error nil)))
 			     (if (or bb
 				     (looking-at ; $foo -> {s}
-				      "[$@]\\$*\\([a-zA-Z0-9_:]+\\|[^{]\\)\\([ \t\n]*->\\)?[ \t\n]*{")
+                                      (rx
+                                       (sequence
+                                        (in "$@") (0+ "$")
+                                        (or
+                                         (eval cperl--normal-identifier-rx)
+                                         (not (in "{")))
+                                        (opt (sequence (eval cperl--ws*-rx))
+                                             "->")
+                                        (eval cperl--ws*-rx)
+                                        "{")))
 				     (and ; $foo[12] -> {s}
 				      (memq (following-char) '(?\{ ?\[))
 				      (progn
@@ -4199,7 +4214,12 @@ recursive calls in starting lines of here-documents."
 			     (setq bb t))
 			    ((and (eq (following-char) ?:)
 				  (eq b1 ?\{) ; Check for $ { s::bar }
-				  (looking-at "::[a-zA-Z0-9_:]*[ \t\n\f]*}")
+				  ;;  (looking-at "::[a-zA-Z0-9_:]*[ \t\n\f]*}")
+                                  (looking-at
+                                   (rx (sequence "::"
+                                                 (eval cperl--normal-identifier-rx)
+                                                 (eval cperl--ws*-rx)
+                                                 "}")))
 				  (progn
 				    (goto-char (1- go))
 				    (skip-chars-backward " \t\n\f")
@@ -4364,7 +4384,7 @@ recursive calls in starting lines of here-documents."
 				    "\\(" ;; XXXX 1-char variables, exc. |()\s
 				       "[$@]"
 				       "\\("
-				          "[_a-zA-Z:][_a-zA-Z0-9:]*"
+                                          (rx (eval cperl--normal-identifier-rx))
 				       "\\|"
 				          "{[^{}]*}" ; only one-level allowed
 				       "\\|"
@@ -4820,6 +4840,7 @@ recursive calls in starting lines of here-documents."
 	   (progn
 	     (backward-sexp)
 	     ;; sub {BLK}, print {BLK} $data, but NOT `bless', `return', `tr', `constant'
+             ;; a-zA-Z is fine here, these are Perl keywords
 	     (or (and (looking-at "[a-zA-Z0-9_:]+[ \t\n\f]*[{#]") ; Method call syntax
 		      (not (looking-at "\\(bless\\|return\\|q[wqrx]?\\|tr\\|[smy]\\|constant\\)\\>")))
 		 ;; sub bless::foo {}
@@ -5028,7 +5049,11 @@ conditional/loop constructs."
 				     cperl-maybe-white-and-comment-rex
 				     "\\(state\\|my\\|local\\|our\\)\\)?"
 				     cperl-maybe-white-and-comment-rex
-				     "\\$[_a-zA-Z0-9]+\\)?\\)\\>"))
+                                     (rx
+                                      (sequence
+                                       "$"
+                                       (eval cperl--basic-identifier-rx)))
+				     "\\)?\\)\\>"))
 			    (progn
 			      (goto-char top)
 			      (forward-sexp 1)
@@ -5122,7 +5147,14 @@ Returns some position at the last line."
 	;; Looking at:
 	;; foreach my $var     (
 	(if (looking-at
-	     "[ \t]*\\<for\\(each\\)?[ \t]+\\(state\\|my\\|local\\|our\\)[ \t]*\\$[_a-zA-Z0-9]+\\(\t*\\|[ \t][ \t]+\\)[^ \t\n#]")
+             (rx (sequence (0+ blank) symbol-start
+                           "for" (opt "each")
+                           (1+ blank)
+                           (or "state" "my" "local" "our")
+                           (0+ blank)
+                           "$" (eval cperl--basic-identifier-rx)
+                           (1+ blank)
+                           (not (in " \t\n#")))))
 	    (progn
 	      (forward-sexp 3)
 	      (delete-horizontal-space)
@@ -5132,9 +5164,25 @@ Returns some position at the last line."
 	;; Looking at (with or without "}" at start, ending after "({"):
 	;; } foreach my $var ()         OR   {
 	(if (looking-at
-	     "[ \t]*\\(}[ \t]*\\)?\\<\\(els\\(e\\|if\\)\\|continue\\|if\\|unless\\|while\\|for\\(each\\)?\\(\\([ \t]+\\(state\\|my\\|local\\|our\\)\\)?[ \t]*\\$[_a-zA-Z0-9]+\\)?\\|until\\)\\>\\([ \t]*(\\|[ \t\n]*{\\)\\|[ \t]*{")
+             (rx (sequence
+                  (0+ blank)
+                  (opt (sequence "}" (0+ blank) ))
+                  symbol-start
+                  (or "else" "elsif" "continue" "if" "unless" "while" "until"
+                      (sequence (or "for" "foreach")
+                                (opt
+                                 (opt (sequence (1+ blank)
+                                                (or "state" "my" "local" "our")))
+                                 (0+ blank)
+                                 "$" (eval cperl--basic-identifier-rx))))
+                  symbol-end
+                  (group-n 1
+                           (or
+                            (or (sequence (0+ blank) "(")
+                                (sequence (eval cperl--ws*-rx) "{"))
+                            (sequence (0+ blank) "{"))))))
 	    (progn
-	      (setq ml (match-beginning 8)) ; "(" or "{" after control word
+	      (setq ml (match-beginning 1)) ; "(" or "{" after control word
 	      (re-search-forward "[({]")
 	      (forward-char -1)
 	      (setq p (point))
@@ -5544,7 +5592,11 @@ comment, or POD."
 	   (setq lst index-sub-alist)
 	   (while lst
 	     (setq elt (car lst) lst (cdr lst))
-	     (cond ((string-match "\\(::\\|'\\)[_a-zA-Z0-9]+$" (car elt))
+	     (cond ((string-match
+                     (rx (sequence (or "::" "'")
+                                   (eval cperl--basic-identifier-rx)
+                                   string-end))
+                     (car elt))
 		    (setq pack (substring (car elt) 0 (match-beginning 0)))
 		    (if (setq group (assoc pack hier-list))
 			(if (listp (cdr group))
@@ -5646,8 +5698,7 @@ default function."
 (defun cperl-init-faces ()
   (condition-case errs
       (progn
-	(let (t-font-lock-keywords t-font-lock-keywords-1 font-lock-anchored)
-	  (setq font-lock-anchored t)
+	(let (t-font-lock-keywords t-font-lock-keywords-1)
 	  (setq
 	   t-font-lock-keywords
 	   (list
@@ -5760,20 +5811,41 @@ default function."
 			 (if (eq (char-after (cperl-1- (match-end 0))) ?\{ )
 			     'font-lock-function-name-face
 			   'font-lock-variable-name-face))))
-	    '("\\<\\(package\\|require\\|use\\|import\\|no\\|bootstrap\\)[ \t]+\\([a-zA-Z_][a-zA-Z_0-9:]*\\)[ \t;]" ; require A if B;
-	      2 font-lock-function-name-face)
+            `(,(rx (sequence symbol-start
+                             (or "package" "require" "use" "import"
+                                 "no" "bootstrap")
+                             (eval cperl--ws+-rx)
+                             (group-n 1 (eval cperl--normal-identifier-rx))
+                             (any " \t;"))) ; require A if B;
+	      1 font-lock-function-name-face)
 	    '("^[ \t]*format[ \t]+\\([a-zA-Z_][a-zA-Z_0-9:]*\\)[ \t]*=[ \t]*$"
 	      1 font-lock-function-name-face)
-	    (cond (font-lock-anchored
-		   '("\\([]}\\%@>*&]\\|\\$[a-zA-Z0-9_:]*\\)[ \t]*{[ \t]*\\(-?[a-zA-Z0-9_:]+\\)[ \t]*}"
-		     (2 font-lock-string-face t)
-		     ("\\=[ \t]*{[ \t]*\\(-?[a-zA-Z0-9_:]+\\)[ \t]*}"
-		      nil nil
-		      (1 font-lock-string-face t))))
-		  (t '("\\([]}\\%@>*&]\\|\\$[a-zA-Z0-9_:]*\\)[ \t]*{[ \t]*\\(-?[a-zA-Z0-9_:]+\\)[ \t]*}"
-		       2 font-lock-string-face t)))
-	    '("[[ \t{,(]\\(-?[a-zA-Z0-9_:]+\\)[ \t]*=>" 1
-	      font-lock-string-face t)
+            ;; bareword hash key: $foo{bar}
+            `(,(rx (or (in "]}\\%@>*&") ; What Perl is this?
+                       (sequence "$" (eval cperl--normal-identifier-rx)))
+                   (0+ blank) "{" (0+ blank)
+                   (group-n 1 (sequence (opt "-")
+                                        (eval cperl--basic-identifier-rx)))
+                   (0+ blank) "}")
+;;	    '("\\([]}\\%@>*&]\\|\\$[a-zA-Z0-9_:]*\\)[ \t]*{[ \t]*\\(-?[a-zA-Z0-9_:]+\\)[ \t]*}"
+	      (1 font-lock-string-face t)
+              ;; anchored bareword hash key: $foo{bar}{baz}
+              (,(rx point
+                   (0+ blank) "{" (0+ blank)
+                   (group-n 1 (sequence (opt "-")
+                                        (eval cperl--basic-identifier-rx)))
+                   (0+ blank) "}")
+	      ;; ("\\=[ \t]*{[ \t]*\\(-?[a-zA-Z0-9_:]+\\)[ \t]*}"
+	       nil nil
+	       (1 font-lock-string-face t)))
+              ;; hash element assignments with bareword key => value
+              `(,(rx (in "[ \t{,()")
+                     (group-n 1 (sequence (opt "-")
+                                          (eval cperl--basic-identifier-rx)))
+                     (0+ blank) "=>")
+                1 font-lock-string-face t)
+;;	    '("[[ \t{,(]\\(-?[a-zA-Z0-9_:]+\\)[ \t]*=>" 1
+;;	      font-lock-string-face t)
             ;; labels
             `(,(rx
                 (sequence
@@ -5797,83 +5869,130 @@ default function."
             ;;; '("[$*]{?\\(\\sw+\\)" 1 font-lock-variable-name-face)
             ;;; '("\\([@%]\\|\\$#\\)\\(\\sw+\\)"
             ;;;  (2 (cons font-lock-variable-name-face '(underline))))
-	    (cond (font-lock-anchored
 		   ;; 1=my_etc, 2=white? 3=(+white? 4=white? 5=var
-		   `(,(concat "\\<\\(state\\|my\\|local\\|our\\)"
-				  cperl-maybe-white-and-comment-rex
-				  "\\(("
-				     cperl-maybe-white-and-comment-rex
-				  "\\)?\\([$@%*]\\([a-zA-Z0-9_:]+\\|[^a-zA-Z0-9_]\\)\\)")
-		       (5 ,(if cperl-font-lock-multiline
-				 'font-lock-variable-name-face
-			       '(progn  (setq cperl-font-lock-multiline-start
-					      (match-beginning 0))
-					'font-lock-variable-name-face)))
-		       (,(concat "\\="
-				   cperl-maybe-white-and-comment-rex
-				   ","
-				   cperl-maybe-white-and-comment-rex
-				   "\\([$@%*]\\([a-zA-Z0-9_:]+\\|[^a-zA-Z0-9_]\\)\\)")
-			;; Bug in font-lock: limit is used not only to limit
-			;; searches, but to set the "extend window for
-			;; facification" property.  Thus we need to minimize.
-			,(if cperl-font-lock-multiline
-			     '(if (match-beginning 3)
-				  (save-excursion
-				    (goto-char (match-beginning 3))
-				    (condition-case nil
-					(forward-sexp 1)
-				      (error
-				       (condition-case nil
-					   (forward-char 200)
-					 (error nil)))) ; typeahead
-				    (1- (point))) ; report limit
-				(forward-char -2)) ; disable continued expr
-			     '(if (match-beginning 3)
-				  (point-max) ; No limit for continuation
-				(forward-char -2))) ; disable continued expr
-			,(if cperl-font-lock-multiline
-			       nil
-			     '(progn	; Do at end
-				;; "my" may be already fontified (POD),
-				;; so cperl-font-lock-multiline-start is nil
-				(if (or (not cperl-font-lock-multiline-start)
-					(> 2 (count-lines
-					      cperl-font-lock-multiline-start
-					      (point))))
-				    nil
-				  (put-text-property
-				   (1+ cperl-font-lock-multiline-start) (point)
-				   'syntax-type 'multiline))
-				(setq cperl-font-lock-multiline-start nil)))
-			(3 font-lock-variable-name-face))))
-		  (t '("^[ \t{}]*\\(state\\|my\\|local\\|our\\)[ \t]*\\(([ \t]*\\)?\\([$@%*][a-zA-Z0-9_:]+\\)"
-		       3 font-lock-variable-name-face)))
-	    '("\\<for\\(each\\)?\\([ \t]+\\(state\\|my\\|local\\|our\\)\\)?[ \t]*\\(\\$[a-zA-Z_][a-zA-Z_0-9]*\\)[ \t]*("
+	    `(,(rx (sequence (or "state" "my" "local" "our"))
+                   (eval cperl--ws*-rx)
+                   (opt (sequence "(" (eval cperl--ws*-rx)))
+                   (group
+                    (in "$@%*")
+                    (or
+                     (eval cperl--normal-identifier-rx)
+                     (eval cperl--special-identifier-rx))
+                    )
+                   )
+              ;; (concat "\\<\\(state\\|my\\|local\\|our\\)"
+	      ;;          cperl-maybe-white-and-comment-rex
+	      ;;          "\\(("
+	      ;;          cperl-maybe-white-and-comment-rex
+	      ;;          "\\)?\\([$@%*]\\([a-zA-Z0-9_:]+\\|[^a-zA-Z0-9_]\\)\\)")
+	      ;; (5 ,(if cperl-font-lock-multiline
+	      (1 ,(if cperl-font-lock-multiline
+		      'font-lock-variable-name-face
+		    '(progn  (setq cperl-font-lock-multiline-start
+				   (match-beginning 0))
+			     'font-lock-variable-name-face)))
+	      (,(rx (sequence point
+                              (eval cperl--ws*-rx)
+                              ","
+                              (eval cperl--ws*-rx)
+                              (group
+                               (in "$@%*")
+                               (or
+                                (eval cperl--normal-identifier-rx)
+                                (eval cperl--special-identifier-rx))
+                               )
+                              )
+                    )
+               ;; ,(concat "\\="
+	       ;;  	cperl-maybe-white-and-comment-rex
+	       ;;  	","
+	       ;;  	cperl-maybe-white-and-comment-rex
+	       ;;  	"\\([$@%*]\\([a-zA-Z0-9_:]+\\|[^a-zA-Z0-9_]\\)\\)")
+	       ;; Bug in font-lock: limit is used not only to limit
+	       ;; searches, but to set the "extend window for
+	       ;; facification" property.  Thus we need to minimize.
+	       ,(if cperl-font-lock-multiline
+		    '(if (match-beginning 1)
+			 (save-excursion
+			   (goto-char (match-beginning 1))
+			   (condition-case nil
+			       (forward-sexp 1)
+			     (error
+			      (condition-case nil
+				  (forward-char 200)
+				(error nil)))) ; typeahead
+			   (1- (point))) ; report limit
+		       (forward-char -2)) ; disable continued expr
+		  '(if (match-beginning 1)
+		       (point-max) ; No limit for continuation
+		     (forward-char -2))) ; disable continued expr
+	       ,(if cperl-font-lock-multiline
+		    nil
+		  '(progn	; Do at end
+		     ;; "my" may be already fontified (POD),
+		     ;; so cperl-font-lock-multiline-start is nil
+		     (if (or (not cperl-font-lock-multiline-start)
+			     (> 2 (count-lines
+				   cperl-font-lock-multiline-start
+				   (point))))
+			 nil
+		       (put-text-property
+			(1+ cperl-font-lock-multiline-start) (point)
+			'syntax-type 'multiline))
+		     (setq cperl-font-lock-multiline-start nil)))
+	       (1 font-lock-variable-name-face)))
+            ;; foreach my $foo (
+            `(,(rx symbol-start "for" (opt "each")
+                   (opt (sequence (1+ blank)
+                                  (or "state" "my" "local" "our")))
+                   (0+ blank)
+                   (group-n 1 (sequence "$"
+                                        (eval cperl--basic-identifier-rx)))
+                   (0+ blank) "(")
+;;	    '("\\<for\\(each\\)?\\([ \t]+\\(state\\|my\\|local\\|our\\)\\)?[ \t]*\\(\\$[a-zA-Z_][a-zA-Z_0-9]*\\)[ \t]*("
 	      4 font-lock-variable-name-face)
 	    ;; Avoid $!, and s!!, qq!! etc. when not fontifying syntactically
 	    '("\\(?:^\\|[^smywqrx$]\\)\\(!\\)" 1 font-lock-negation-char-face)
 	    '("\\[\\(\\^\\)" 1 font-lock-negation-char-face prepend)))
 	  (setq
 	   t-font-lock-keywords-1
-	   '(
-	     ("\\(\\([@%]\\|\\$#\\)[a-zA-Z_:][a-zA-Z0-9_:]*\\)" 1
+	   `(
+             ;; arrays and hashes.  Access to elements is fixed below
+             (,(rx (group-n 1 (group-n 2 (or (in "@%") "$#"))
+                            (eval cperl--normal-identifier-rx)))
+              1
+;;	     ("\\(\\([@%]\\|\\$#\\)[a-zA-Z_:][a-zA-Z0-9_:]*\\)" 1
 	      (if (eq (char-after (match-beginning 2)) ?%)
 		  'cperl-hash-face
 		'cperl-array-face)
 	      nil)			; arrays and hashes
-	     ("\\(\\([$@%]+\\)[a-zA-Z_:][a-zA-Z0-9_:]*\\)[ \t]*\\([[{]\\)"
+             ;; access to array/hash elements
+             (,(rx (group-n 1 (group-n 2 (in "$@%"))
+                            (eval cperl--normal-identifier-rx))
+                   (0+ blank)
+                   (group-n 3 (in "[{")))
+;;	     ("\\(\\([$@%]+\\)[a-zA-Z_:][a-zA-Z0-9_:]*\\)[ \t]*\\([[{]\\)"
 	      1
 	      (if (= (- (match-end 2) (match-beginning 2)) 1)
 		  (if (eq (char-after (match-beginning 3)) ?{)
 		      'cperl-hash-face
 		    'cperl-array-face)             ; arrays and hashes
 		font-lock-variable-name-face)      ; Just to put something
-	      t)
-	     ("\\(@\\|\\$#\\)\\(\\$+\\([a-zA-Z_:][a-zA-Z0-9_:]*\\|[^ \t\n]\\)\\)"
+	      t)                                   ; override previous
+             ;; @$ array dereferences, $#$ last array index
+             (,(rx (group-n 1 (or "@" "$#"))
+                   (group-n 2 (sequence "$"
+                                        (or (eval cperl--normal-identifier-rx)
+                                            (not (in " \t\n"))))))
+	     ;; ("\\(@\\|\\$#\\)\\(\\$+\\([a-zA-Z_:][a-zA-Z0-9_:]*\\|[^ \t\n]\\)\\)"
 	      (1 'cperl-array-face)
 	      (2 font-lock-variable-name-face))
-	     ("\\(%\\)\\(\\$+\\([a-zA-Z_:][a-zA-Z0-9_:]*\\|[^ \t\n]\\)\\)"
+             ;; %$ hash dereferences
+             (,(rx (group-n 1 "%")
+                   (group-n 2 (sequence "$"
+                                        (or (eval cperl--normal-identifier-rx)
+                                            (not (in " \t\n"))))))
+	     ;; ("\\(%\\)\\(\\$+\\([a-zA-Z_:][a-zA-Z0-9_:]*\\|[^ \t\n]\\)\\)"
 	      (1 'cperl-hash-face)
 	      (2 font-lock-variable-name-face))
 ;;("\\([smy]\\|tr\\)\\([^a-z_A-Z0-9]\\)\\(\\([^\n\\]*||\\)\\)\\2")
@@ -6435,6 +6554,8 @@ Will not move the position at the start to the left."
       (indent-region beg end nil)
       (goto-char beg)
       (setq col (current-column))
+      ;; Assuming that lineup is done on Perl syntax, this regexp
+      ;; doesn't need to be unicode aware -- haj, 2021-09-10
       (if (looking-at "[a-zA-Z0-9_]")
 	  (if (looking-at "\\<[a-zA-Z0-9_]+\\>")
 	      (setq search
@@ -6472,6 +6593,9 @@ Will not move the position at the start to the left."
   "Run etags with appropriate options for Perl files.
 If optional argument ALL is `recursive', will process Perl files
 in subdirectories too."
+  ;; Apparently etags doesn't support UTF-8 encoded sources, and usage
+  ;; of etags has been commented out in the menu since ... well,
+  ;; forever.  So, let's just stick to ASCII here. -- haj, 2021-09-14
   (interactive)
   (let ((cmd "etags")
 	(args `("-l" "none" "-r"
@@ -6611,6 +6735,9 @@ Does not move point."
     ;; Search for the function
     (progn ;;save-match-data
       (while (re-search-forward
+              ;; FIXME: Should XS code be unicode aware?  Recent C
+              ;; compilers (Gcc 10+) are, but I guess this isn't used
+              ;; much. -- haj, 2021-09-14
 	      "^\\([ \t]*MODULE\\>[^\n]*\\<PACKAGE[ \t]*=[ \t]*\\([a-zA-Z_][a-zA-Z_0-9:]*\\)\\>\\|\\([a-zA-Z_][a-zA-Z_0-9]*\\)(\\|[ \t]*BOOT:\\)"
 	      nil t)
 	(cond
@@ -6673,7 +6800,7 @@ Does not move point."
 	(setq lst
 	      (mapcar
                (lambda (elt)
-                 (cond ((string-match "^[_a-zA-Z]" (car elt))
+                 (cond ((string-match (rx line-start (or alpha "_")) (car elt))
                         (goto-char (cdr elt))
                         (beginning-of-line) ; pos should be of the start of the line
                         (list (car elt)
@@ -6703,9 +6830,14 @@ Does not move point."
 			","
 			(number-to-string (1- (elt elt 1))) ; Char pos 0-based
 			"\n")
-		(if (and (string-match "^[_a-zA-Z]+::" (car elt))
-			 (string-match (concat "^" cperl-sub-regexp "[ \t]+\\([_a-zA-Z]+\\)[^:_a-zA-Z]")
-				       (elt elt 3)))
+		(if (and (string-match (rx line-start
+                                           (eval cperl--basic-identifier-rx) "++")
+                                       (car elt))
+                         (string-match (rx-to-string `(sequence line-start
+                                                                (regexp ,cperl-sub-regexp)
+                                                                (1+ (in " \t"))
+                                                                ,cperl--normal-identifier-rx))
+                                       (elt elt 3)))
 		    ;; Need to insert the name without package as well
 		    (setq lst (cons (cons (substring (elt elt 3)
 						     (match-beginning 1)
@@ -7155,14 +7287,14 @@ Currently it is tuned to C and Perl syntax."
   ;;(concat "\\("
   (mapconcat
    #'identity
-   '("[$@%*&][0-9a-zA-Z_:]+\\([ \t]*[[{]\\)?" ; Usual variable
+   '("[$@%*&][[:alnum:]_:]+\\([ \t]*[[{]\\)?" ; Usual variable
      "[$@]\\^[a-zA-Z]"			; Special variable
      "[$@][^ \n\t]"			; Special variable
      "-[a-zA-Z]"			; File test
      "\\\\[a-zA-Z0]"			; Special chars
      "^=[a-z][a-zA-Z0-9_]*"		; POD sections
      "[-!&*+,./<=>?\\^|~]+"		; Operator
-     "[a-zA-Z_0-9:]+"			; symbol or number
+     "[[:alnum:]_:]+"			; symbol or number
      "x="
      "#!")
    ;;"\\)\\|\\("
@@ -7178,7 +7310,7 @@ Currently it is tuned to C and Perl syntax."
   ;; Does not save-excursion
   ;; Get to the something meaningful
   (or (eobp) (eolp) (forward-char 1))
-  (re-search-backward "[-a-zA-Z0-9_:!&*+,./<=>?\\^|~$%@]"
+  (re-search-backward "[-[:alnum:]_:!&*+,./<=>?\\^|~$%@]"
 		      (point-at-bol)
 		      'to-beg)
   ;;  (cond
@@ -7187,8 +7319,8 @@ Currently it is tuned to C and Perl syntax."
   ;;    (or (bobp) (backward-char 1))))
   ;; Try to backtrace
   (cond
-   ((looking-at "[a-zA-Z0-9_:]")	; symbol
-    (skip-chars-backward "a-zA-Z0-9_:")
+   ((looking-at "[[:alnum:]_:]")	; symbol
+    (skip-chars-backward "[:alnum:]_:")
     (cond
      ((and (eq (preceding-char) ?^)	; $^I
 	   (eq (char-after (- (point) 2)) ?\$))
@@ -7199,7 +7331,7 @@ Currently it is tuned to C and Perl syntax."
 	   (eq (current-column) 1))
       (forward-char -1)))		; =head1
     (if (and (eq (preceding-char) ?\<)
-	     (looking-at "\\$?[a-zA-Z0-9_:]+>")) ; <FH>
+             (looking-at "\\$?[[:alnum:]_:]+>")) ; <FH>
 	(forward-char -1)))
    ((and (looking-at "=") (eq (preceding-char) ?x)) ; x=
     (forward-char -1))
@@ -7212,15 +7344,15 @@ Currently it is tuned to C and Perl syntax."
 	   (not (eq (char-after (- (point) 2)) ?\$))) ; $-
       (forward-char -1))
      ((and (eq (following-char) ?\>)
-	   (string-match "[a-zA-Z0-9_]" (char-to-string (preceding-char)))
+	   (string-match "[[:alnum:]_]" (char-to-string (preceding-char)))
 	   (save-excursion
 	     (forward-sexp -1)
 	     (and (eq (preceding-char) ?\<)
-		  (looking-at "\\$?[a-zA-Z0-9_:]+>")))) ; <FH>
+		  (looking-at "\\$?[[:alnum:]_:]+>")))) ; <FH>
       (search-backward "<"))))
    ((and (eq (following-char) ?\$)
 	 (eq (preceding-char) ?\<)
-	 (looking-at "\\$?[a-zA-Z0-9_:]+>")) ; <$fh>
+	 (looking-at "\\$?[[:alnum:]_:]+>")) ; <$fh>
     (forward-char -1)))
   (if (looking-at cperl-have-help-regexp)
       (buffer-substring (match-beginning 0) (match-end 0))))
-- 
cgit v1.2.3