# # Program to build a regex to match an internet email address, # from Chapter 7 of _Mastering Regular Expressions_ (Friedl / O'Reilly) # (http://www.ora.com/catalog/regexp/) # # Unoptimized version. # # Copyright 1997 O'Reilly & Associates, Inc. # # Some things for avoiding backslashitis later on. $esc = '\\\\'; $Period = '\.'; $space = '\040'; $tab = '\t'; $OpenBR = '\['; $CloseBR = '\]'; $OpenParen = '\('; $CloseParen = '\)'; $NonASCII = '\x80-\xff'; $ctrl = '\000-\037'; $CRlist = '\n\015'; # note: this should really be only \015. # Items 19, 20, 21 $qtext = qq/[^$esc$NonASCII$CRlist\"]/; # for within "..." $dtext = qq/[^$esc$NonASCII$CRlist$OpenBR$CloseBR]/; # for within [...] $quoted_pair = qq< $esc [^$NonASCII] >; # an escaped character # Item 10: atom $atom_char = qq/[^($space)<>\@,;:\".$esc$OpenBR$CloseBR$ctrl$NonASCII]/; $atom = qq< $atom_char+ # some number of atom characters... (?!$atom_char) # ..not followed by something that could be part of an atom >; # Items 22 and 23, comment. # Impossible to do properly with a regex, I make do by allowing at most one level of nesting. $ctext = qq< [^$esc$NonASCII$CRlist()] >; $Cnested = qq< $OpenParen (?: $ctext | $quoted_pair )* $CloseParen >; $comment = qq< $OpenParen (?: $ctext | $quoted_pair | $Cnested )* $CloseParen >; $X = qq< (?: [$space$tab] | $comment )* >; # optional separator # Item 11: doublequoted string, with escaped items allowed $quoted_str = qq< \" (?: # opening quote... $qtext # Anything except backslash and quote | # or $quoted_pair # Escaped something (something != CR) )* \" # closing quote >; # Item 7: word is an atom or quoted string $word = qq< (?: $atom | $quoted_str ) >; # Item 12: domain-ref is just an atom $domain_ref = $atom; # Item 13 domain-literal is like a quoted string, but [...] instead of "..." $domain_lit = qq< $OpenBR # [ (?: $dtext | $quoted_pair )* # stuff $CloseBR # ] >; # Item 9: sub-domain is a domain-ref or domain-literal $sub_domain = qq< (?: $domain_ref | $domain_lit ) >; # Item 6: domain is a list of subdomains separated by dots. $domain = qq< $sub_domain # initial subdomain (?: # $X $Period # if led by a period... $X $sub_domain # ...further okay )* >; # Item 8: a route. A bunch of "@ $domain" separated by commas, followed by a colon $route = qq< \@ $X $domain (?: $X , $X \@ $X $domain )* # further okay, if led by comma : # closing colon >; # Item 5: local-part is a bunch of $word separated by periods $local_part = qq< $word # initial word (?: $X $Period $X $word )* # further okay, if led by a period >; # Item 2: addr-spec is local@domain $addr_spec = qq< $local_part $X \@ $X $domain >; # Item 4: route-addr is $route_addr = qq[ < $X # leading < (?: $route $X )? # optional route $addr_spec # address spec $X > # trailing > ]; # Item 3: phrase $phrase_ctrl = '\000-\010\012-\037'; # like ctrl, but without tab # Like atom-char, but without listing space, and uses phrase_ctrl. # Since the class is negated, this matches the same as atom-char plus space and tab $phrase_char = qq/[^()<>\@,;:\".$esc$OpenBR$CloseBR$NonASCII$phrase_ctrl]/; $phrase = qq< $word # one word, optionally followed by.... (?: $phrase_char | # atom and space parts, or... $comment | # comments, or... $quoted_str # quoted strings )* >; # Item #1: mailbox is an addr_spec or a phrase/route_addr $mailbox = qq< $X # optional leading comment (?: $addr_spec # address | # or $phrase $route_addr # name and address ) $X # optional trailing comment >; ########################################################################### # Here's a little snippet to test it. # Addresses given on the commandline are described. # my $error = 0; my $valid; foreach $address (@ARGV) { $valid = $address =~ m/^$mailbox$/xo; printf "`$address' is syntactically %s.\n", $valid ? "valid" : "invalid"; $error = 1 if not $valid; } exit $error;