Thursday, October 05, 2006

Moral: a regular expression is a poor subsitute for a parser

A famous jwz soundbite is "Some people, when confronted with a problem, think 'I know, I'll use regular expressions.' Now they have two problems."

Snarky, yes, but a good sanity check.

Section 9 of the Perl FAQ addresses the issue of validating email addresses. RFC 2822 gives a specification for Internet email addresses, and the language is much broader than that matched by /^\w+@\w+\.\w+$/

The proper solution is to use a parser. To see why, consider the task of maintaining the following elegant -- but unit-tested! -- solution:

using System;
using System.Text.RegularExpressions;

using NUnit.Framework;

namespace Application
{
  public interface SyntaxChecker
  {
    bool WellFormed(string input);
  }

  public class EmailAddressChecker : SyntaxChecker
  {
    // derivative of work with the following copyright and license:
    // Copyright (c) 2004 Casey West.  All rights reserved.
    // This module is free software; you can redistribute it and/or
    // modify it under the same terms as Perl itself.

    // see http://search.cpan.org/~cwest/Email-Address-1.80/

    #region dumped regular expression
    private static string gibberish = @"
    (?-xism:(?:(?-xism:(?-xism:(?-xism:(?-xism:(?-xism:(?-xism:\
    s*\((?:\s*(?-xism:(?-xism:(?>[^()\\]+))|(?-xism:\\(?-xism:[^
    \x0A\x0D]))|(?-xism:\s*\((?:\s*(?-xism:(?-xism:(?>[^()\\]+))
    |(?-xism:\\(?-xism:[^\x0A\x0D]))|)+)*\s*\)\s*))+)*\s*\)\s*)+
    |\s+)*[^\x00-\x1F\x7F()<>\[\]:;@\,.<DQ>\s]+(?-xism:(?-xism:\
    s*\((?:\s*(?-xism:(?-xism:(?>[^()\\]+))|(?-xism:\\(?-xism:[^
    \x0A\x0D]))|(?-xism:\s*\((?:\s*(?-xism:(?-xism:(?>[^()\\]+))
    |(?-xism:\\(?-xism:[^\x0A\x0D]))|)+)*\s*\)\s*))+)*\s*\)\s*)+
    |\s+)*)|(?-xism:(?-xism:(?-xism:\s*\((?:\s*(?-xism:(?-xism:(
    ?>[^()\\]+))|(?-xism:\\(?-xism:[^\x0A\x0D]))|(?-xism:\s*\((?
    :\s*(?-xism:(?-xism:(?>[^()\\]+))|(?-xism:\\(?-xism:[^\x0A\x
    0D]))|)+)*\s*\)\s*))+)*\s*\)\s*)+|\s+)*<DQ>(?-xism:(?-xism:[
    ^\\<DQ>])|(?-xism:\\(?-xism:[^\x0A\x0D])))+<DQ>(?-xism:(?-xi
    sm:\s*\((?:\s*(?-xism:(?-xism:(?>[^()\\]+))|(?-xism:\\(?-xis
    m:[^\x0A\x0D]))|(?-xism:\s*\((?:\s*(?-xism:(?-xism:(?>[^()\\
    ]+))|(?-xism:\\(?-xism:[^\x0A\x0D]))|)+)*\s*\)\s*))+)*\s*\)\
    s*)+|\s+)*))+)?(?-xism:(?-xism:(?-xism:\s*\((?:\s*(?-xism:(?
    -xism:(?>[^()\\]+))|(?-xism:\\(?-xism:[^\x0A\x0D]))|(?-xism:
    \s*\((?:\s*(?-xism:(?-xism:(?>[^()\\]+))|(?-xism:\\(?-xism:[
    ^\x0A\x0D]))|)+)*\s*\)\s*))+)*\s*\)\s*)+|\s+)*<(?-xism:(?-xi
    sm:(?-xism:(?-xism:(?-xism:\s*\((?:\s*(?-xism:(?-xism:(?>[^(
    )\\]+))|(?-xism:\\(?-xism:[^\x0A\x0D]))|(?-xism:\s*\((?:\s*(
    ?-xism:(?-xism:(?>[^()\\]+))|(?-xism:\\(?-xism:[^\x0A\x0D]))
    |)+)*\s*\)\s*))+)*\s*\)\s*)+|\s+)*(?-xism:[^\x00-\x1F\x7F()<
    >\[\]:;@\,.<DQ>\s]+(?:\.[^\x00-\x1F\x7F()<>\[\]:;@\,.<DQ>\s]
    +)*)(?-xism:(?-xism:\s*\((?:\s*(?-xism:(?-xism:(?>[^()\\]+))
    |(?-xism:\\(?-xism:[^\x0A\x0D]))|(?-xism:\s*\((?:\s*(?-xism:
    (?-xism:(?>[^()\\]+))|(?-xism:\\(?-xism:[^\x0A\x0D]))|)+)*\s
    *\)\s*))+)*\s*\)\s*)+|\s+)*)|(?-xism:(?-xism:(?-xism:\s*\((?
    :\s*(?-xism:(?-xism:(?>[^()\\]+))|(?-xism:\\(?-xism:[^\x0A\x
    0D]))|(?-xism:\s*\((?:\s*(?-xism:(?-xism:(?>[^()\\]+))|(?-xi
    sm:\\(?-xism:[^\x0A\x0D]))|)+)*\s*\)\s*))+)*\s*\)\s*)+|\s+)*
    <DQ>(?-xism:(?-xism:[^\\<DQ>])|(?-xism:\\(?-xism:[^\x0A\x0D]
    )))+<DQ>(?-xism:(?-xism:\s*\((?:\s*(?-xism:(?-xism:(?>[^()\\
    ]+))|(?-xism:\\(?-xism:[^\x0A\x0D]))|(?-xism:\s*\((?:\s*(?-x
    ism:(?-xism:(?>[^()\\]+))|(?-xism:\\(?-xism:[^\x0A\x0D]))|)+
    )*\s*\)\s*))+)*\s*\)\s*)+|\s+)*))\@(?-xism:(?-xism:(?-xism:(
    ?-xism:\s*\((?:\s*(?-xism:(?-xism:(?>[^()\\]+))|(?-xism:\\(?
    -xism:[^\x0A\x0D]))|(?-xism:\s*\((?:\s*(?-xism:(?-xism:(?>[^
    ()\\]+))|(?-xism:\\(?-xism:[^\x0A\x0D]))|)+)*\s*\)\s*))+)*\s
    *\)\s*)+|\s+)*(?-xism:[^\x00-\x1F\x7F()<>\[\]:;@\,.<DQ>\s]+(
    ?:\.[^\x00-\x1F\x7F()<>\[\]:;@\,.<DQ>\s]+)*)(?-xism:(?-xism:
    \s*\((?:\s*(?-xism:(?-xism:(?>[^()\\]+))|(?-xism:\\(?-xism:[
    ^\x0A\x0D]))|(?-xism:\s*\((?:\s*(?-xism:(?-xism:(?>[^()\\]+)
    )|(?-xism:\\(?-xism:[^\x0A\x0D]))|)+)*\s*\)\s*))+)*\s*\)\s*)
    +|\s+)*)|(?-xism:(?-xism:(?-xism:\s*\((?:\s*(?-xism:(?-xism:
    (?>[^()\\]+))|(?-xism:\\(?-xism:[^\x0A\x0D]))|(?-xism:\s*\((
    ?:\s*(?-xism:(?-xism:(?>[^()\\]+))|(?-xism:\\(?-xism:[^\x0A\
    x0D]))|)+)*\s*\)\s*))+)*\s*\)\s*)+|\s+)*\[(?:\s*(?-xism:(?-x
    ism:[^\[\]\\])|(?-xism:\\(?-xism:[^\x0A\x0D])))+)*\s*\](?-xi
    sm:(?-xism:\s*\((?:\s*(?-xism:(?-xism:(?>[^()\\]+))|(?-xism:
    \\(?-xism:[^\x0A\x0D]))|(?-xism:\s*\((?:\s*(?-xism:(?-xism:(
    ?>[^()\\]+))|(?-xism:\\(?-xism:[^\x0A\x0D]))|)+)*\s*\)\s*))+
    )*\s*\)\s*)+|\s+)*)))>(?-xism:(?-xism:\s*\((?:\s*(?-xism:(?-
    xism:(?>[^()\\]+))|(?-xism:\\(?-xism:[^\x0A\x0D]))|(?-xism:\
    s*\((?:\s*(?-xism:(?-xism:(?>[^()\\]+))|(?-xism:\\(?-xism:[^
    \x0A\x0D]))|)+)*\s*\)\s*))+)*\s*\)\s*)+|\s+)*))|(?-xism:(?-x
    ism:(?-xism:(?-xism:(?-xism:\s*\((?:\s*(?-xism:(?-xism:(?>[^
    ()\\]+))|(?-xism:\\(?-xism:[^\x0A\x0D]))|(?-xism:\s*\((?:\s*
    (?-xism:(?-xism:(?>[^()\\]+))|(?-xism:\\(?-xism:[^\x0A\x0D])
    )|)+)*\s*\)\s*))+)*\s*\)\s*)+|\s+)*(?-xism:[^\x00-\x1F\x7F()
    <>\[\]:;@\,.<DQ>\s]+(?:\.[^\x00-\x1F\x7F()<>\[\]:;@\,.<DQ>\s
    ]+)*)(?-xism:(?-xism:\s*\((?:\s*(?-xism:(?-xism:(?>[^()\\]+)
    )|(?-xism:\\(?-xism:[^\x0A\x0D]))|(?-xism:\s*\((?:\s*(?-xism
    :(?-xism:(?>[^()\\]+))|(?-xism:\\(?-xism:[^\x0A\x0D]))|)+)*\
    s*\)\s*))+)*\s*\)\s*)+|\s+)*)|(?-xism:(?-xism:(?-xism:\s*\((
    ?:\s*(?-xism:(?-xism:(?>[^()\\]+))|(?-xism:\\(?-xism:[^\x0A\
    x0D]))|(?-xism:\s*\((?:\s*(?-xism:(?-xism:(?>[^()\\]+))|(?-x
    ism:\\(?-xism:[^\x0A\x0D]))|)+)*\s*\)\s*))+)*\s*\)\s*)+|\s+)
    *<DQ>(?-xism:(?-xism:[^\\<DQ>])|(?-xism:\\(?-xism:[^\x0A\x0D
    ])))+<DQ>(?-xism:(?-xism:\s*\((?:\s*(?-xism:(?-xism:(?>[^()\
    \]+))|(?-xism:\\(?-xism:[^\x0A\x0D]))|(?-xism:\s*\((?:\s*(?-
    xism:(?-xism:(?>[^()\\]+))|(?-xism:\\(?-xism:[^\x0A\x0D]))|)
    +)*\s*\)\s*))+)*\s*\)\s*)+|\s+)*))\@(?-xism:(?-xism:(?-xism:
    (?-xism:\s*\((?:\s*(?-xism:(?-xism:(?>[^()\\]+))|(?-xism:\\(
    ?-xism:[^\x0A\x0D]))|(?-xism:\s*\((?:\s*(?-xism:(?-xism:(?>[
    ^()\\]+))|(?-xism:\\(?-xism:[^\x0A\x0D]))|)+)*\s*\)\s*))+)*\
    s*\)\s*)+|\s+)*(?-xism:[^\x00-\x1F\x7F()<>\[\]:;@\,.<DQ>\s]+
    (?:\.[^\x00-\x1F\x7F()<>\[\]:;@\,.<DQ>\s]+)*)(?-xism:(?-xism
    :\s*\((?:\s*(?-xism:(?-xism:(?>[^()\\]+))|(?-xism:\\(?-xism:
    [^\x0A\x0D]))|(?-xism:\s*\((?:\s*(?-xism:(?-xism:(?>[^()\\]+
    ))|(?-xism:\\(?-xism:[^\x0A\x0D]))|)+)*\s*\)\s*))+)*\s*\)\s*
    )+|\s+)*)|(?-xism:(?-xism:(?-xism:\s*\((?:\s*(?-xism:(?-xism
    :(?>[^()\\]+))|(?-xism:\\(?-xism:[^\x0A\x0D]))|(?-xism:\s*\(
    (?:\s*(?-xism:(?-xism:(?>[^()\\]+))|(?-xism:\\(?-xism:[^\x0A
    \x0D]))|)+)*\s*\)\s*))+)*\s*\)\s*)+|\s+)*\[(?:\s*(?-xism:(?-
    xism:[^\[\]\\])|(?-xism:\\(?-xism:[^\x0A\x0D])))+)*\s*\](?-x
    ism:(?-xism:\s*\((?:\s*(?-xism:(?-xism:(?>[^()\\]+))|(?-xism
    :\\(?-xism:[^\x0A\x0D]))|(?-xism:\s*\((?:\s*(?-xism:(?-xism:
    (?>[^()\\]+))|(?-xism:\\(?-xism:[^\x0A\x0D]))|)+)*\s*\)\s*))
    +)*\s*\)\s*)+|\s+)*))))(?-xism:\s*\((?:\s*(?-xism:(?-xism:(?
    >[^()\\]+))|(?-xism:\\(?-xism:[^\x0A\x0D]))|(?-xism:\s*\((?:
    \s*(?-xism:(?-xism:(?>[^()\\]+))|(?-xism:\\(?-xism:[^\x0A\x0
    D]))|)+)*\s*\)\s*))+)*\s*\)\s*)*)"
      .Replace("<DQ>", "\"")
      .Replace("\t", "")
      .Replace(" ", "")
      .Replace("\r", "")
      .Replace("\n", "");
    #endregion

    private static Regex mailbox =
      new Regex(gibberish, RegexOptions.ExplicitCapture);

    public bool WellFormed(string address)
    {
      return mailbox.IsMatch(address);
    }
  }

  [TestFixture]
  public class Test
  {
    private SyntaxChecker checker = null;

    [SetUp]
    public void SetUp()
    {
      checker = new EmailAddressChecker();
    }

    [Test]
    public void SimpleAddress()
    {
      Assert.IsTrue(checker.WellFormed("foo@example.com"));
    }

    [Test]
    public void MustHaveLocalAndDomain()
    {
      Assert.IsFalse(checker.WellFormed("justlocal"));
    }

    [Test]
    public void AmpersandInLocalPart()
    {
      Assert.IsTrue(checker.WellFormed("\"foo & bar\"@example.com"));
    }

    [Test]
    public void PhraseAndAngles()
    {
      Assert.IsTrue(checker.WellFormed("Joe <joe@example.com>"));
    }

    [Test]
    public void AddressAndComment()
    {
      Assert.IsTrue(checker.WellFormed("joe@example.com (Joe)"));
    }

    [Test]
    public void ContainsSingleQuoteInParenComment()
    {
      Assert.IsTrue(checker.WellFormed("oleary@example.com (Mr. O'Leary)"));
    }

    [Test]
    public void ContainsSingleQuoteInBareComment()
    {
      Assert.IsTrue(checker.WellFormed("Mr. O'Leary <oleary@example.com"));
    }

    [Test]
    public void ContainsSingleQuoteInLocalPart()
    {
      Assert.IsTrue(checker.WellFormed("o'leary@example.com"));
    }
  }
}

No comments: