Tuesday, September 20, 2005

Simple expression parser in C#

In microsoft.public.dotnet.languages.csharp, Holger Kasten asked how to extract variables and functions from an expression.

The followups suggested trying various regular expressions, and this approach is often good enough, especially for one-off utilities. It's always important to remember, however, that regular expressions -- even with their many extensions that actually make them more powerful than what theoretical computer scientists call regular expressions -- are not parsers. Attempting to parse, for example, moderately sophisticated HTML using regular expressions can land you in an ugly mess.

Although likely more than what Uncle Bob had in mind when he wrote about practicing our craft through kata, as an exercise I wrote a parser for Holger's problem.

I had the benefit of standing on the shoulders of a giant. For my birthday this year, I asked my wife for a copy of Higher-Order Perl by the brilliant Mark Jason Dominus. In Chapter 8, he wrote about parsers, and I translated the Perl parser code to C#. Note: the code below is a derivative work of code from Higher-Order Perl by Mark Dominus, published by Morgan Kaufmann Publishers, Copyright 2005 by Elsevier Inc.

Almost immediately, the task made me think of Paul Graham's "Succinctness is Power." Comparing the two implementations, you notice right away the higher syntactical overhead of the C#. Once you've appreciated their power, using a language without closures is frustrating!

Although it produces the desired result, I'm afraid I botched the translation. Uncomment the WriteLine in TagVarOrFunction, and you'll see each identifier seems to be tagged multiple times. I expected one call to TagVarOrFunction per identifier.

Enjoy!

Main.cs

using System;
using System.Collections;
using System.IO;
using System.Text.RegularExpressions;

namespace ExpressionParser
{
  enum Tokens
  {
    Terminator, Integer, Identifier, Operator, Whitespace,
  };

  class Application
  {
    static Hashtable var  = new Hashtable();
    static Hashtable func = new Hashtable();

    [STAThread]
    static void Main(string[] args)
    {
      SimpleInput expr = new SimpleInput("x+y+foo(z)*abc(fun(a+bbb)*y)");

      Lexer l = new Lexer(
        expr,
        new Token(Tokens.Terminator, @";\n*|\n+"),
        new Token(Tokens.Integer,    @"\b\d+\b"),
        new Token(Tokens.Identifier, @"[A-Za-z_]\w*"),
        new Token(Tokens.Operator,   @"\*\*|[-=+*/()]"),
        new Token(Tokens.Whitespace, @"\s+", Lexer.Ignore));

      ParserStub exprstub = new ParserStub(new StubToReal(GetExpression));
      ParserStub termstub = new ParserStub(new StubToReal(GetTerm));
      ParserStub factstub = new ParserStub(new StubToReal(GetFactor));

      expression = new Alternate(
        new T(new Concatenate(termstub, new LookFor(Tokens.Operator, "+"), exprstub),
              new ValuesTransform(OpFirst)),
        termstub);

      term = new Alternate(
        new T(new Concatenate(factstub, new LookFor(Tokens.Operator, "*"), termstub),
              new ValuesTransform(OpFirst)),
        factstub);

      Parser open  = new LookFor(Tokens.Operator, "(");
      Parser close = new LookFor(Tokens.Operator, ")");
      Parser id    = new LookFor(Tokens.Identifier);

      factor = new Alternate(
        new T(
          new Concatenate(
            new LookFor(Tokens.Identifier),
            new Alternate(
              new Concatenate(open, exprstub, close),
              new Nothing())),
          new ValuesTransform(TagVarOrFunction)),
        new T(new Concatenate(open, exprstub, close),
              new ValuesTransform(StripParens)));

      Parser entire =
        new T(new Concatenate(exprstub, new EndOfInput()),
              new ValuesTransform(OnlyExpression));

      l.MoveNext();
      ParserState state = entire.Parse(l.ToStream());

      if (!state.Success)
        Console.WriteLine("Parse failed");
      else
      {
        string[] vars = (string[]) new ArrayList(var.Keys).ToArray(typeof(string));
        Console.WriteLine("Variables: " + String.Join(" ", vars));

        string[] funcs = (string[]) new ArrayList(func.Keys).ToArray(typeof(string));
        Console.WriteLine("Functions: " + String.Join(" ", funcs));
      }
    }

    static void Hop_8_4()
    {
      SimpleInput expr = new SimpleInput("2 * (3 + 4)");
      Lexer l = new Lexer(
        expr,
        new Token(Tokens.Terminator, @";\n*|\n+"),
        new Token(Tokens.Integer,    @"\b\d+\b"),
        new Token(Tokens.Identifier, @"[A-Za-z_]\w*"),
        new Token(Tokens.Operator,   @"\*\*|[-=+*/()]"),
        new Token(Tokens.Whitespace, @"\s+", Lexer.Ignore));

      ParserStub exprstub = new ParserStub(new StubToReal(GetExpression));
      ParserStub termstub = new ParserStub(new StubToReal(GetTerm));
      ParserStub factstub = new ParserStub(new StubToReal(GetFactor));

      expression = new Alternate(
        new T(new Concatenate(termstub, new LookFor(Tokens.Operator, "+"), exprstub),
              new ValuesTransform(OpFirst)),
        termstub);

      term = new Alternate(
        new T(new Concatenate(factstub, new LookFor(Tokens.Operator, "*"), termstub),
              new ValuesTransform(OpFirst)),
        factstub);

      factor = new Alternate(
        new LookFor(Tokens.Integer),
        new T(new Concatenate(new LookFor(Tokens.Operator, "("), exprstub, new LookFor(Tokens.Operator, ")")),
              new ValuesTransform(StripParens)));

      Parser entire =
        new T(new Concatenate(exprstub, new EndOfInput()),
              new ValuesTransform(OnlyExpression));

      l.MoveNext();
      ParserState state = entire.Parse(l.ToStream());
      Console.WriteLine(state);
    }

    static private Parser expression;
    static private Parser GetExpression() { return expression; }

    static private Parser term;
    static private Parser GetTerm() { return term; }

    static private Parser factor;
    static private Parser GetFactor() { return factor; }

    static private ArrayList OnlyExpression(ArrayList values)
    {
      ArrayList result = new ArrayList(1);
      result.Add(values[0]);
      return result;
    }

    static private ArrayList StripParens(ArrayList values)
    {
      ArrayList result = new ArrayList(1);
      result.Add(values[1]);
      return result;
    }

    static private ArrayList OpFirst(ArrayList values)
    {
      ArrayList result = new ArrayList(3);
      result.Add(values[1]);
      result.Add(values[0]);
      result.Add(values[2]);
      return result;
    }

    static private ArrayList TagVarOrFunction(ArrayList values)
    {
      //Console.WriteLine("tag called: values[0] = '" + values[0] + "'; values.Count = " + values.Count);
      ArrayList result = new ArrayList();

      if (values.Count == 1)
      {
        var[values[0]] = null;
        result.Add(values[0]);
      }
      else
      {
        func[values[0]] = null;
        result.Add(values[0]);
        result.Add(((ArrayList)values[1])[1]);
      }

      return result;
    }
  }
}

Lexer.cs

using System;
using System.Collections;
using System.Text.RegularExpressions;

namespace ExpressionParser
{
  class Lexer : IEnumerator
  {
    private IEnumerator t;

    public Lexer(IEnumerator input, params Token[] tokens)
    {
      t = input;

      for (int i = 0; i < tokens.Length; ++i)
        t = new Tokenizer(t, tokens[i]);
    }

    public object Current
    {
      get { return t.Current; }
    }

    public bool MoveNext()
    {
      return t.MoveNext();
    }

    public void Reset()
    {
      throw new InvalidOperationException();
    }

    static TokenMaker ignore = new TokenMaker(Tokenizer.ignoretoken);
    public static TokenMaker Ignore
    {
      get { return ignore; }
    }
    
    public StreamNode ToStream()
    {
      return Lexer.ToStream(this.t);
    }

    static public StreamNode ToStream(IEnumerator input)
    {
      object v = input.Current;

      if (v == null)
        return new StreamNode(null, null);

      PromiseMaker pm = new PromiseMaker(input);
      return new StreamNode(v, pm.Promise);
    }

    class PromiseMaker
    {
      private IEnumerator input;

      public PromiseMaker(IEnumerator input)
      {
        this.input = input;
      }

      private StreamNode MakeGood()
      {
        if (input.MoveNext())
          return Lexer.ToStream(input);
        else
          return null;
      }

      public Promise Promise
      {
        get { return new Promise(MakeGood); }
      }
    }

    class Tokenizer : IEnumerator
    {
      private IEnumerator input;
      private string buf;
      private Tokens label;
      private Regex pattern;
      private TokenMaker mktoken;
      private Queue tokens;

      internal static object ignoretoken(string ignored) { return ""; }

      public Tokenizer(IEnumerator input, Token token)
      {
        this.input = input;

        buf = "";
        tokens = new Queue();

        this.label = token.label;
        this.pattern = new Regex("(" + token.s + ")");
        this.mktoken = token.create != null ? token.create : new TokenMaker(this.MakeToken);
      }

      private string[] Split()
      {
        return pattern.Split(buf);
      }

      public object Current
      {
        get { return tokens.Peek(); }
      }

      private void FlushBuffer(Token token)
      {
        string[] parts = Split();

        // separator, if present
        MaybeEnqueue(parts[0]);

        // the lexeme we're after
        if (parts.Length > 1)
          MaybeEnqueue(mktoken(parts[1]));

        tokens.Enqueue(token);

        buf = "";
      }

      private void AppendToBuffer(object chunk)
      {
        if (chunk != null)
          buf = buf + (string)chunk;
      }

      private string TokenizeRemainingInput(object chunk)
      {
        AppendToBuffer(chunk);
        ArrayList newtoks = new ArrayList(Split());

        while (newtoks.Count > 2 || newtoks.Count > 0 && chunk != null)
        {
          MaybeEnqueue(Shift(newtoks));

          if (newtoks.Count > 0)
            MaybeEnqueue(mktoken((string)Shift(newtoks)));
        }

        if (chunk == null)
          return null;
        else
          return String.Join("", (string[])newtoks.ToArray(typeof(string)));
      }

      private object Shift(ArrayList a)
      {
        object head = a[0];
        a.RemoveAt(0);

        return head;
      }

      private void MaybeEnqueue(object o)
      {
        if (o is string && (string) o == "")
          return;

        tokens.Enqueue(o);
      }

      public bool MoveNext()
      {
        if (tokens.Count > 0)
        {
          tokens.Dequeue();
          if (tokens.Count > 0)
            return true;
        }

        while (tokens.Count == 0 && buf != null)
        {
          if (!input.MoveNext())
            return false;

          object i = input.Current;

          if (i is Token)
          {
            FlushBuffer((Token)i);
            break;
          }

          buf = TokenizeRemainingInput(i);
        }

        return true;
      }

      public void Reset()
      {
        throw new InvalidOperationException();
      }

      public object MakeToken(string token)
      {
        return new Token(this.label, token);
      }
    }
  }
}

Parser.cs

using System;
using System.Collections;

namespace ExpressionParser
{
  public struct ParserState
  {
    public readonly bool Success;
    public readonly ArrayList Values;
    public readonly StreamNode RemainingInput;

    public ParserState(ArrayList values, StreamNode rest)
    {
      this.Values = values;
      this.RemainingInput = rest;
      this.Success = true;
    }

    public ParserState(bool assumedFalse)
    {
      this.Success = false;
      this.Values = null;
      this.RemainingInput = null;
    }

    public override string ToString()
    {
      if (Success)
        //return "[" + String.Join("][", (string[])Values.ToArray(typeof(string))) + "]";
        return ShowTree(Values, "");
      else
        return "FAILED";
    }

    private static string ShowTree(ArrayList tree, string indent)
    {
      System.Text.StringBuilder result = new System.Text.StringBuilder();

      result.Append(indent + "[\n");

      foreach (object o in tree)
      {
        if (o is ArrayList)
          result.Append(ShowTree((ArrayList) o, indent + "  "));
        else
          result.Append(indent + "'" + o + "'\n");
      }

      result.Append(indent + "]\n");
      
      return result.ToString();
    }
  }

  public abstract class Parser
  {
    public abstract ParserState Parse(StreamNode input);

    // for eta-conversion
    public virtual Parser GetParser()
    {
      return this;
    }

    public void Trying(Parser p)
    {
      //Console.WriteLine("trying " + p + "...");
    }

    public ParserState Success(ArrayList values, StreamNode rest)
    {
      ParserState success = new ParserState(values, rest);
      //Console.WriteLine("SUCCESS: " + success.ToString());
      return success;
    }

    public ParserState Failure()
    {
      //Console.WriteLine(this + ": FAILURE");
      return new ParserState(false);
    }

    private string name = null;
    public string Name
    {
      get { return name; }
      set { name = value; }
    }

    public override string ToString()
    {
      if (Name != null)
        return Name;
      else
        return GetType().ToString();
    }
  }

  class Nothing : Parser
  {
    public override ParserState Parse(StreamNode input)
    {
      return Success(new ArrayList(), input);
    }

    public override string ToString()
    {
      return "Nothing";
    }
  }

  class EndOfInput : Parser
  {
    public override ParserState Parse(StreamNode input)
    {
      if (input == null || input.Head == null)
        return Success(new ArrayList(), input);
      else
        return Failure();
    }

    public override string ToString()
    {
      return "EndOfInput";
    }
  }

  class LookFor : Parser
  {
    Tokens lookfor;
    string s;

    public LookFor(Tokens lookfor, string s)
    {
      this.lookfor = lookfor;
      this.s = s;
    }

    public LookFor(Tokens lookfor) : this(lookfor, null) {}

    public override string ToString()
    {
      return "LookFor(" + lookfor + ",'" + s + "')";
    }

    public override ParserState Parse(StreamNode input)
    {
      if (input == null)
        return Failure();

      Token t = (Token)input.Head;

      if (t.label != lookfor)
        return Failure();

      if (s != null && t.s != s)
        return Failure();

      ArrayList values = new ArrayList();
      values.Add(t.s);

      return Success(values, input.Tail);
    }
  }

  class Concatenate : Parser
  {
    private Parser[] p;

    public Concatenate(params Parser[] p)
    {
      if (p.Length == 0)
        this.p = new Nothing[1];
      else
        this.p = p;
    }

    public override string ToString()
    {
      if (Name == null)
      {
        string[] parsers = new string[p.Length];
        for (int i = 0; i < p.Length; i++)
          parsers[i] = p[i].ToString();

        return String.Join(" ", parsers);
      }
      else
        return base.ToString();
    }

    public override ParserState Parse(StreamNode input)
    {
      if (p.Length == 1)
        return p[0].Parse(input);

      ArrayList values = new ArrayList();

      foreach (Parser parser in p)
      {
        Trying(parser);
        ParserState state = parser.Parse(input);
        if (!state.Success)
          return Failure();

        if (state.Values.Count <= 1)
          values.AddRange(state.Values);
        else
          values.Add(state.Values);

        input = state.RemainingInput;
      }

      return Success(values, input);
    }
  }

  class Alternate : Parser
  {
    private Parser[] p;

    public Alternate(params Parser[] p)
    {
      if (p.Length == 0)
        this.p = null;
      else
        this.p = p;
    }

    public override ParserState Parse(StreamNode input)
    {
      if (p == null)
        return Failure();

      foreach (Parser parser in p)
      {
        ParserState state = parser.Parse(input);
        if (state.Success)
          return state;
      }

      return Failure();
    }
  }

  class Star : Parser
  {
    protected Parser parser;

    public Star(Parser p)
    {
      ParserStub stub = new ParserStub(new StubToReal(GetParser));

      parser = new Alternate(new Concatenate(p, stub), new Nothing());
    }

    public override ParserState Parse(StreamNode input)
    {
      ParserState state = parser.Parse(input);
      if (state.Success)
        return state;
      else
        return Failure();
    }

    public override Parser GetParser()
    {
      return parser;
    }
  }

  public delegate Parser StubToReal();

  // eta-converted parser
  public class ParserStub : Parser
  {
    StubToReal p;

    public ParserStub(StubToReal p)
    {
      this.p = p;
    }

    public override ParserState Parse(StreamNode input)
    {
      return p().Parse(input);
    }
  }
  
  public delegate ArrayList ValuesTransform(ArrayList values);
  public class T : Parser
  {
    private Parser p;
    private ValuesTransform map;

    public T(Parser p, ValuesTransform map)
    {
      this.p = p;
      this.map = map;
    }

    public override ParserState Parse(StreamNode input)
    {
      ParserState state = p.Parse(input);

      if (state.Success)
        return Success(map(state.Values), state.RemainingInput);
      else
        return Failure();
    }
  }
}

Token.cs

using System;
using System.Collections;
using System.Text;

namespace ExpressionParser
{
  delegate object TokenMaker(string s);

  struct Token
  {
    public readonly Tokens label;
    public readonly string s;
    public readonly TokenMaker create;

    public Token(Tokens label, string s)
    {
      this.label = label;
      this.s = s;
      this.create = null;
    }

    public Token(Tokens label, string s, TokenMaker create) : this(label, s)
    {
      this.create = create;
    }

    public override string ToString()
    {
      return "'" + s + "' (" + label + ")";
    }
  };
}

StreamNode.cs

using System;

namespace ExpressionParser
{
  public delegate StreamNode Promise();

  public class StreamNode
  {
    private object head;
    private object tail;

    public StreamNode(object head, Promise tail)
    {
      this.head = head;
      this.tail = tail;
    }

    public object Head
    {
      get { return head; }
    }

    public StreamNode Tail
    {
      get
      {
        if (tail is Promise)
          tail = ((Promise) tail)();

        return (StreamNode) tail;
      }
    }
  }
}

SimpleInput.cs

using System;
using System.Collections;

namespace ExpressionParser
{
  class SimpleInput : IEnumerator
  {
    private readonly string orig;
    private string val;
    private bool done;

    public SimpleInput(string line)
    {
      this.orig = line;
      this.val = null;
      this.done = false;
    }

    public void Reset() { done = false; }

    public object Current
    {
      get { return val; }
    }

    public bool MoveNext()
    {
      if (!done && val == null)
      {
        val = orig;
        return true;
      }
      else
      {
        done = true;
        val = null;
        return false;
      }
    }
  }
}

No comments: