Friday, February 19, 2010

Reading variable-length lines from a text file in C

Suppose for each line of some text file you want to read the entire line into a buffer and do some processing on it.

A common approach is to choose a fixed maximum length and hope for the best with fgets, but such a program breaks if any line's length is greater than this arbitrary limit.

The program below handles all the edge cases concomitant with fgets:

#include <errno.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* argv[0], potentially used in error messages */
const char *progname;

/*
 * Callback invoked once per line of input. s is the NUL-terminated text
 * of the line with its trailing '\n' already removed.
 */
typedef void (*processor)(const char *s);

/*
 * Opens the file at path and calls p on every line it contains.
 * Returns nonzero on success and 0 on failure; failures are reported
 * on stderr, prefixed with progname.
 */
int for_each_line(const char *path, processor p);

/* simple processor that prints each line to the standard output */
/*
 * Line processor used by main: writes s to the standard output and
 * terminates it with a single newline.
 */
void print(const char *s)
{
  puts(s);
}

/*
 * Entry point. Expects exactly one command-line argument, the path of
 * the file to read, and prints each of its lines to the standard output.
 *
 * Exit status is 0 when for_each_line reports success and 1 otherwise,
 * including when the argument count is wrong (a usage message is then
 * written to stderr).
 */
int main(int argc, char **argv)
{
  int ok;

  /* Recorded first so that every later diagnostic can name the program. */
  progname = argv[0];

  if (argc == 2) {
    ok = for_each_line(argv[1], print);
    if (ok)
      return 0;
    return 1;
  }

  fprintf(stderr, "Usage: %s file\n", progname);
  return 1;
}

/*
 * Reads the text file at path and calls p once for every line, passing
 * the line as a NUL-terminated string with the trailing '\n' removed.
 * A final line that is not newline-terminated is still delivered, as
 * long as it is non-empty.
 *
 * The string handed to p lives in a buffer that is reused for the next
 * line and freed before returning, so p must copy anything it wants to
 * keep.
 *
 * Returns 1 on success. Returns 0 if the file cannot be opened, memory
 * cannot be allocated, or a read error occurs; in each case a message
 * prefixed with progname is written to stderr. Lines already seen
 * before the failure have already been passed to p.
 */
int for_each_line(const char *path, processor p)
{
  FILE *f;
  char *line;
  size_t capacity = 80;  /* initial guess; doubled when a line outgrows it */
  size_t len = 0;        /* bytes of the current line already held in line */
  int read_errno = 0;    /* errno captured right after fgets returned NULL */
  int success = 0;

  f = fopen(path, "r");
  if (!f) {
    fprintf(stderr, "%s: open %s: %s\n",
                    progname, path, strerror(errno));
    return 0;
  }

  line = malloc(capacity);
  if (!line) {
    fprintf(stderr, "%s: malloc: %s\n", progname, strerror(errno));
    fclose(f);
    return 0;
  }
  line[0] = '\0';

  /*
   * Each iteration appends the next piece of the current line at
   * line + len. fgets stops after a '\n', so a complete line is
   * recognised by its last character. Without a '\n' there are two
   * possibilities: the buffer is full and must grow before reading
   * more, or the read was cut short (end of file, or the INT_MAX clamp
   * below) and the next fgets call sorts it out.
   */
  for (;;) {
    size_t room = capacity - len;
    /* fgets takes an int size; clamp rather than convert out of range. */
    int chunk = room > (size_t)INT_MAX ? INT_MAX : (int)room;

    if (!fgets(line + len, chunk, f)) {
      read_errno = errno;  /* save before any other call can clobber it */
      break;
    }

    len += strlen(line + len);

    if (len > 0 && line[len - 1] == '\n') {
      line[len - 1] = '\0';
      p(line);
      len = 0;
      line[0] = '\0';
    }
    else if (capacity - len <= 1) {
      /* Only the terminator fits: double the buffer and keep reading. */
      char *bigger;

      if (capacity > (size_t)-1 / 2) {
        fprintf(stderr, "%s: %s: line too long\n", progname, path);
        goto done;
      }

      /* Keep line valid until realloc is known to have succeeded. */
      bigger = realloc(line, capacity * 2);
      if (!bigger) {
        fprintf(stderr, "%s: realloc: %s\n", progname, strerror(errno));
        goto done;
      }

      line = bigger;
      capacity *= 2;
    }
  }

  /*
   * fgets returns NULL for both end-of-file and read errors. ferror is
   * the reliable discriminator; errno alone may be stale or may have
   * been set by an unrelated call (for instance inside p).
   */
  if (ferror(f)) {
    fprintf(stderr, "%s: fgets: %s\n", progname, strerror(read_errno));
  }
  else {
    /* Deliver a last line that was not terminated by '\n'. */
    if (len > 0)
      p(line);
    success = 1;
  }

done:
  free(line);
  fclose(f);

  return success;
}

No comments: