File: transform.c

package info (click to toggle)
mailfromd 9.1-1
links: PTS, VCS
area: main
in suites: forky, sid
size: 11,512 kB
sloc: ansic: 56,882; sh: 22,979; yacc: 4,130; lex: 1,428; makefile: 928; lisp: 488; awk: 393; perl: 319; sed: 25
file content (880 lines) | stat: -rw-r--r-- 18,560 bytes
/* This file is part of Mailfromd.
   Copyright (C) 2006-2025 Sergey Poznyakoff.
   (using my implementation for the GNU tar and rush).

   Mailfromd is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   Mailfromd is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with Mailfromd.  If not, see <http://www.gnu.org/licenses/>. */

#ifdef HAVE_CONFIG_H
# include <config.h>
#endif
#include <stdlib.h>
#include <regex.h>
#include <mailutils/alloc.h>
#include <mailutils/cctype.h>
#include <libmf.h>

/* State of a transform and the way its replacement is repeated. */
enum transform_type
  {
    /* Initial state assigned by new_transform; transform_free does not
       call regfree on a transform in this state */
    transform_incomplete,
    /* run_single_transform stops after the first replacement */
    transform_first,
    /* run_single_transform keeps matching until the input is exhausted */
    transform_global
  };

/* Kind of a segment in a compiled replacement expression.  Selects the
   active member of the union in struct replace_segm. */
enum replace_segm_type
  {
    segm_literal,   /* Literal segment */
    segm_backref,   /* Back-reference segment */
    segm_case_ctl   /* Case control segment (GNU extension) */
  };

/* Case conversion modes that a replacement expression can request
   (via the \E, \u, \l, \U and \L escapes, respectively). */
enum case_ctl_type
  {
    ctl_stop,       /* Stop case conversion */
    ctl_upcase_next,/* Turn the next character to uppercase */
    ctl_locase_next,/* Turn the next character to lowercase */
    ctl_upcase,     /* Turn the replacement to uppercase until ctl_stop */
    ctl_locase      /* Turn the replacement to lowercase until ctl_stop */
  };

/* One element of a compiled replacement expression.  Segments form a
   singly-linked list anchored in struct transform. */
struct replace_segm
{
  /* Next segment in the list, or NULL for the last one */
  struct replace_segm *next;
  /* Discriminator for the union below */
  enum replace_segm_type type;
  union
  {
    struct
    {
      /* Heap-allocated, null-terminated copy of the literal text;
	 released by replace_segm_free */
      char *ptr;
      /* Length of the text, not counting the terminating null */
      size_t size;
    } literal;                /* type == segm_literal */
    size_t ref;               /* type == segm_backref */
    enum case_ctl_type ctl;   /* type == segm_case_ctl */
  } v;
};

/* A single compiled substitution expression.  Transforms are chained
   into a singly-linked list and applied in list order. */
struct transform
{
  /* Next transform in the list, or NULL */
  struct transform *next;
  enum transform_type transform_type;
  /* Value of the numeric flag.  If non-zero, run_single_transform
     copies matches preceding this ordinal number through unchanged */
  unsigned match_number;
  /* Compiled regular expression */
  regex_t regex;
  /* Compiled replacement expression */
  struct replace_segm *repl_head, *repl_tail;
  size_t segm_count; /* Number of elements in the above list */
};

/* Head and tail of a list of transforms under construction.  Both
   members are NULL while the list is empty. */
struct transform_list
{
  struct transform *head, *tail;
};

/* Description of the most recent transform compilation error. */
struct transform_error
{
  const char *diag;  /* Diagnostic message; NULL if no error is pending */
  int pos;           /* Offset the diagnostic refers to */
  const char *arg;   /* Text the diagnostic refers to, or NULL */
  void *mem;         /* Heap memory owned by this structure, or NULL;
			released by init_transform_error */
};

struct transform_error last_transform_error;

/* Discard the pending error, if any, and release the memory it owns. */
static void
init_transform_error (void)
{
  /* free (NULL) is a no-op, so no guard is needed */
  free (last_transform_error.mem);
  memset (&last_transform_error, 0, sizeof (last_transform_error));
}

/*
 * Return a textual description of the pending transform error, or NULL
 * if there is none.  The pending error is cleared by the call.
 *
 * The returned string lives in a static buffer which is overwritten by
 * the next call.  Over-long messages are truncated; the result is always
 * null-terminated.
 */
const char *
transform_error_string (void)
{
  static char transform_error_buf[1024];

  if (!last_transform_error.diag)
    return NULL;
  if (last_transform_error.arg)
    snprintf (transform_error_buf, sizeof (transform_error_buf),
	      "%s, in \"%s\" pos. %d",
	      last_transform_error.diag,
	      last_transform_error.arg,
	      last_transform_error.pos);
  else
    /* snprintf, unlike strncpy, terminates the buffer on truncation */
    snprintf (transform_error_buf, sizeof (transform_error_buf),
	      "%s", last_transform_error.diag);
  init_transform_error ();
  return transform_error_buf;
}


/* Allocate a transform with mu_zalloc, mark it as incomplete and link
   it to the end of TLIST.  Return the new element. */
static struct transform *
new_transform (struct transform_list *tlist)
{
  struct transform *xf = mu_zalloc (sizeof *xf);

  xf->transform_type = transform_incomplete;
  if (tlist->tail == NULL)
    tlist->head = xf;
  else
    tlist->tail->next = xf;
  tlist->tail = xf;
  return xf;
}

/* Allocate a replacement segment, link it to the end of TF's segment
   list and bump the segment count.  Only the link field is initialized
   here; filling in the type and payload is up to the caller. */
static struct replace_segm *
add_segment (struct transform *tf)
{
  struct replace_segm *node = mu_alloc (sizeof *node);

  node->next = NULL;
  if (tf->repl_tail == NULL)
    tf->repl_head = node;
  else
    tf->repl_tail->next = node;
  tf->repl_tail = node;
  tf->segm_count++;
  return node;
}

/* Append to TF a literal segment holding a private, null-terminated
   copy of the characters in [STR, END).  An empty range adds nothing. */
static void
add_literal_segment (struct transform *tf, char *str, char *end)
{
  struct replace_segm *segm;
  char *copy;
  size_t nbytes = end - str;

  if (nbytes == 0)
    return;

  copy = mu_alloc (nbytes + 1);
  memcpy (copy, str, nbytes);
  copy[nbytes] = 0;

  segm = add_segment (tf);
  segm->type = segm_literal;
  segm->v.literal.size = nbytes;
  segm->v.literal.ptr = copy;
}

/* Append to TF a literal segment consisting of the single character
   CHR. */
static void
add_char_segment (struct transform *tf, int chr)
{
  char *text = mu_alloc (2);
  struct replace_segm *segm;

  text[0] = chr;
  text[1] = 0;

  segm = add_segment (tf);
  segm->type = segm_literal;
  segm->v.literal.size = 1;
  segm->v.literal.ptr = text;
}

/* Append to TF a segment that expands to the text matched by
   parenthesized subexpression number REF (0 stands for the entire
   match). */
static void
add_backref_segment (struct transform *tf, size_t ref)
{
  struct replace_segm *backref = add_segment (tf);

  backref->v.ref = ref;
  backref->type = segm_backref;
}

/* Append to TF a segment that switches the case conversion mode to
   CTL. */
static void
add_case_ctl_segment (struct transform *tf, enum case_ctl_type ctl)
{
  struct replace_segm *control = add_segment (tf);

  control->v.ctl = ctl;
  control->type = segm_case_ctl;
}

/* Free the list of replacement segments starting at SEGM, together
   with the text owned by its literal segments.  SEGM may be NULL. */
void
replace_segm_free (struct replace_segm *segm)
{
  struct replace_segm *following;

  for (; segm != NULL; segm = following)
    {
      /* Remember the link before the node goes away */
      following = segm->next;
      /* Only literal segments own extra storage */
      if (segm->type == segm_literal)
	free (segm->v.literal.ptr);
      free (segm);
    }
}

/* Free the list of transforms starting at XFORM: the compiled regular
   expressions, the replacement segments and the list nodes themselves.
   XFORM may be NULL. */
void
transform_free(struct transform *xform)
{
  struct transform *following;

  for (; xform != NULL; xform = following)
    {
      following = xform->next;
      /* Incomplete transforms are not passed to regfree */
      if (xform->transform_type != transform_incomplete)
	regfree (&xform->regex);
      replace_segm_free (xform->repl_head);
      free (xform);
    }
}

/*
 * EXPR[I] is a `[' found inside a bracket expression.  If it opens a
 * `[:name:]' construct, return the index of the character following
 * the closing `:]'.  If the construct is not terminated, return the
 * index of the terminating null.  If the bracket is not followed by a
 * colon, return I + 1.
 *
 * A backslash within the construct protects the character after it.
 */
static int
skip_named_class (char const *expr, int i)
{
  int pos = i + 1;

  if (expr[pos] != ':')
    return pos;

  for (;;)
    {
      /* Advance to the next unprotected colon */
      pos++;
      while (expr[pos] != 0 && expr[pos] != ':')
	{
	  if (expr[pos] == '\\')
	    {
	      pos++;
	      if (expr[pos] == 0)
		return pos;
	    }
	  pos++;
	}
      if (expr[pos] == 0)
	return pos;

      /* A colon closes the construct only when followed by `]' */
      pos++;
      if (expr[pos] == ']')
	return pos + 1;
      if (expr[pos] == 0)
	return pos;
    }
}

/*
 * EXPR[I] is the `[' that opens a bracket expression.  Return the index
 * of the matching `]', or of the terminating null if the expression is
 * not closed.
 *
 * A `]' that comes first in the list, optionally after a negating `^',
 * is an ordinary member and does not close the expression.  Named
 * classes are skipped as a whole; a backslash protects the character
 * after it.
 */
static int
skip_class (char const *expr, int i)
{
  i++;
  /* Step over the negation mark, so that `[^]...]' is handled the same
     way as `[]...]' */
  if (expr[i] == '^')
    i++;
  if (expr[i] == ']')
    i++;
  while (expr[i] && expr[i] != ']')
    {
      switch (expr[i])
	{
	case '[':
	  i = skip_named_class (expr, i);
	  break;

	case '\\':
	  if (expr[i+1])
	    i++;
	  /* FALLTHROUGH */
	default:
	  i++;
	}
    }
  return i;
}

/* Return the character denoted by the escape sequence `\C' in a
   replacement expression.  Characters that have no special meaning
   (among them `\' and `&') stand for themselves. */
static int
replacement_escape_char (int c)
{
  switch (c)
    {
    case 'a':
      return '\a';
    case 'b':
      return '\b';
    case 'f':
      return '\f';
    case 'n':
      return '\n';
    case 'r':
      return '\r';
    case 't':
      return '\t';
    case 'v':
      return '\v';
    default:
      return c;
    }
}

/*
 * Parse one substitution expression of the form
 *
 *     s<D>REGEX<D>REPLACEMENT<D>[FLAGS][;]
 *
 * found at the start of EXPR, <D> being an arbitrary delimiter
 * character.  The compiled result is stored in a new transform appended
 * to TLIST.  On success, *ENDP is set to point past the expression and
 * the optional `;' terminator.
 *
 * CFLAGS supplies the initial regcomp flags.  Recognized FLAGS are:
 *
 *   g    replace all matches, not only the first one;
 *   i    ignore case (REG_ICASE);
 *   x    use extended regular expressions (REG_EXTENDED);
 *   NUM  store NUM in the match_number member of the transform.
 *
 * In REPLACEMENT, `&' and `\0' refer to the entire match, `\N' to the
 * Nth parenthesized subexpression, `\L', `\l', `\U', `\u' and `\E'
 * control case conversion, and `\a', `\b', `\f', `\n', `\r', `\t', `\v'
 * denote the corresponding control characters.  Any other character
 * following a backslash stands for itself.
 *
 * Return 0 on success.  On error, return 1, leave *ENDP untouched and
 * describe the problem in last_transform_error.  The new transform
 * stays linked to TLIST in both cases.
 */
static int
parse_transform_expr (struct transform_list *tlist, const char *expr,
		      int cflags, const char **endp)
{
  int delim;
  int i, j, rc;
  size_t len;
  char *str, *beg, *cur;
  const char *p;
  struct transform *tf = new_transform (tlist);
  enum transform_type transform_type;

  if (expr[0] != 's')
    {
      last_transform_error.diag = _("invalid transform expression");
      last_transform_error.pos = 0;
      last_transform_error.arg = expr;
      return 1;
    }

  delim = expr[1];
  if (delim == 0)
    {
      /* Nothing follows the `s'.  Scanning from expr[2] would read past
	 the end of the string. */
      last_transform_error.diag = _("invalid transform expression");
      last_transform_error.pos = 1;
      last_transform_error.arg = expr;
      return 1;
    }

  /* Scan regular expression */
  for (i = 2; expr[i]; )
    {
      if (expr[i] == '\\')
	{
	  /* Skip the escaped character, unless the backslash is the
	     last character of the string */
	  i += (expr[i + 1] == 0) ? 1 : 2;
	}
      else if (expr[i] == '[')
	{
	  i = skip_class (expr, i);
	}
      else if (expr[i] == delim)
	break;
      else
	i++;
    }

  if (expr[i] != delim)
    {
      last_transform_error.diag = _("missing replacement expression");
      last_transform_error.pos = i;
      last_transform_error.arg = expr;
      return 1;
    }

  /* Scan replacement expression */
  for (j = i + 1; expr[j] && expr[j] != delim; j++)
    if (expr[j] == '\\' && expr[j+1])
      j++;

  if (expr[j] != delim)
    {
      last_transform_error.diag = _("missing trailing delimiter");
      last_transform_error.pos = j;
      last_transform_error.arg = expr;
      return 1;
    }

  /* Check flags */
  transform_type = transform_first;
  for (p = expr + j + 1; *p && *p != ';'; p++)
    switch (*p)
      {
      case 'g':
	transform_type = transform_global;
	break;

      case 'i':
	cflags |= REG_ICASE;
	break;

      case 'x':
	cflags |= REG_EXTENDED;
	break;

      case '0': case '1': case '2': case '3': case '4':
      case '5': case '6': case '7': case '8': case '9':
	{
	  char *end;

	  tf->match_number = strtoul (p, &end, 0);
	  /* The loop increment will step onto the character that
	     follows the number */
	  p = end - 1;
	}
	break;

      default:
	last_transform_error.diag = _("unknown flag");
	last_transform_error.pos = p - expr;
	last_transform_error.arg = expr;
	return 1;
    }

  if (*p == ';')
    p++;

  /* Extract and compile regex */
  len = i - 2;
  str = mu_alloc (len + 1);
  memcpy (str, expr + 2, len);
  str[len] = 0;

  rc = regcomp (&tf->regex, str, cflags);
  if (rc)
    {
      char errbuf[512];
      regerror (rc, &tf->regex, errbuf, sizeof (errbuf));
      last_transform_error.diag = _("invalid transform expression");
      last_transform_error.pos = 0;
      last_transform_error.mem = mu_strdup(errbuf);
      last_transform_error.arg = last_transform_error.mem;
      free (str);
      /* The transform is left incomplete, so that transform_free will
	 not hand the unusable regex to regfree */
      return 1;
    }

  /* The regex is valid from now on and must eventually be freed */
  tf->transform_type = transform_type;

  /* An expression anchored at either end is applied only once.  The
     length check keeps an empty regex from indexing before the start
     of the buffer. */
  if (len > 0 && (str[0] == '^' || str[len - 1] == '$'))
    tf->transform_type = transform_first;

  free (str);

  /* Extract and compile replacement expr */
  i++;
  len = j - i;
  str = mu_alloc (len + 1);
  memcpy (str, expr + i, len);
  str[len] = 0;

  for (cur = beg = str; *cur;)
    {
      if (*cur == '\\')
	{
	  size_t n;

	  /* Flush the literal text collected so far */
	  add_literal_segment (tf, beg, cur);
	  switch (*++cur)
	    {
	    case '0': case '1': case '2': case '3': case '4':
	    case '5': case '6': case '7': case '8': case '9':
	      n = strtoul (cur, &cur, 10);
	      if (n > tf->regex.re_nsub)
		{
		  last_transform_error.diag = _("back reference out of range");
		  last_transform_error.pos = cur - str;
		  /* Ownership of STR passes to the error descriptor,
		     which frees it when the error is cleared */
		  last_transform_error.mem = str;
		  last_transform_error.arg = last_transform_error.mem;
		  return 1;
		}
	      add_backref_segment (tf, n);
	      break;

	    case 'L':
	      /* Turn the replacement to lowercase until a `\U' or `\E'
		 is found. */
	      add_case_ctl_segment (tf, ctl_locase);
	      cur++;
	      break;

	    case 'l':
	      /* Turn the next character to lowercase. */
	      add_case_ctl_segment (tf, ctl_locase_next);
	      cur++;
	      break;

	    case 'U':
	      /* Turn the replacement to uppercase until a `\L' or `\E'
		 is found. */
	      add_case_ctl_segment (tf, ctl_upcase);
	      cur++;
	      break;

	    case 'u':
	      /* Turn the next character to uppercase. */
	      add_case_ctl_segment (tf, ctl_upcase_next);
	      cur++;
	      break;

	    case 'E':
	      /* Stop case conversion started by `\L' or `\U'. */
	      add_case_ctl_segment (tf, ctl_stop);
	      cur++;
	      break;

	    case 0:
	      /* A lone backslash at the end of the replacement.  The
		 scan above does not let this happen; should it happen
		 anyway, keep the backslash and do not step past the
		 terminating null. */
	      add_char_segment (tf, '\\');
	      break;

	    default:
	      add_char_segment (tf, replacement_escape_char (*cur));
	      cur++;
	      break;
	    }
	  beg = cur;
	}
      else if (*cur == '&')
	{
	  add_literal_segment (tf, beg, cur);
	  add_backref_segment (tf, 0);
	  beg = ++cur;
	}
      else
	cur++;
    }
  add_literal_segment (tf, beg, cur);

  /* All segments hold private copies of their text */
  free (str);

  *endp = p;
  return 0;
}

/*
 * Compile EXPR, a sequence of substitution expressions, into a list of
 * transforms.  CFLAGS are the initial regcomp flags for each of them.
 *
 * Return the head of the list.  If some expression fails to parse, free
 * everything compiled so far and return NULL; the reason can be
 * obtained using transform_error_string.  NULL is also returned if EXPR
 * is an empty string.
 */
transform_t
transform_compile (const char *expr, int cflags)
{
  struct transform_list tlist = { NULL, NULL };
  const char *rest = expr;

  init_transform_error ();

  for (;;)
    {
      if (*rest == 0)
	return tlist.head;
      /* On success the parser advances REST past what it consumed */
      if (parse_transform_expr (&tlist, rest, cflags, &rest) != 0)
	{
	  transform_free (tlist.head);
	  return NULL;
	}
    }
}

/*
 * Append the transform list SRC to the end of the list DST and return
 * the head of the combined list.  If DST is NULL (an empty list), SRC
 * is returned as is.
 */
transform_t
transform_join (transform_t dst, transform_t src)
{
  struct transform *tail;

  /* Nothing to append to; the search for the tail below would
     dereference a null pointer */
  if (!dst)
    return src;

  for (tail = dst; tail->next; tail = tail->next)
    ;
  tail->next = src;
  return dst;
}

/*
 * String buffer interface.
 *
 * A string buffer is an expandable storage area optimized for
 * sequential appends.
 */
/* Growable character storage.  The bytes in use are
   base[0] .. base[length-1]; they are not necessarily terminated by a
   null byte. */
struct string_buffer
{
  char *base;     /* Storage base pointer */
  size_t size;    /* Total size allocated for base */
  size_t length;  /* Length of the used portion of base */
};

/*
 * Set up BUF as an empty string buffer backed by SIZE bytes of
 * freshly allocated storage.
 */
static void
string_buffer_init (struct string_buffer *buf, size_t size)
{
  buf->length = 0;
  buf->size = size;
  buf->base = mu_alloc (size);
}

/*
 * Discard the contents of BUF.  The allocated storage is retained
 * for reuse.
 */
static inline void
string_buffer_reset (struct string_buffer *buf)
{
  buf->length = 0;
}

/* Release the storage owned by BUF.  The structure itself is left
   untouched. */
static void
string_buffer_free (struct string_buffer *buf)
{
  char *storage = buf->base;

  free (storage);
}

/*
 * Add LEN bytes starting at TEXT to the end of BUF, enlarging the
 * storage as necessary.
 */
static void
string_buffer_append (struct string_buffer *buf, char const *text, size_t len)
{
  size_t needed = buf->length + len;

  /* Let mu_2nrealloc enlarge the storage until the new data fit */
  while (needed > buf->size)
    buf->base = mu_2nrealloc (buf->base, &buf->size, 1);

  memcpy (buf->base + buf->length, text, len);
  buf->length = needed;
}

/*
 * String I/O interface.
 *
 * A string_io structure holds two string buffers, which cyclically
 * change their roles.  The buffer serving as input in one round, serves
 * as output in the next one.
 */
/* A pair of string buffers.  Member `in' tells which of the two
   currently serves as input; the other one is the output. */
struct string_io
{
  struct string_buffer buf[2];   /* String buffers */
  int in;  /* Index of the buffer serving for input */
};

/*
 * Prepare BP for use.  Both buffers are created with room for SIZE + 1
 * bytes.  Buffer 0 becomes the input one and is loaded with SIZE bytes
 * from INPUT followed by a terminating null byte.
 */
static inline void
string_io_init (struct string_io *bp, char const *input, size_t size)
{
  size_t i;

  for (i = 0; i < 2; i++)
    string_buffer_init (&bp->buf[i], size + 1);
  bp->in = 0;
  string_buffer_append (&bp->buf[0], input, size);
  string_buffer_append (&bp->buf[0], "", 1);
}

/* Release the storage of both buffers in BP. */
static inline void
string_io_free (struct string_io *bp)
{
  size_t i;

  for (i = 0; i < 2; i++)
    string_buffer_free (&bp->buf[i]);
}

/* Return the buffer of BP that currently serves as input. */
static inline struct string_buffer *
string_io_input (struct string_io *bp)
{
  int idx = bp->in;

  return bp->buf + idx;
}

/* Return the buffer of BP that currently serves as output, i.e. the one
   that is not the input. */
static inline struct string_buffer *
string_io_output (struct string_io *bp)
{
  int idx = !bp->in;

  return bp->buf + idx;
}

/* Return the pointer to the string stored in the input part of the
   string_io structure. */
static inline char *
string_io_input_text (struct string_io *bp)
{
  return string_io_input (bp)->base;
}

/*
 * Exchange the roles of the two buffers in BP: what has been written to
 * the output buffer becomes the new input, whereas the former input
 * buffer is emptied and becomes the new output.
 */
static inline void
string_io_swap (struct string_io *bp)
{
  struct string_buffer *new_output;

  bp->in = ! bp->in;
  new_output = string_io_output (bp);
  string_buffer_reset (new_output);
}

/*
 * Add LEN bytes starting at TEXT to the end of the output buffer
 * of BP.
 */
static inline void
string_io_append (struct string_io *bp, char const *text, size_t len)
{
  struct string_buffer *ob = string_io_output (bp);

  string_buffer_append (ob, text, len);
}

/*
 * Case control state.
 *
 * Keeps the case conversion mode currently in effect together with the
 * mode to return to once a one-shot conversion (ctl_upcase_next or
 * ctl_locase_next) has been carried out.
 */
struct case_ctl_state
{
  /* Mode currently in effect */
  enum case_ctl_type current;
  /* Mode that case_ctl_state_reset restores */
  enum case_ctl_type saved;
};

/* Initial state: no conversion in effect, nothing saved */
#define CASE_CTL_STATE_INITIALIZER { ctl_stop, ctl_stop }

static inline enum case_ctl_type
case_ctl_state (struct case_ctl_state *st)
{
  return st->current;
}

static inline void
case_ctl_state_reset (struct case_ctl_state *st)
{
  st->current = st->saved;
  st->saved = ctl_stop;
}

/*
 * Make TYPE the current case conversion mode of ST.
 *
 * When TYPE is a one-shot mode (ctl_upcase_next or ctl_locase_next),
 * the mode that was in effect so far is first stored in the saved slot,
 * unless that slot already holds a one-shot mode.  Values of TYPE that
 * are not members of enum case_ctl_type are ignored.
 */
static inline void
case_ctl_state_set (struct case_ctl_state *st, enum case_ctl_type type)
{
  int one_shot = (type == ctl_upcase_next || type == ctl_locase_next);

  if (!one_shot
      && type != ctl_upcase && type != ctl_locase && type != ctl_stop)
    return;

  if (one_shot
      && (st->saved == ctl_stop
	  || st->saved == ctl_upcase
	  || st->saved == ctl_locase))
    st->saved = st->current;

  st->current = type;
}

/*
 * Append SIZE characters starting at PTR to the output part of IOB,
 * converting their case as requested by STATE.
 *
 * In the ctl_upcase and ctl_locase modes all appended characters are
 * converted.  In the one-shot modes (ctl_upcase_next and
 * ctl_locase_next) only the first character is converted, whereupon
 * STATE is reset to its saved mode.  In the ctl_stop mode the text is
 * appended verbatim.
 *
 * An empty array is a no-op.  In particular, it does not use up a
 * pending one-shot mode, which thus stays in effect for the next
 * non-empty piece of text.
 */
static void
run_case_conv (struct string_io *iob,
	       struct case_ctl_state *state,
	       const char *ptr, size_t size)
{
  size_t i;
  char *p;
  struct string_buffer *ob = string_io_output (iob);
  size_t n = ob->length;

  /* With nothing appended there is no p[0] to convert: touching it
     would access memory past the data and, possibly, past the
     allocated storage. */
  if (size == 0)
    return;

  string_buffer_append (ob, ptr, size);
  /* The append may have moved the storage, so compute the pointer to
     the new data only now */
  p = ob->base + n;
  switch (case_ctl_state (state))
    {
    case ctl_upcase_next:
      p[0] = mu_toupper (p[0]);
      case_ctl_state_reset (state);
      break;

    case ctl_locase_next:
      p[0] = mu_tolower (p[0]);
      case_ctl_state_reset (state);
      break;

    case ctl_upcase:
      for (i = 0; i < size; i++)
	p[i] = mu_toupper (p[i]);
      break;

    case ctl_locase:
      for (i = 0; i < size; i++)
	p[i] = mu_tolower (p[i]);
      break;

    case ctl_stop:
      break;
    }
}

/*
 * Apply the transform TF to the text held in the input part of IOB and
 * store the null-terminated result in its output part.  Swapping the
 * buffers afterwards is up to the caller.
 *
 * Text that is not part of a match is copied through verbatim.  If
 * TF->match_number is set, matches preceding that ordinal number are
 * copied through as well.  Unless the transform is global, processing
 * stops after the first replacement and the rest of the input is copied
 * unchanged.
 *
 * A match that is empty and located at the current input position
 * consumes no input.  To guarantee progress, one input character is
 * copied through after such a match.
 */
void
run_single_transform (struct transform *tf, struct string_io *iob)
{
  char const *input;
  regmatch_t *rmp;
  int rc;
  size_t nmatches = 0;
  struct case_ctl_state state = CASE_CTL_STATE_INITIALIZER;

  input = string_io_input_text (iob);
  rmp = mu_alloc ((tf->regex.re_nsub + 1) * sizeof (*rmp));

  while (*input)
    {
      size_t disp;

      rc = regexec (&tf->regex, input, tf->regex.re_nsub + 1, rmp, 0);

      if (rc == 0)
	{
	  struct replace_segm *segm;

	  /* Amount of input consumed by this round: everything up to
	     the end of the match */
	  disp = rmp[0].rm_eo;

	  nmatches++;
	  if (tf->match_number && nmatches < tf->match_number)
	    {
	      /* Not the requested occurrence yet: copy it unchanged.
		 *input is not null here, so stepping over one character
		 after an empty match is safe. */
	      if (disp == 0)
		disp = 1;
	      string_io_append (iob, input, disp);
	      input += disp;
	      continue;
	    }

	  /* Copy the text preceding the match */
	  if (rmp[0].rm_so)
	    string_io_append (iob, input, rmp[0].rm_so);

	  for (segm = tf->repl_head; segm; segm = segm->next)
	    {
	      switch (segm->type)
		{
		case segm_literal:    /* Literal segment */
		  run_case_conv (iob,
				 &state,
				 segm->v.literal.ptr,
				 segm->v.literal.size);
		  break;

		case segm_backref:    /* Back-reference segment */
		  /* A subexpression that did not take part in the match
		     expands to nothing */
		  if (rmp[segm->v.ref].rm_so != -1
		      && rmp[segm->v.ref].rm_eo != -1)
		    {
		      size_t size = rmp[segm->v.ref].rm_eo
				      - rmp[segm->v.ref].rm_so;
		      run_case_conv (iob,
				     &state,
				     input + rmp[segm->v.ref].rm_so,
				     size);
		    }
		  break;

		case segm_case_ctl:
		  case_ctl_state_set (&state, segm->v.ctl);
		  break;
		}
	    }

	  if (disp == 0)
	    {
	      /* Empty match at the current position.  Without this, a
		 global transform would match at the same place over and
		 over again. */
	      string_io_append (iob, input, 1);
	      disp = 1;
	    }
	}
      else
	{
	  /* No (more) matches: copy the rest of the input */
	  disp = strlen (input);
	  string_io_append (iob, input, disp);
	}

      input += disp;

      if (tf->transform_type == transform_first)
	{
	  string_io_append (iob, input, strlen (input));
	  break;
	}
    }
  string_io_append (iob, "", 1);
  free (rmp);
}

/*
 * Apply the transforms from the list TF, in list order, to the string
 * INPUT.  Each transform operates on the result of the previous one.
 * If TF is NULL, the result is a copy of INPUT.
 *
 * Return the resulting string.  It is stored in allocated memory, which
 * the caller owns.
 */
char *
transform_string (transform_t tf, const char *input)
{
  struct string_io iob;

  string_io_init (&iob, input, strlen (input));
  while (tf)
    {
      run_single_transform (tf, &iob);
      /* Make the result the input of the next round */
      string_io_swap (&iob);
      tf = tf->next;
    }
  /* Only the input buffer, which holds the result, is kept */
  string_buffer_free (string_io_output (&iob));
  return string_io_input_text (&iob);
}


/*
 Local Variables:
 c-file-style: "gnu"
 End:
*/
/* EOF */