Description: Port from pcre3 to pcre2
Bug-Debian: https://bugs.debian.org/1000000
Author: Andreas Tille <tille@debian.org>
Last-Update: 2021-11-19

--- a/include/phast/stringsplus.h
+++ b/include/phast/stringsplus.h
@@ -24,7 +24,8 @@
 #ifndef STRINGSPLUS_H
 #define STRINGSPLUS_H
 
-#include <pcre.h>
+#define PCRE2_CODE_UNIT_WIDTH 8
+#include <pcre2.h>
 #include "phast/lists.h"
 #include "stdio.h"
 
@@ -56,9 +57,9 @@ typedef struct {
   int nchars;			/**< Number of bytes currently allocated */
 } String;
 
-/** PCRE is another name for Regex */
-typedef pcre Regex;
+/** Regex is kept as another name for a compiled PCRE2 pattern (pcre2_code) */
+typedef pcre2_code Regex;
 				
 /** \name String Allocate/Cleanup functions 
 \{ */
 
@@ -99,7 +100,7 @@ void str_clear(String *s);
 
 /** \} */
 
-/* void str_match(String *s, Regexp *r); */ /* tags? */
+/* void str_match(String *s, pcre2_code *r); */ /* tags? */
 
 /** \name String Append functions */
 
@@ -385,13 +386,13 @@ int str_split(String *s, const char* del
    expression syntax.
    @result Newly allocated and compiled Regex object.
  */
-Regex *str_re_new(const char *re_str);
+pcre2_code *str_re_new(const char *re_str);
 
 /** Free resources associated with regular expression object. 
     @param re Regex object to free
     @note The object itself is freed also. 
 */
-void str_re_free(Regex *re);
+void str_re_free(pcre2_code *re);
 
 /** Test whether the specified string matches the specified regex.
    @pre The list 'l' must be initialized externally if non-NULL.  
@@ -403,11 +404,11 @@ void str_re_free(Regex *re);
    on no match, and -2 on error.
    @note NULLs will be added for all non-matching groups in list 'l'
    @note In the list 'l', the 0th substring corresponds to the entire regex. 
-   @note This function uses the pcre_exec function of the PCRE
+   @note This function uses the pcre2_match function of the PCRE2
    regex package.
    @warning Substrings added to List l are newly allocated and must be
    freed externally. */
-int str_re_match(String *s, Regex *re, List *l, int nsubexp);
+int str_re_match(String *s, pcre2_code *re, List *l, int nsubexp);
 
 /** Search the specified string for the first instance of the specified
    regex.  
@@ -415,14 +416,14 @@ int str_re_match(String *s, Regex *re, L
    @param start_offset The first start_offset characters will be ignored.
    @param l (Optional) If non-NULL, it will be populated with substrings corresponding
    to subexpressions, as described under str_re_match.  
-   @note This function uses the pcre_exec function of the PCRE regex package.
+   @note This function uses the pcre2_match function of the PCRE2 regex package.
    @result Index of first match, -1 if no match exists, or -2 if an
    internal error occurs. 
    @warning Substrings added to List l are newly allocated and must be
    freed externally. 
    @see str_re_match
 */
-int str_re_search(String *s, Regex *re, int start_offset, List *l, 
+int str_re_search(String *s, pcre2_code *re, int start_offset, List *l, 
                   int nsubexp);
 
 /** \} */
--- a/src/lib/base/phast_stringsplus.c
+++ b/src/lib/base/phast_stringsplus.c
@@ -12,7 +12,8 @@
   
    $Id: stringsplus.c,v 1.12 2009-02-19 23:33:48 agd27 Exp $ */
 
-#include <pcre.h>
+#define PCRE2_CODE_UNIT_WIDTH 8
+#include <pcre2.h>
 #include "phast/stringsplus.h"
 #include "phast/misc.h"
 #include <stdlib.h>
@@ -462,47 +463,62 @@ int str_ends_with_charstr(String *s, con
   return (strncmp(&s->chars[s->length - len], substr, len) == 0);
 }
 
-Regex *str_re_new(const char *re_str) {
-  Regex *re;
-  const char *errstr;
-  int erroffset;
+pcre2_code *str_re_new(const char *re_str) {
+  pcre2_code *re;
+  int errorcode;
+  PCRE2_SIZE erroffset;
 
-  re = pcre_compile(re_str, 0, &errstr, &erroffset, NULL);
+  re = pcre2_compile((PCRE2_SPTR)re_str, PCRE2_ZERO_TERMINATED, 0, &errorcode, &erroffset, NULL);
   if (re == NULL) {
-    die("ERROR: cannot compile regular expression '%s' (%d): %s\n",
-	re_str, erroffset, errstr);
+    PCRE2_UCHAR errstr[256];
+    /* PCRE2 reports a numeric code; turn it into readable text */
+    pcre2_get_error_message(errorcode, errstr, sizeof(errstr));
+    die("ERROR: cannot compile regular expression '%s' (%lu): %s\n",
+	re_str, (unsigned long)erroffset, (char *)errstr);
   }
   return re;
 }
 
 
-//NOTE Regex are allocated by pcre; do not use sfree
-void str_re_free(Regex *re) {
+//NOTE Regex are allocated by pcre2; release with pcre2_code_free, not free/sfree
+void str_re_free(pcre2_code *re) {
   if (re != NULL)
-    free(re);
+    pcre2_code_free(re);
 }
 
 
 #define OVECCOUNT 300
-int str_re_match_sub(String *s, Regex *re, List *l, int offset, int nsubexp, 
+int str_re_match_sub(String *s, pcre2_code *re, List *l, int offset, int nsubexp, 
 		     int *first_match) {
-  int i, len, rc, ovector[OVECCOUNT], rv;
+  int i, len, rc, rv;
+  PCRE2_SIZE ovector[2 * (OVECCOUNT/3)];
   String *substr;
+  pcre2_match_data *match_data;
 
   /* WARNING: lst_clear DOES NOT free memory associated with the contents,
      so must free substrings from previous calls if these are no longer being
      used or there will be a memory leak! */
   if (l != NULL) lst_clear(l);
 
-  rc = pcre_exec(re, NULL, s->chars, s->length, offset, 0, ovector, OVECCOUNT);
-  if (rc == PCRE_ERROR_NOMATCH) return -1;
+  /* same capacity as with pcre3: room for OVECCOUNT/3 (start,end) pairs */
+  match_data = pcre2_match_data_create(OVECCOUNT/3, NULL);
+  if (match_data == NULL) return -2;
+  rc = pcre2_match(re, (PCRE2_SPTR)s->chars, s->length, offset, 0, match_data, NULL);
+  /* copy the offsets out so match_data can be released on every return path */
+  if (rc >= 0) {
+    PCRE2_SIZE *ov = pcre2_get_ovector_pointer(match_data);
+    int npairs = (rc == 0 ? OVECCOUNT/3 : rc);
+    for (i = 0; i < 2 * npairs; i++) ovector[i] = ov[i];
+  }
+  pcre2_match_data_free(match_data);
+  if (rc == PCRE2_ERROR_NOMATCH) return -1;
   if (rc < 0) return -2;  //any other error
   if (first_match != NULL) (*first_match) = ovector[0];
   rv = ovector[1]-ovector[0];
   if (rc >= 0 && l != NULL) {
     if (rc == 0) {
       printf("nsubexp=%i rc=%i\n", nsubexp, rc);
-      fprintf(stderr, "Warning: pcre_exec only has room for %d captured substrings.  May need to increase OVECCOUNT and re-compile\n", OVECCOUNT/3);
+      fprintf(stderr, "Warning: pcre2_match only has room for %d captured substrings.  May need to increase OVECCOUNT and re-compile\n", OVECCOUNT/3);
       rc = OVECCOUNT/3;
     }
     for (i = 0; i < rc && i <= nsubexp; i++) {
@@ -523,11 +539,11 @@ int str_re_match_sub(String *s, Regex *r
 }
 
 
-int str_re_match(String *s, Regex *re, List *l, int nsubexp) {
+int str_re_match(String *s, pcre2_code *re, List *l, int nsubexp) {
   return str_re_match_sub(s, re, l, 0, nsubexp, NULL);
 }
 
-int str_re_search(String *s, Regex *re, int start_offset, List *l,
+int str_re_search(String *s, pcre2_code *re, int start_offset, List *l,
 		  int nsubexp) {
   int first_match_idx, rc;
   rc = str_re_match_sub(s, re, l, start_offset, nsubexp, &first_match_idx);
--- a/src/make-include.mk
+++ b/src/make-include.mk
@@ -137,7 +137,7 @@ LIBS = -lphast -framework Accelerate -lc
 else
 ifdef CLAPACKPATH
 ifneq ($(TARGETOS), Windows)
-  LIBS = -lphast -llapack -ltmglib -lblas -lc -lm -lpcre $(LDFLAGS)
+  LIBS = -lphast -llapack -ltmglib -lblas -lc -lm -lpcre2-8 $(LDFLAGS)
 else
   CFLAGS += -I${CLAPACKPATH}/INCLUDE -I${F2CPATH} -DPCRE_STATIC
   LIBS = -lphast -lm  ${CLAPACKPATH}/liblapack.a ${CLAPACKPATH}/libf2c.a ${CLAPACKPATH}/libblas.a
--- a/src/dless/dlessP.c
+++ b/src/dless/dlessP.c
@@ -196,7 +196,7 @@ void do_p_values(BDPhyloHmm *bdphmm, GFF
   JumpProcess *jp;
   List *types = lst_new_ptr(nnodes * 2), *type_lists = lst_new_ptr(nnodes * 2);
   TreeModel *mod = bdphmm->phmm->mods[0]; /* nonconserved */
-  Regex *id_re = str_re_new(".*id \"([^\"]*)\"");
+  pcre2_code *id_re = str_re_new(".*id \"([^\"]*)\"");
   String *id = str_new(STR_SHORT_LEN);
   List *l = lst_new_ptr(1);
 
--- a/src/lib/base/phast_misc.c
+++ b/src/lib/base/phast_misc.c
@@ -669,7 +669,7 @@ int draw_index(double *p, int size) {
    character as well as "->" to indicate mapping.  */
 struct hash_table *make_name_hash(char *mapstr) {
   Hashtable *retval = hsh_new(20);
-  Regex *map_re = str_re_new("^[[:space:]]*([A-Za-z0-9_]+)[[:space:]]*(->|=)[[:space:]]*([A-Za-z0-9_]+)[[:space:]]*");
+  pcre2_code *map_re = str_re_new("^[[:space:]]*([A-Za-z0-9_]+)[[:space:]]*(->|=)[[:space:]]*([A-Za-z0-9_]+)[[:space:]]*");
   List *mappings = lst_new_ptr(20), *names = lst_new_ptr(3);
   String *s = str_new_charstr(mapstr);
   int i;
--- a/src/lib/feature/phast_bed.c
+++ b/src/lib/feature/phast_bed.c
@@ -140,7 +140,7 @@ void gff_print_bed(FILE *OUTF,  GFF_Set
   if (lst_size(gff->features) == 0) return; /* now can assume at least one feature */
 
   if (!use_groups) {
-    Regex *tag_val_re = str_re_new("[[:alnum:]_.]+[[:space:]]+(\"[^\"]*\"|[^[:space:]]+)");
+    pcre2_code *tag_val_re = str_re_new("[[:alnum:]_.]+[[:space:]]+(\"[^\"]*\"|[^[:space:]]+)");
     List *l = lst_new_ptr(2);
     int ncols = 4;
 
--- a/src/lib/feature/phast_category_map.c
+++ b/src/lib/feature/phast_category_map.c
@@ -26,11 +26,11 @@ CategoryMap *cm_read(FILE *F) {
   int cat, cat2, lineno, i, cm_read_error;
   CategoryMap *cm = NULL;
   CategoryRange *existing_range;
-  static Regex *cat_range_re = NULL;
-  static Regex *ncats_re = NULL;
-  static Regex *fill_re = NULL;
-  static Regex *label_re = NULL;
-  static Regex *extend_re = NULL;
+  static pcre2_code *cat_range_re = NULL;
+  static pcre2_code *ncats_re = NULL;
+  static pcre2_code *fill_re = NULL;
+  static pcre2_code *label_re = NULL;
+  static pcre2_code *extend_re = NULL;
   int has_dependencies = 0;
 
   line = str_new(STR_SHORT_LEN);
--- a/src/lib/feature/phast_gff.c
+++ b/src/lib/feature/phast_gff.c
@@ -38,7 +38,7 @@ GFF_Set* gff_read_set(FILE *F) {
   GFF_Feature *feat;
   GFF_Set *set;
   List *l, *substrs;
-  static Regex *spec_comment_re = NULL;
+  static pcre2_code *spec_comment_re = NULL;
 
   line = str_new(STR_LONG_LEN);
   set = gff_new_set();
@@ -267,7 +267,7 @@ GFF_Feature *gff_new_feature_genomic_pos
                                          int score_is_null) {
   GFF_Feature *retval = NULL;
   List *substrs = lst_new_ptr(4);
-  static Regex *posre = NULL;
+  static pcre2_code *posre = NULL;
   if (posre == NULL)
     posre = str_re_new("(chr[_a-zA-Z0-9]+):([0-9]+)-([0-9]+)([-+])?");
 
@@ -667,7 +667,7 @@ void gff_sort_within_groups(GFF_Set *set
     undefined values will be placed in a single group. */
 void gff_group(GFF_Set *set, char *tag) {
   char *tmpstr=smalloc((100+strlen(tag))*sizeof(char));
-  Regex *tag_re;
+  pcre2_code *tag_re;
   List *l = lst_new_ptr(1);
   int est_no_groups = max(lst_size(set->features) / 10, 1);
   Hashtable *hash = hsh_new(est_no_groups);
--- a/src/lib/motif/phast_tfbs.c
+++ b/src/lib/motif/phast_tfbs.c
@@ -137,8 +137,8 @@ List *pwm_read(const char *filename) {
   List *l = lst_new_ptr(3);
   List *probabilitiesStr = lst_new_ptr(4);
   List *probabilitiesDbl;
-  Regex *pssm_re = NULL;
-  Regex *motif_name_re = NULL;
+  pcre2_code *pssm_re = NULL;
+  pcre2_code *motif_name_re = NULL;
   int alphabetLength;
 
   result = lst_new_ptr(1);
@@ -215,7 +215,7 @@ int ms_alph_has_lowercase(MS *ms) {
 MS *ms_read(const char *filename, const char *alphabet) {
   List *names = lst_new_ptr(10);
   List *seqs = lst_new_ptr(10);
-  static Regex *descrip_re = NULL;
+  static pcre2_code *descrip_re = NULL;
   int i, nseqs, j, do_toupper, line_no;
   String *line = str_new(STR_MED_LEN);
   List *l = lst_new_ptr(2);
--- a/src/lib/msa/phast_local_alignment.c
+++ b/src/lib/msa/phast_local_alignment.c
@@ -47,7 +47,7 @@ LocalPwAlignment *la_read_lav(FILE *F, i
   int line_no=0;
   LocalPwAlignment *lpwa = la_new();
   List *fields = lst_new_ptr(6);
-  Regex *stanza_start_re = str_re_new("^([dshaxm])[[:space:]]*{");
+  pcre2_code *stanza_start_re = str_re_new("^([dshaxm])[[:space:]]*{");
   AlignmentBlock *aln_block = NULL;
   char stanza_type = '\0';
   int i;
--- a/src/lib/msa/phast_msa.c
+++ b/src/lib/msa/phast_msa.c
@@ -253,7 +253,7 @@ MSA *msa_create_copy(MSA *msa, int suff_
 MSA *msa_read_fasta(FILE *F, char *alphabet) {
   List *names = lst_new_ptr(10);
   List *seqs = lst_new_ptr(10);
-  static Regex *descrip_re = NULL;
+  static pcre2_code *descrip_re = NULL;
   int maxlen, i, nseqs, j, do_toupper, line_no;
   String *line = str_new(STR_MED_LEN);
   List *l = lst_new_ptr(2);
@@ -1921,7 +1921,7 @@ GFF_Set *msa_get_informative_feats(MSA *
 
 /* read and return a single sequence from a FASTA file */
 String *msa_read_seq_fasta(FILE *F) {
-  static Regex *descrip_re = NULL;
+  static pcre2_code *descrip_re = NULL;
   String *line = str_new(STR_MED_LEN);
   String *seq = NULL;
 
@@ -2581,7 +2581,7 @@ msa_format_type msa_format_for_content(F
   msa_format_type retval = UNKNOWN_FORMAT;
   String *line = str_new(STR_MED_LEN);
   List *matches = lst_new_ptr(3);
-  Regex *ss_re, *phylip_re, *fasta_re, *lav_re, *maf_re;  
+  pcre2_code *ss_re, *phylip_re, *fasta_re, *lav_re, *maf_re;  
   
   //using peek instead of read as we don't want to affect file/stream position
   str_peek_next_line(line, F);
--- a/src/lib/msa/phast_multi_msa.c
+++ b/src/lib/msa/phast_multi_msa.c
@@ -51,9 +51,9 @@
     abort if the sequence contains a character not in the alphabet. */
 Multi_MSA *multimsa_new_from_files(FILE *F) {
 
-  Regex *blocks_re = str_re_new("#[[:space:]]*BLOCKS[[:space:]]*=[[:space:]]*([0-9]+)");
-  Regex *alph_re = str_re_new("#[[:space:]]*ALPHABET[[:space:]]*=[[:space:]]*([A-Z]+)");
-  Regex *format_re = str_re_new("#[[:space:]]*FORMAT[[:space:]]*=[[:space:]]*([A-Z]+)");
+  pcre2_code *blocks_re = str_re_new("#[[:space:]]*BLOCKS[[:space:]]*=[[:space:]]*([0-9]+)");
+  pcre2_code *alph_re = str_re_new("#[[:space:]]*ALPHABET[[:space:]]*=[[:space:]]*([A-Z]+)");
+  pcre2_code *format_re = str_re_new("#[[:space:]]*FORMAT[[:space:]]*=[[:space:]]*([A-Z]+)");
   
   int i, num_msa, line_no=0;
   char *msa_fname;
--- a/src/lib/msa/phast_sufficient_stats.c
+++ b/src/lib/msa/phast_sufficient_stats.c
@@ -649,7 +649,7 @@ void ss_write(MSA *msa, FILE *F, int sho
 /* make reading order optional?  alphabet argument overrides alphabet
    in file (use NULL to use version in file) */
 MSA* ss_read(FILE *F, char *alphabet) {
-  Regex *nseqs_re, *length_re, *tuple_size_re, *ntuples_re, *tuple_re, 
+  pcre2_code *nseqs_re, *length_re, *tuple_size_re, *ntuples_re, *tuple_re, 
     *names_re, *alph_re, *ncats_re, *order_re, *offset_re;
   String *line, *alph = NULL;
   int nseqs, length, tuple_size, ntuples, i, ncats = -99, header_done = 0, 
--- a/src/lib/phylo/phast_phylo_p_print.c
+++ b/src/lib/phylo/phast_phylo_p_print.c
@@ -749,7 +749,7 @@ void print_feats_generic(FILE *outfile,
   String *name;
   va_list ap;
   double *data[ncols+1];
-  Regex *tag_val_re = str_re_new("[[:alnum:]_.]+[[:space:]]+(\"[^\"]*\"|[^[:space:]]+)");
+  pcre2_code *tag_val_re = str_re_new("[[:alnum:]_.]+[[:space:]]+(\"[^\"]*\"|[^[:space:]]+)");
   List *l = lst_new_ptr(2);
   char **colname;
   List **resultList=NULL;
--- a/src/prequel/phast_pbs_code.c
+++ b/src/prequel/phast_pbs_code.c
@@ -85,7 +85,7 @@ void pbs_free(PbsCode *code) {
 }
 
 PbsCode *pbs_new_from_file(FILE *F) {
-  Regex *nrows_re = str_re_new("##NROWS[[:space:]]*=[[:space:]]*([0-9]+)"),
+  pcre2_code *nrows_re = str_re_new("##NROWS[[:space:]]*=[[:space:]]*([0-9]+)"),
     *dimension_re = str_re_new("##DIMENSION[[:space:]]*=[[:space:]]*([0-9]+)"),
     *nbytes_re = str_re_new("##NBYTES[[:space:]]*=[[:space:]]*([0-9]+)"),
     *codesize_re = str_re_new("##CODESIZE[[:space:]]*=[[:space:]]*([0-9]+)");
--- a/src/util/msa_view.c
+++ b/src/util/msa_view.c
@@ -358,7 +358,7 @@ OPTIONS:\n\
 
 void fill_with_Ns(MSA *msa, List *fill_N_list, msa_coord_map *map) {
   int i, j, nseq, nstart, nend;
-  Regex* fill_N_re = str_re_new("([[:digit:]]+):([[:digit:]]+)-([[:digit:]]+)");
+  pcre2_code* fill_N_re = str_re_new("([[:digit:]]+):([[:digit:]]+)-([[:digit:]]+)");
   List *word_list = lst_new_ptr(4);
   for (i = 0; i < lst_size(fill_N_list); i++) {
     String *s = lst_get_ptr(fill_N_list, i);
