From: bct <bct.x42@gmail.com>
Date: Sun, 23 Jan 2022 23:18:21 +0530
Subject: Patch to build with pcre2

---
 README              | 20 ++++++++++++++++++++
 configure.ac        | 25 ++++++++++++++++++++-----
 src/pull_by_re.c    | 47 +++++++++++++++++++++++++++++++++++++----------
 src/pull_by_re.h    |  4 +---
 src/pullseq.c       | 44 +++-----------------------------------------
 src/search_header.c | 21 ++++++++++++++-------
 src/search_header.h |  6 ++++--
 7 files changed, 99 insertions(+), 68 deletions(-)

diff --git a/README b/README
index b720bae..64fa546 100644
--- a/README
+++ b/README
@@ -100,3 +100,23 @@ INSTALL:
   ./configure  # configure the application based on your system
   make         # will build the application
   make install # will install in /usr/local by default
+
+  NOTE: If you have PCRE2 (Perl-compatible regular expression library,
+  version 2) installed in a non-standard location (e.g. on a mac using
+  brew), the ./configure script may fail. You'll need to update your
+  CFLAGS and LDFLAGS env settings to point to where your PCRE2 header
+  and library files were installed.
+
+  For example, on a mac with pcre2 installed by brew, you can do this:
+
+  pcre2-config --cflags
+  -I/usr/local/Cellar/pcre2/10.39/include
+  
+  Then add that flag to the CFLAGS environment variable and run the
+  configure command, like so:
+
+  export CFLAGS="-I/usr/local/Cellar/pcre2/10.39/include"
+  ./configure
+
+  If your pcre2 library is installed somewhere else, just update the
+  CFLAGS (and, if needed, LDFLAGS) env variables accordingly.
diff --git a/configure.ac b/configure.ac
index b8a0e95..2cc154f 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2,7 +2,7 @@
 # Process this file with autoconf to produce a configure script.
 
 AC_PREREQ([2.69])
-AC_INIT([pullseq], [1.0.1], [bct.x42@gmail.com])
+AC_INIT([pullseq], [1.1.0], [bct.x42@gmail.com])
 #AC_CONFIG_AUX_DIR(config)
 AM_INIT_AUTOMAKE([foreign])
 AC_CONFIG_SRCDIR([src/pullseq.c])
@@ -13,11 +13,27 @@ AC_PROG_CC
 
 # Checks for libraries.
 AC_CHECK_LIB([z],[gzopen])
-AC_CHECK_LIB([pcre],[pcre_compile])
+AC_CHECK_LIB([pcre2-8],[pcre2_config_8], [], [
+              echo "  Error! You need to have libpcre2 installed."
+              echo "  If you have PCRE2 installed in a non-standard"
+              echo "  location (e.g. as with brew on a mac), you'll"
+              echo "  need to update the CFLAGS/LDFLAGS env"
+              echo "  variables. See the README for more details."
+              exit 1
+              ])
 
 # Checks for header files.
-AC_CHECK_HEADERS([inttypes.h limits.h stddef.h stdlib.h string.h])
-AC_CHECK_HEADERS([pcre.h])
+AC_CHECK_HEADERS([pcre2.h inttypes.h limits.h stddef.h stdlib.h string.h])
+
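+# The stricter check below (abort if pcre2.h is missing) no longer works, while the combined check above does; still to be debugged (possibly pcre2.h needs PCRE2_CODE_UNIT_WIDTH defined first).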
+#AC_CHECK_HEADERS([pcre2.h], [], [
+#                  echo "  Error! You need to have libpcre2 headers installed."
+#                  echo "  If you have PCRE2 installed in a non-standard"
+#                  echo "  location (e.g. as with brew on a mac), you'll"
+#                  echo "  need to update the CFLAGS/LDFLAGS env"
+#                  echo "  variables. See the README for more details."
+#                  exit -1
+#                 ])
 
 # Checks for typedefs, structures, and compiler characteristics.
 AC_C_INLINE
@@ -37,4 +53,3 @@ AC_CHECK_FUNCS([strlen memset strchr strerror strtol])
 AC_CONFIG_FILES([Makefile src/Makefile])
 
 AC_OUTPUT
-
diff --git a/src/pull_by_re.c b/src/pull_by_re.c
index 1da147c..9632797 100644
--- a/src/pull_by_re.c
+++ b/src/pull_by_re.c
@@ -1,9 +1,11 @@
+#define PCRE2_CODE_UNIT_WIDTH 8
+
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <zlib.h>
 #include <errno.h>
-#include <pcre.h>
+#include <pcre2.h>
 
 #include "pull_by_re.h"
 #include "file_read.h"
@@ -18,13 +20,18 @@ __KSEQ_READ(static)
 extern char const *progname;
 extern int verbose_flag;
 
-int pull_by_re(char *input_file, pcre *re, pcre_extra *re_extra, int min, int max, int length, int exclude, int convert, int just_count) {
+int pull_by_re(char *input_file, char *aStrRegex, int min, int max, int length, int exclude, int convert, int just_count) {
 	gzFile fp;
 	int count=0,l;
 	int excluded = 0;
 	int is_fasta = 0; /* assume fastq */
 	kseq_t *seq;
 
+	/* pcre2 variables */
+	pcre2_code *re; // the regex object
+	PCRE2_SIZE erroroffset;
+	int errornumber;
+
 	/* open fasta file */
 	fp = gzopen(input_file,"r");
 	if (!fp) {
@@ -40,30 +47,50 @@ int pull_by_re(char *input_file, pcre *re, pcre_extra *re_extra, int min, int ma
 	gzrewind(fp); 
 	kseq_rewind(seq); /* rewind to beginning for main loop */
 
-    if (verbose_flag) {
-        if (is_fasta)
-            fprintf(stderr, "Input is FASTA format\n");
-        else
-            fprintf(stderr, "Input is FASTQ format\n");
-    }
+        if (verbose_flag) {
+            if (is_fasta)
+                fprintf(stderr, "Input is FASTA format\n");
+            else
+                fprintf(stderr, "Input is FASTQ format\n");
+        }
+
+	/* compile the regex once up front; it is reused for every record */
+	re = pcre2_compile(
+			(PCRE2_SPTR)aStrRegex, /* the pattern (PCRE2 takes unsigned code units) */
+			PCRE2_ZERO_TERMINATED, /* indicates pattern is zero-terminated */
+			PCRE2_CASELESS,        /* case-insensitive, as with the former PCRE_CASELESS */
+			&errornumber,          /* for error num */
+			&erroroffset,          /* err offset */
+			NULL);                 /* default compile context */
+
+	if (re == NULL) {
+		PCRE2_UCHAR buffer[256]; /* receives the textual error message */
+		pcre2_get_error_message(errornumber, buffer, sizeof(buffer));
+		fprintf(stderr, "PCRE2 compilation failed at offset %d: %s\n", (int)erroroffset, buffer);
+		exit(EXIT_FAILURE);
+	}
 
 	/* search through list and see if this header matches */
 	while((l = kseq_read(seq)) >= 0) {
 		if (exclude) {
-			if (search_header(re, re_extra, seq->name.s) || search_header(re, re_extra, seq->comment.s))
+			if (search_header(re, seq->name.s) || search_header(re, seq->comment.s))
 				excluded++;
 			else {
 				/* regex doesn't match, so check size/print */
 				count += size_filter(seq, is_fasta, min, max, length, convert, just_count);
 			}
 		} else {
-			if (search_header(re, re_extra, seq->name.s) || search_header(re, re_extra, seq->comment.s)) {
+			if (search_header(re, seq->name.s) || search_header(re, seq->comment.s)) {
 				/* regex matches so check size/print */
 				count += size_filter(seq, is_fasta, min, max, length, convert, just_count);
 			} else
 				excluded++;
 		}
 	} /* end of seq traversal */
+
+	/* tear down re */
+	pcre2_code_free(re); /* free up the re */
+
 	kseq_destroy(seq);
 	gzclose(fp); /* done reading file so close */
 
diff --git a/src/pull_by_re.h b/src/pull_by_re.h
index bd86f8c..845c362 100644
--- a/src/pull_by_re.h
+++ b/src/pull_by_re.h
@@ -1,8 +1,6 @@
 #ifndef PULL_BY_SIZE_H
 #define PULL_BY_SIZE_H
 
-#include <pcre.h>
-
-int pull_by_re(char *input_file, pcre *re, pcre_extra *re_extra, int min, int max, int length, int exclude, int convert, int just_count);
+int pull_by_re(char *input_file, char *aStrRegex, int min, int max, int length, int exclude, int convert, int just_count);
 
 #endif
diff --git a/src/pullseq.c b/src/pullseq.c
index 505ac32..870c417 100644
--- a/src/pullseq.c
+++ b/src/pullseq.c
@@ -4,7 +4,6 @@
 #include <zlib.h>
 #include <errno.h>
 #include <getopt.h>
-#include <pcre.h>
 
 #include "global.h"
 #include "pullseq.h"
@@ -59,10 +58,6 @@ int main(int argc, char *argv[]) {
 	int length = 50;
 	long value;
 	char *end;
-	pcre *reCompiled = NULL;
-	pcre_extra *pcreExtra;
-	const char *pcreErrorStr;
-	int pcreErrorOffset;
 	char *aStrRegex = NULL;
 
 	extern char *optarg; /* external from getopt */
@@ -265,32 +260,6 @@ int main(int argc, char *argv[]) {
 		}
 	}
 
-	/* if a regex search was requested, set up the pcre engine */
-	if (aStrRegex) {
-		// First, the regex string must be compiled. Support extended
-		// regex strings (e.g. m//x)
-		reCompiled = pcre_compile(aStrRegex, PCRE_CASELESS, &pcreErrorStr, &pcreErrorOffset, NULL);
-
-		if (reCompiled == NULL) {
-			fprintf(stderr, "ERROR: Could not compile '%s': %s\n", aStrRegex, pcreErrorStr);
-			exit(EXIT_FAILURE);
-		} else {
-			if (verbose_flag)
-				fprintf(stderr, "Successfully compiled regex from string '%s'\n", aStrRegex);
-		}
-
-		// Optimize the regex
-		pcreExtra = pcre_study(reCompiled, 0, &pcreErrorStr);
-
-		/* pcre_study() returns NULL for both errors and when it can not optimize the regex.
-		   The last argument is how one checks for errors (it is NULL if everything works,
-		   and points to an error string otherwise. */
-		if(pcreErrorStr != NULL) {
-			printf("ERROR: Could not study regex string '%s': %s\n", aStrRegex, pcreErrorStr);
-			exit(EXIT_FAILURE);
-		}
-	}
-
 	if (names || names_from_stdin) {
 		if (names) {
 			names_fp = fopen(names,"r");
@@ -302,8 +271,8 @@ int main(int argc, char *argv[]) {
 			names_fp = stdin;
 		}
 		count = pull_by_name(in, names_fp, min, max, length, exclude, convert, just_count);
-	} else if (reCompiled) {
-		count = pull_by_re(in, reCompiled, pcreExtra, min, max, length, exclude, convert, just_count);
+	} else if (aStrRegex) {
+		count = pull_by_re(in, aStrRegex, min, max, length, exclude, convert, just_count);
 	} else {
 		count = pull_by_size(in, min, max, length, convert, just_count);
 	}
@@ -317,15 +286,8 @@ int main(int argc, char *argv[]) {
 	if (names_fp)
 		fclose(names_fp);
 
-	if (aStrRegex) {
+	if (aStrRegex)
 		free(aStrRegex);
-		// Free up the regular expression.
-		pcre_free(reCompiled);
-
-		// Free up the EXTRA PCRE value (may be NULL at this point)
-		if (pcreExtra != NULL)
-			pcre_free(pcreExtra);
-	}
 
 	if (verbose_flag)
 		fprintf(stderr,"Pulled %i entries\n",count);
diff --git a/src/search_header.c b/src/search_header.c
index bf1e27c..29298b1 100644
--- a/src/search_header.c
+++ b/src/search_header.c
@@ -1,29 +1,36 @@
+#define PCRE2_CODE_UNIT_WIDTH 8
+
 #include <stdio.h>
 #include <string.h>
-#include <pcre.h>
+#include <pcre2.h>
 
 #include "search_header.h"
 #include "global.h"
 
 
-int search_header(pcre *re, pcre_extra *re_extra, char *str) {
+/* re is a compiled pcre2 regex */
+int search_header(pcre2_code *re, char *str) {
 	int pcreExecRet;
-	int pcre_ovector[MAX_CAPTURE_COUNT];
+	pcre2_match_data *match_data;
 
 	if (str == NULL) {
 		return 0;
 	}
+	match_data = pcre2_match_data_create_from_pattern(re, NULL); // init structure for result
 
-	pcreExecRet = pcre_exec(re, re_extra,
+	/* run the match */
+	pcreExecRet = pcre2_match(re,
 			str, 
 			strlen(str),    // length of header string
 			0,                      // Start looking at this point
 			0,                      // pcre exec OPTIONS
-			pcre_ovector,           // capture groups vector
-			MAX_CAPTURE_COUNT);     // Length of output capture groups
+			match_data, // pcre2_match_data
+			NULL); // default match context
+
+	pcre2_match_data_free(match_data);   /* Release memory used for the match */
 
 	if (pcreExecRet < 0) {
-		if (pcreExecRet == PCRE_ERROR_NOMATCH)
+		if (pcreExecRet == PCRE2_ERROR_NOMATCH)
 			return 0;
 		else
 			fprintf(stderr, "Problem with your regex (%d)\n", pcreExecRet);
diff --git a/src/search_header.h b/src/search_header.h
index b8d6b05..23f36e1 100644
--- a/src/search_header.h
+++ b/src/search_header.h
@@ -1,10 +1,12 @@
 #ifndef SEARCH_HEADER_H
 #define SEARCH_HEADER_H
 
-#include <pcre.h>
+#define PCRE2_CODE_UNIT_WIDTH 8
+
+#include <pcre2.h>
 
 #define MAX_CAPTURE_COUNT 30
 
-int search_header(pcre *re, pcre_extra *re_extra, char *str);
+int search_header(pcre2_code *re, char *str);
 
 #endif
