r1905 - trunk/grep

ken at linuxfromscratch.org ken at linuxfromscratch.org
Tue Feb 19 09:27:21 PST 2008


Author: ken
Date: 2008-02-19 10:27:21 -0700 (Tue, 19 Feb 2008)
New Revision: 1905

Added:
   trunk/grep/grep-2.5.3-debian_fixes-1.patch
   trunk/grep/grep-2.5.3-upstream_fixes-1.patch
Log:
Patches to build a better grep-2.5.3.

Added: trunk/grep/grep-2.5.3-debian_fixes-1.patch
===================================================================
--- trunk/grep/grep-2.5.3-debian_fixes-1.patch	                        (rev 0)
+++ trunk/grep/grep-2.5.3-debian_fixes-1.patch	2008-02-19 17:27:21 UTC (rev 1905)
@@ -0,0 +1,1109 @@
+Submitted By: Ken Moffat <ken at linuxfromscratch dot org>
+Date: 2008-02-19
+Initial Package Version: 2.5.3
+Upstream Status: uncertain
+Origin: from debian.
+Description: Various fixes, particularly speed improvements for UTF-8 locales.
+Also adds a 'standard input' marker into the results for certain obscure uses.
+
+diff -Naur grep-2.5.3.orig/lib/posix/regex.h grep-2.5.3.lfs/lib/posix/regex.h
+--- grep-2.5.3.orig/lib/posix/regex.h	2007-06-28 19:57:18.000000000 +0100
++++ grep-2.5.3.lfs/lib/posix/regex.h	2008-02-10 18:56:07.000000000 +0000
+@@ -165,6 +165,10 @@
+    treated as 'a\{1'.  */
+ #define RE_INVALID_INTERVAL_ORD (RE_DEBUG << 1)
+ 
++/* If this bit is set, then ignore case when matching.
++   If not set, then case is significant.  */
++#define RE_ICASE (RE_INVALID_INTERVAL_ORD << 1)
++
+ /* This global variable defines the particular regexp syntax to use (for
+    some interfaces).  When a regexp is compiled, the syntax used is
+    stored in the pattern buffer, so changing this does not affect
+diff -Naur grep-2.5.3.orig/src/dfa.c grep-2.5.3.lfs/src/dfa.c
+--- grep-2.5.3.orig/src/dfa.c	2007-06-28 19:57:19.000000000 +0100
++++ grep-2.5.3.lfs/src/dfa.c	2008-02-10 18:55:29.000000000 +0000
+@@ -594,6 +594,17 @@
+ 		/* build character class.  */
+ 		{
+ 		  wctype_t wt;
++		  /* NOTE:
++		   * when case_fold, character class [:upper:] and [:lower:]
++		   * should be treated as [:alpha:], this is the same way
++		   * of glibc/posix/regcomp.c:build_charclass().
++		   * reported by Bug#276202
++		   * - fixed by Fumitoshi UKAI
++		   */
++		  if (case_fold 
++		      && (strcmp (str, "upper") == 0 || strcmp (str, "lower") == 0)) 
++		      strcpy (str, "alpha");
++
+ 		  /* Query the character class as wctype_t.  */
+ 		  wt = wctype (str);
+ 
+@@ -681,6 +692,29 @@
+ 	  REALLOC_IF_NECESSARY(work_mbc->range_ends, wchar_t,
+ 			       range_ends_al, work_mbc->nranges + 1);
+ 	  work_mbc->range_ends[work_mbc->nranges++] = (wchar_t)wc2;
++	  if (case_fold 
++	      && (iswlower((wint_t)wc) || iswupper((wint_t)wc))
++	      && (iswlower((wint_t)wc2) || iswupper((wint_t)wc2))) {
++	    wint_t altcase;
++	    altcase = wc;
++	    if (iswlower((wint_t)wc))
++	      altcase = towupper((wint_t)wc);
++	    else
++	      altcase = towlower((wint_t)wc);
++	    REALLOC_IF_NECESSARY(work_mbc->range_sts, wchar_t,
++				 range_sts_al, work_mbc->nranges + 1);
++	    work_mbc->range_sts[work_mbc->nranges] = (wchar_t)altcase;
++	    
++	    altcase = wc2;
++	    if (iswlower((wint_t)wc2))
++	      altcase = towupper((wint_t)wc2);
++	    else
++	      altcase = towlower((wint_t)wc2);
++	    REALLOC_IF_NECESSARY(work_mbc->range_ends, wchar_t,
++				 range_ends_al, work_mbc->nranges + 1);
++	    work_mbc->range_ends[work_mbc->nranges++] = (wchar_t)altcase;
++	    
++	  }
+ 	}
+       else if (wc != WEOF)
+ 	/* build normal characters.  */
+@@ -688,6 +722,20 @@
+ 	  REALLOC_IF_NECESSARY(work_mbc->chars, wchar_t, chars_al,
+ 			       work_mbc->nchars + 1);
+ 	  work_mbc->chars[work_mbc->nchars++] = (wchar_t)wc;
++	  if (case_fold && (iswlower((wint_t) wc) || iswupper((wint_t) wc)))
++	    {
++		wint_t altcase;
++
++		altcase = wc;		/* keeps compiler happy */
++		if (iswlower((wint_t) wc))
++		  altcase = towupper((wint_t) wc);
++		else if (iswupper((wint_t) wc))
++		  altcase = towlower((wint_t) wc);
++
++		REALLOC_IF_NECESSARY(work_mbc->chars, wchar_t, chars_al,
++			       work_mbc->nchars + 1);
++		work_mbc->chars[work_mbc->nchars++] = (wchar_t) altcase;
++	    }
+ 	}
+     }
+   while ((wc = wc1) != L']');
+diff -Naur grep-2.5.3.orig/src/grep.c grep-2.5.3.lfs/src/grep.c
+--- grep-2.5.3.orig/src/grep.c	2007-06-28 19:57:19.000000000 +0100
++++ grep-2.5.3.lfs/src/grep.c	2008-02-10 18:54:53.000000000 +0000
+@@ -274,6 +274,12 @@
+ #endif
+ ;
+ 
++/* Default for `file_list' if no files are given on the command line. */
++static char *stdin_argv[] =
++{
++  "-", NULL
++};
++
+ /* Non-boolean long options that have no corresponding short equivalents.  */
+ enum
+ {
+@@ -534,7 +540,16 @@
+ 	 for byte sentinels fore and aft.  */
+       newalloc = newsize + pagesize + 1;
+ 
+-      newbuf = bufalloc < newalloc ? xmalloc (bufalloc = newalloc) : buffer;
++      newbuf = bufalloc < newalloc ? malloc (bufalloc = newalloc) : buffer;
++      if (newbuf == NULL)
++	{
++	  int saved_errno = errno;
++	  free (buffer);
++	  bufalloc = ALIGN_TO (INITIAL_BUFSIZE, pagesize) + pagesize + 1;
++	  buffer = xmalloc (bufalloc);
++	  errno = saved_errno;
++	  return 0;
++	}
+       readbuf = ALIGN_TO (newbuf + 1 + save, pagesize);
+       bufbeg = readbuf - save;
+       memmove (bufbeg, buffer + saved_offset, save);
+@@ -1825,6 +1840,7 @@
+   FILE *fp;
+   extern char *optarg;
+   extern int optind;
++  char **file_list;
+ 
+   initialize_main (&argc, &argv);
+   program_name = argv[0];
+@@ -2244,29 +2260,29 @@
+   if (max_count == 0)
+     exit (1);
+ 
+-  if (optind < argc)
++  file_list = (optind == argc ? stdin_argv : &argv[optind]);
++
++  status = 1;
++  while (1)
+     {
+-	status = 1;
+-	do
++      char *file = *file_list++;
++
++      if (file == NULL)
++	break;
++
++      if ((included_patterns || excluded_patterns)
++	  && !isdir (file))
+ 	{
+-	  char *file = argv[optind];
+-	  if ((included_patterns || excluded_patterns)
+-	      && !isdir (file))
+-	    {
+-	      if (included_patterns &&
+-		  ! excluded_filename (included_patterns, file, 0))
+-		continue;
+-	      if (excluded_patterns &&
+-		  excluded_filename (excluded_patterns, file, 0))
+-		continue;
+-	    }
+-	  status &= grepfile (strcmp (file, "-") == 0 ? (char *) NULL : file,
+-			      &stats_base);
++	  if (included_patterns &&
++	      ! excluded_filename (included_patterns, file, 0))
++	    continue;
++	  if (excluded_patterns &&
++	      excluded_filename (excluded_patterns, file, 0))
++	    continue;
+ 	}
+-	while ( ++optind < argc);
++      status &= grepfile (strcmp (file, "-") == 0
++			  ? (char *) NULL : file, &stats_base);
+     }
+-  else
+-    status = grepfile ((char *) NULL, &stats_base);
+ 
+   /* We register via atexit() to test stdout.  */
+   exit (errseen ? 2 : status);
+diff -Naur grep-2.5.3.orig/src/search.c grep-2.5.3.lfs/src/search.c
+--- grep-2.5.3.orig/src/search.c	2007-06-28 19:57:19.000000000 +0100
++++ grep-2.5.3.lfs/src/search.c	2008-02-10 18:56:18.000000000 +0000
+@@ -18,10 +18,15 @@
+ 
+ /* Written August 1992 by Mike Haertel. */
+ 
++#ifndef _GNU_SOURCE
++# define _GNU_SOURCE 1
++#endif
+ #ifdef HAVE_CONFIG_H
+ # include <config.h>
+ #endif
+ 
++#include <assert.h>
++
+ #include <sys/types.h>
+ 
+ #include "mbsupport.h"
+@@ -43,6 +48,9 @@
+ #ifdef HAVE_LIBPCRE
+ # include <pcre.h>
+ #endif
++#ifdef HAVE_LANGINFO_CODESET
++# include <langinfo.h>
++#endif
+ 
+ #define NCHAR (UCHAR_MAX + 1)
+ 
+@@ -68,6 +76,19 @@
+     error (2, 0, _("memory exhausted"));
+ }
+ 
++/* UTF-8 encoding allows some optimizations that we can't otherwise
++   assume in a multibyte encoding. */
++static int using_utf8;
++
++void
++check_utf8 (void)
++{
++#ifdef HAVE_LANGINFO_CODESET
++  if (strcmp (nl_langinfo (CODESET), "UTF-8") == 0)
++    using_utf8 = 1;
++#endif
++}
++
+ #ifndef FGREP_PROGRAM
+ /* DFA compiled regexp. */
+ static struct dfa dfa;
+@@ -134,49 +155,6 @@
+ }
+ #endif /* !FGREP_PROGRAM */
+ 
+-#ifdef MBS_SUPPORT
+-/* This function allocate the array which correspond to "buf".
+-   Then this check multibyte string and mark on the positions which
+-   are not single byte character nor the first byte of a multibyte
+-   character.  Caller must free the array.  */
+-static char*
+-check_multibyte_string(char const *buf, size_t size)
+-{
+-  char *mb_properties = xmalloc(size);
+-  mbstate_t cur_state;
+-  wchar_t wc;
+-  int i;
+-
+-  memset(&cur_state, 0, sizeof(mbstate_t));
+-  memset(mb_properties, 0, sizeof(char)*size);
+-
+-  for (i = 0; i < size ;)
+-    {
+-      size_t mbclen;
+-      mbclen = mbrtowc(&wc, buf + i, size - i, &cur_state);
+-
+-      if (mbclen == (size_t) -1 || mbclen == (size_t) -2 || mbclen == 0)
+-	{
+-	  /* An invalid sequence, or a truncated multibyte character.
+-	     We treat it as a single byte character.  */
+-	  mbclen = 1;
+-	}
+-      else if (match_icase)
+-	{
+-	  if (iswupper((wint_t)wc))
+-	    {
+-	      wc = towlower((wint_t)wc);
+-	      wcrtomb(buf + i, wc, &cur_state);
+-	    }
+-	}
+-      mb_properties[i] = mbclen;
+-      i += mbclen;
+-    }
+-
+-  return mb_properties;
+-}
+-#endif /* MBS_SUPPORT */
+-
+ #if defined(GREP_PROGRAM) || defined(EGREP_PROGRAM)
+ #ifdef EGREP_PROGRAM
+ COMPILE_FCT(Ecompile)
+@@ -193,10 +171,9 @@
+   size_t total = size;
+   char const *motif = pattern;
+ 
+-#if 0
++  check_utf8 ();
+   if (match_icase)
+     syntax_bits |= RE_ICASE;
+-#endif
+   re_set_syntax (syntax_bits);
+   dfasyntax (syntax_bits, match_icase, eolbyte);
+ 
+@@ -301,23 +278,35 @@
+   char eol = eolbyte;
+   int backref, start, len, best_len;
+   struct kwsmatch kwsm;
++  static int use_dfa;
++  static int use_dfa_checked = 0;
+   size_t i, ret_val;
+ #ifdef MBS_SUPPORT
+-  char *mb_properties = NULL;
+-  if (MB_CUR_MAX > 1)
++  const char *last_char = NULL;
++  int mb_cur_max = MB_CUR_MAX;
++  mbstate_t mbs;
++  memset (&mbs, '\0', sizeof (mbstate_t));
++#endif /* MBS_SUPPORT */
++
++  if (!use_dfa_checked)
+     {
+-      if (match_icase)
+-        {
+-          char *case_buf = xmalloc(size);
+-          memcpy(case_buf, buf, size);
+-	  if (start_ptr)
+-	    start_ptr = case_buf + (start_ptr - buf);
+-          buf = case_buf;
+-        }
+-      if (kwset)
+-        mb_properties = check_multibyte_string(buf, size);
+-    }
++      char *grep_use_dfa = getenv ("GREP_USE_DFA");
++      if (!grep_use_dfa)
++	{
++#ifdef MBS_SUPPORT
++	  /* Turn off DFA when processing multibyte input. */
++	  use_dfa = (MB_CUR_MAX == 1);
++#else
++	  use_dfa = 1;
+ #endif /* MBS_SUPPORT */
++	}
++      else
++	{
++	  use_dfa = atoi (grep_use_dfa);
++	}
++
++      use_dfa_checked = 1;
++    }
+ 
+   buflim = buf + size;
+ 
+@@ -329,40 +318,123 @@
+ 	  if (kwset)
+ 	    {
+ 	      /* Find a possible match using the KWset matcher. */
+-	      size_t offset = kwsexec (kwset, beg, buflim - beg, &kwsm);
++#ifdef MBS_SUPPORT
++	      size_t bytes_left = 0;
++#endif /* MBS_SUPPORT */
++	      size_t offset;
++#ifdef MBS_SUPPORT
++	      /* kwsexec doesn't work with match_icase and multibyte input. */
++	      if (match_icase && mb_cur_max > 1)
++		/* Avoid kwset */
++		offset = 0;
++	      else
++#endif /* MBS_SUPPORT */
++	      offset = kwsexec (kwset, beg, buflim - beg, &kwsm);
+ 	      if (offset == (size_t) -1)
+-		goto failure;
++		return (size_t)-1;
++#ifdef MBS_SUPPORT
++	      if (mb_cur_max > 1 && !using_utf8)
++		{
++		  bytes_left = offset;
++		  while (bytes_left)
++		    {
++		      size_t mlen = mbrlen (beg, bytes_left, &mbs);
++
++		      last_char = beg;
++		      if (mlen == (size_t) -1 || mlen == 0)
++			{
++			  /* Incomplete character: treat as single-byte. */
++			  memset (&mbs, '\0', sizeof (mbstate_t));
++			  beg++;
++			  bytes_left--;
++			  continue;
++			}
++
++		      if (mlen == (size_t) -2)
++			/* Offset points inside multibyte character:
++			 * no good. */
++			break;
++
++		      beg += mlen;
++		      bytes_left -= mlen;
++		    }
++		}
++	      else
++#endif /* MBS_SUPPORT */
+ 	      beg += offset;
+ 	      /* Narrow down to the line containing the candidate, and
+ 		 run it through DFA. */
+ 	      end = memchr(beg, eol, buflim - beg);
+ 	      end++;
+ #ifdef MBS_SUPPORT
+-	      if (MB_CUR_MAX > 1 && mb_properties[beg - buf] == 0)
++	      if (mb_cur_max > 1 && bytes_left)
+ 		continue;
+ #endif
+ 	      while (beg > buf && beg[-1] != eol)
+ 		--beg;
+-	      if (kwsm.index < kwset_exact_matches)
++	      if (
++#ifdef MBS_SUPPORT
++		  !(match_icase && mb_cur_max > 1) &&
++#endif /* MBS_SUPPORT */
++		  (kwsm.index < kwset_exact_matches))
+ 		goto success;
+-	      if (dfaexec (&dfa, beg, end - beg, &backref) == (size_t) -1)
++	      if (use_dfa &&
++		  dfaexec (&dfa, beg, end - beg, &backref) == (size_t) -1)
+ 		continue;
+ 	    }
+ 	  else
+ 	    {
+ 	      /* No good fixed strings; start with DFA. */
+-	      size_t offset = dfaexec (&dfa, beg, buflim - beg, &backref);
++#ifdef MBS_SUPPORT
++	      size_t bytes_left = 0;
++#endif /* MBS_SUPPORT */
++	      size_t offset = 0;
++	      if (use_dfa)
++		offset = dfaexec (&dfa, beg, buflim - beg, &backref);
+ 	      if (offset == (size_t) -1)
+ 		break;
+ 	      /* Narrow down to the line we've found. */
++#ifdef MBS_SUPPORT
++	      if (mb_cur_max > 1 && !using_utf8)
++		{
++		  bytes_left = offset;
++		  while (bytes_left)
++		    {
++		      size_t mlen = mbrlen (beg, bytes_left, &mbs);
++
++		      last_char = beg;
++		      if (mlen == (size_t) -1 || mlen == 0)
++			{
++			  /* Incomplete character: treat as single-byte. */
++			  memset (&mbs, '\0', sizeof (mbstate_t));
++			  beg++;
++			  bytes_left--;
++			  continue;
++			}
++
++		      if (mlen == (size_t) -2)
++			/* Offset points inside multibyte character:
++			 * no good. */
++			break;
++
++		      beg += mlen;
++		      bytes_left -= mlen;
++		    }
++		}
++	      else
++#endif /* MBS_SUPPORT */
+ 	      beg += offset;
+ 	      end = memchr (beg, eol, buflim - beg);
+ 	      end++;
++#ifdef MBS_SUPPORT
++	      if (mb_cur_max > 1 && bytes_left)
++		continue;
++#endif /* MBS_SUPPORT */
+ 	      while (beg > buf && beg[-1] != eol)
+ 		--beg;
+ 	    }
+ 	  /* Successful, no backreferences encountered! */
+-	  if (!backref)
++	  if (use_dfa && !backref)
+ 	    goto success;
+ 	}
+       else
+@@ -408,10 +480,84 @@
+ 	      if (match_words)
+ 		while (match <= best_match)
+ 		  {
+-		    if ((match == buf || !WCHAR ((unsigned char) match[-1]))
+-			&& (len == end - beg - 1
+-			    || !WCHAR ((unsigned char) match[len])))
+-		      goto assess_pattern_match;
++		    int lword_match = 0;
++		    if (match == buf)
++		      lword_match = 1;
++		    else
++		      {
++			assert (start > 0);
++#ifdef MBS_SUPPORT
++			if (mb_cur_max > 1)
++			  {
++			    const char *s;
++			    int mr;
++			    wchar_t pwc;
++			    if (using_utf8)
++			      {
++				s = match - 1;
++				while (s > buf
++				       && (unsigned char) *s >= 0x80
++				       && (unsigned char) *s <= 0xbf)
++				  --s;
++			      }
++			    else
++			      s = last_char;
++			    mr = mbtowc (&pwc, s, match - s);
++			    if (mr <= 0)
++			      {
++				memset (&mbs, '\0', sizeof (mbstate_t));
++				lword_match = 1;
++			      }
++			    else if (!(iswalnum (pwc) || pwc == L'_')
++				     && mr == (int) (match - s))
++			      lword_match = 1;
++			  }
++			else
++#endif /* MBS_SUPPORT */
++			if (!WCHAR ((unsigned char) match[-1]))
++			  lword_match = 1;
++		      }
++
++		    if (lword_match)
++		      {
++			int rword_match = 0;
++			if (start + len == end - beg - 1)
++			  rword_match = 1;
++			else
++			  {
++#ifdef MBS_SUPPORT
++			    if (mb_cur_max > 1)
++			      {
++				wchar_t nwc;
++				int mr;
++
++				mr = mbtowc (&nwc, buf + start + len,
++					     end - buf - start - len - 1);
++				if (mr <= 0)
++				  {
++				    memset (&mbs, '\0', sizeof (mbstate_t));
++				    rword_match = 1;
++				  }
++				else if (!iswalnum (nwc) && nwc != L'_')
++				  rword_match = 1;
++			      }
++			    else
++#endif /* MBS_SUPPORT */
++			    if (!WCHAR ((unsigned char) match[len]))
++			      rword_match = 1;
++			  }
++
++			if (rword_match)
++			  {
++			    if (!start_ptr)
++			      /* Returns the whole line. */
++			      goto success;
++			    else
++			      {
++				goto assess_pattern_match;
++			      }
++			  }
++		      }
+ 		    if (len > 0)
+ 		      {
+ 			/* Try a shorter length anchored at the same place. */
+@@ -475,24 +621,144 @@
+   *match_size = len;
+   ret_val = beg - buf;
+  out:
+-#ifdef MBS_SUPPORT
+-  if (MB_CUR_MAX > 1)
+-    {
+-      if (match_icase)
+-        free((char*)buf);
+-      if (mb_properties)
+-        free(mb_properties);
+-    }
+-#endif /* MBS_SUPPORT */
+   return ret_val;
+ }
+ #endif /* defined(GREP_PROGRAM) || defined(EGREP_PROGRAM) */
+ 
++#ifdef MBS_SUPPORT
++static int f_i_multibyte; /* whether we're using the new -Fi MB method */
++static struct
++{
++  wchar_t **patterns;
++  size_t count, maxlen;
++  unsigned char *match;
++} Fimb;
++#endif
++
+ #if defined(GREP_PROGRAM) || defined(FGREP_PROGRAM)
+ COMPILE_FCT(Fcompile)
+ {
++  int mb_cur_max = MB_CUR_MAX;
+   char const *beg, *lim, *err;
+ 
++  check_utf8 ();
++#ifdef MBS_SUPPORT
++  /* Support -F -i for UTF-8 input. */
++  if (match_icase && mb_cur_max > 1)
++    {
++      mbstate_t mbs;
++      wchar_t *wcpattern = xmalloc ((size + 1) * sizeof (wchar_t));
++      const char *patternend = pattern;
++      size_t wcsize;
++      kwset_t fimb_kwset = NULL;
++      char *starts = NULL;
++      wchar_t *wcbeg, *wclim;
++      size_t allocated = 0;
++
++      memset (&mbs, '\0', sizeof (mbs));
++# ifdef __GNU_LIBRARY__
++      wcsize = mbsnrtowcs (wcpattern, &patternend, size, size, &mbs);
++      if (patternend != pattern + size)
++	wcsize = (size_t) -1;
++# else
++      {
++	char *patterncopy = xmalloc (size + 1);
++
++	memcpy (patterncopy, pattern, size);
++	patterncopy[size] = '\0';
++	patternend = patterncopy;
++	wcsize = mbsrtowcs (wcpattern, &patternend, size, &mbs);
++	if (patternend != patterncopy + size)
++	  wcsize = (size_t) -1;
++	free (patterncopy);
++      }
++# endif
++      if (wcsize + 2 <= 2)
++	{
++fimb_fail:
++	  free (wcpattern);
++	  free (starts);
++	  if (fimb_kwset)
++	    kwsfree (fimb_kwset);
++	  free (Fimb.patterns);
++	  Fimb.patterns = NULL;
++	}
++      else
++	{
++	  if (!(fimb_kwset = kwsalloc (NULL)))
++	    error (2, 0, _("memory exhausted"));
++
++	  starts = xmalloc (mb_cur_max * 3);
++	  wcbeg = wcpattern;
++	  do
++	    {
++	      int i;
++	      size_t wclen;
++
++	      if (Fimb.count >= allocated)
++		{
++		  if (allocated == 0)
++		    allocated = 128;
++		  else
++		    allocated *= 2;
++		  Fimb.patterns = xrealloc (Fimb.patterns,
++					    sizeof (wchar_t *) * allocated);
++		}
++	      Fimb.patterns[Fimb.count++] = wcbeg;
++	      for (wclim = wcbeg;
++		   wclim < wcpattern + wcsize && *wclim != L'\n'; ++wclim)
++		*wclim = towlower (*wclim);
++	      *wclim = L'\0';
++	      wclen = wclim - wcbeg;
++	      if (wclen > Fimb.maxlen)
++		Fimb.maxlen = wclen;
++	      if (wclen > 3)
++		wclen = 3;
++	      if (wclen == 0)
++		{
++		  if ((err = kwsincr (fimb_kwset, "", 0)) != 0)
++		    error (2, 0, err);
++		}
++	      else
++		for (i = 0; i < (1 << wclen); i++)
++		  {
++		    char *p = starts;
++		    int j, k;
++
++		    for (j = 0; j < wclen; ++j)
++		      {
++			wchar_t wc = wcbeg[j];
++			if (i & (1 << j))
++			  {
++			    wc = towupper (wc);
++			    if (wc == wcbeg[j])
++			      continue;
++			  }
++			k = wctomb (p, wc);
++			if (k <= 0)
++			  goto fimb_fail;
++			p += k;
++		      }
++		    if ((err = kwsincr (fimb_kwset, starts, p - starts)) != 0)
++		      error (2, 0, err);
++		  }
++	      if (wclim < wcpattern + wcsize)
++		++wclim;
++	      wcbeg = wclim;
++	    }
++	  while (wcbeg < wcpattern + wcsize);
++	  f_i_multibyte = 1;
++	  kwset = fimb_kwset;
++	  free (starts);
++	  Fimb.match = xmalloc (Fimb.count);
++	  if ((err = kwsprep (kwset)) != 0)
++	    error (2, 0, err);
++	  return;
++	}
++    }
++#endif /* MBS_SUPPORT */
++
++
+   kwsinit ();
+   beg = pattern;
+   do
+@@ -511,6 +777,76 @@
+     error (2, 0, err);
+ }
+ 
++#ifdef MBS_SUPPORT
++static int
++Fimbexec (const char *buf, size_t size, size_t *plen, int exact)
++{
++  size_t len, letter, i;
++  int ret = -1;
++  mbstate_t mbs;
++  wchar_t wc;
++  int patterns_left;
++
++  assert (match_icase && f_i_multibyte == 1);
++  assert (MB_CUR_MAX > 1);
++
++  memset (&mbs, '\0', sizeof (mbs));
++  memset (Fimb.match, '\1', Fimb.count);
++  letter = len = 0;
++  patterns_left = 1;
++  while (patterns_left && len <= size)
++    {
++      size_t c;
++
++      patterns_left = 0;
++      if (len < size)
++	{
++	  c = mbrtowc (&wc, buf + len, size - len, &mbs);
++	  if (c + 2 <= 2)
++	    return ret;
++
++	  wc = towlower (wc);
++	}
++      else
++	{
++	  c = 1;
++	  wc = L'\0';
++	}
++
++      for (i = 0; i < Fimb.count; i++)
++	{
++	  if (Fimb.match[i])
++	    {
++	      if (Fimb.patterns[i][letter] == L'\0')
++		{
++		  /* Found a match. */
++		  *plen = len;
++		  if (!exact && !match_words)
++		    return 0;
++		  else
++		    {
++		      /* For -w or exact look for longest match.  */
++		      ret = 0;
++		      Fimb.match[i] = '\0';
++		      continue;
++		    }
++		}
++
++	      if (Fimb.patterns[i][letter] == wc)
++		patterns_left = 1;
++	      else
++		Fimb.match[i] = '\0';
++	    }
++	}
++
++      len += c;
++      letter++;
++    }
++
++  return ret;
++}
++#endif /* MBS_SUPPORT */
++
+ EXECUTE_FCT(Fexecute)
+ {
+   register char const *beg, *try, *end;
+@@ -519,69 +855,256 @@
+   struct kwsmatch kwsmatch;
+   size_t ret_val;
+ #ifdef MBS_SUPPORT
+-  char *mb_properties = NULL;
+-  if (MB_CUR_MAX > 1)
+-    {
+-      if (match_icase)
+-        {
+-          char *case_buf = xmalloc(size);
+-          memcpy(case_buf, buf, size);
+-	  if (start_ptr)
+-	    start_ptr = case_buf + (start_ptr - buf);
+-          buf = case_buf;
+-        }
+-      mb_properties = check_multibyte_string(buf, size);
+-    }
++  int mb_cur_max = MB_CUR_MAX;
++  mbstate_t mbs;
++  memset (&mbs, '\0', sizeof (mbstate_t));
++  const char *last_char = NULL;
+ #endif /* MBS_SUPPORT */
+ 
+   for (beg = start_ptr ? start_ptr : buf; beg <= buf + size; beg++)
+     {
+       size_t offset = kwsexec (kwset, beg, buf + size - beg, &kwsmatch);
+       if (offset == (size_t) -1)
+-	goto failure;
++	return offset;
+ #ifdef MBS_SUPPORT
+-      if (MB_CUR_MAX > 1 && mb_properties[offset+beg-buf] == 0)
+-	continue; /* It is a part of multibyte character.  */
++      if (mb_cur_max > 1 && !using_utf8)
++	{
++	  size_t bytes_left = offset;
++	  while (bytes_left)
++	    {
++	      size_t mlen = mbrlen (beg, bytes_left, &mbs);
++
++	      last_char = beg;
++	      if (mlen == (size_t) -1 || mlen == 0)
++		{
++		  /* Incomplete character: treat as single-byte. */
++		  memset (&mbs, '\0', sizeof (mbstate_t));
++		  beg++;
++		  bytes_left--;
++		  continue;
++		}
++
++	      if (mlen == (size_t) -2)
++		/* Offset points inside multibyte character: no good. */
++		break;
++
++	      beg += mlen;
++	      bytes_left -= mlen;
++	    }
++
++	  if (bytes_left)
++	    continue;
++	}
++      else
+ #endif /* MBS_SUPPORT */
+       beg += offset;
++#ifdef MBS_SUPPORT
++      /* For f_i_multibyte, the string at beg now matches first 3 chars of
++	 one of the search strings (less if there are shorter search strings).
++	 See if this is a real match.  */
++      if (f_i_multibyte
++	  && Fimbexec (beg, buf + size - beg, &kwsmatch.size[0], start_ptr == NULL))
++	goto next_char;
++#endif /* MBS_SUPPORT */
+       len = kwsmatch.size[0];
+       if (start_ptr && !match_words)
+ 	goto success_in_beg_and_len;
+       if (match_lines)
+ 	{
+ 	  if (beg > buf && beg[-1] != eol)
+-	    continue;
++	    goto next_char;
+ 	  if (beg + len < buf + size && beg[len] != eol)
+-	    continue;
++	    goto next_char;
+ 	  goto success;
+ 	}
+       else if (match_words)
+-	for (try = beg; len; )
+-	  {
+-	    if (try > buf && WCHAR((unsigned char) try[-1]))
+-	      break;
+-	    if (try + len < buf + size && WCHAR((unsigned char) try[len]))
+-	      {
+-		offset = kwsexec (kwset, beg, --len, &kwsmatch);
+-		if (offset == (size_t) -1)
+-		  break;
+-		try = beg + offset;
+-		len = kwsmatch.size[0];
+-	      }
+-	    else if (!start_ptr)
+-	      goto success;
+-	    else
+-	      goto success_in_beg_and_len;
+-	  } /* for (try) */
+-      else
++	{
++	  while (len)
++	    {
++	      int word_match = 0;
++	      if (beg > buf)
++		{
++#ifdef MBS_SUPPORT
++		  if (mb_cur_max > 1)
++		    {
++		      const char *s;
++		      int mr;
++		      wchar_t pwc;
++
++		      if (using_utf8)
++			{
++			  s = beg - 1;
++			  while (s > buf
++				 && (unsigned char) *s >= 0x80
++				 && (unsigned char) *s <= 0xbf)
++			    --s;
++			}
++		      else
++			s = last_char;
++		      mr = mbtowc (&pwc, s, beg - s);
++		      if (mr <= 0)
++			memset (&mbs, '\0', sizeof (mbstate_t));
++		      else if ((iswalnum (pwc) || pwc == L'_')
++			       && mr == (int) (beg - s))
++			goto next_char;
++		    }
++		  else
++#endif /* MBS_SUPPORT */
++		  if (WCHAR ((unsigned char) beg[-1]))
++		    goto next_char;
++		}
++#ifdef MBS_SUPPORT
++	      if (mb_cur_max > 1)
++		{
++		  wchar_t nwc;
++		  int mr;
++
++		  mr = mbtowc (&nwc, beg + len, buf + size - beg - len);
++		  if (mr <= 0)
++		    {
++		      memset (&mbs, '\0', sizeof (mbstate_t));
++		      word_match = 1;
++		    }
++		  else if (!iswalnum (nwc) && nwc != L'_')
++		    word_match = 1;
++		}
++	      else
++#endif /* MBS_SUPPORT */
++		if (beg + len >= buf + size || !WCHAR ((unsigned char) beg[len]))
++		  word_match = 1;
++	      if (word_match)
++		{
++		  if (start_ptr == NULL)
++		    /* Returns the whole line now we know there's a word match. */
++		    goto success;
++		  else {
++		    /* Returns just this word match. */
++		    *match_size = len;
++		    return beg - buf;
++		  }
++		}
++	      if (len > 0)
++		{
++		  /* Try a shorter length anchored at the same place. */
++		  --len;
++		  offset = kwsexec (kwset, beg, len, &kwsmatch);
++
++		  if (offset == -1)
++		    goto next_char; /* Try a different anchor. */
++#ifdef MBS_SUPPORT
++
++		  if (mb_cur_max > 1 && !using_utf8)
++		    {
++		      size_t bytes_left = offset;
++		      while (bytes_left)
++			{
++			  size_t mlen = mbrlen (beg, bytes_left, &mbs);
++
++			  last_char = beg;
++			  if (mlen == (size_t) -1 || mlen == 0)
++			    {
++			      /* Incomplete character: treat as single-byte. */
++			      memset (&mbs, '\0', sizeof (mbstate_t));
++			      beg++;
++			      bytes_left--;
++			      continue;
++			    }
++
++			  if (mlen == (size_t) -2)
++			    {
++			      /* Offset points inside multibyte character:
++			       * no good. */
++			      break;
++			    }
++
++			  beg += mlen;
++			  bytes_left -= mlen;
++			}
++
++		      if (bytes_left)
++			{
++			  memset (&mbs, '\0', sizeof (mbstate_t));
++			  goto next_char; /* Try a different anchor. */
++			}
++		    }
++		  else
++#endif /* MBS_SUPPORT */
++		  beg += offset;
++#ifdef MBS_SUPPORT
++		  /* The string at beg now matches first 3 chars of one of
++		     the search strings (less if there are shorter search
++		     strings).  See if this is a real match.  */
++		  if (f_i_multibyte
++		      && Fimbexec (beg, len - offset, &kwsmatch.size[0],
++				   start_ptr == NULL))
++		    goto next_char;
++#endif /* MBS_SUPPORT */
++		  len = kwsmatch.size[0];
++		}
++	    }
++	}
++       else
+ 	goto success;
+-    } /* for (beg in buf) */
++next_char:;
++#ifdef MBS_SUPPORT
++      /* Advance to next character.  For MB_CUR_MAX == 1 case this is handled
++	 by ++beg above.  */
++      if (mb_cur_max > 1)
++	{
++	  if (using_utf8)
++	    {
++	      unsigned char c = *beg;
++	      if (c >= 0xc2)
++		{
++		  if (c < 0xe0)
++		    ++beg;
++		  else if (c < 0xf0)
++		    beg += 2;
++		  else if (c < 0xf8)
++		    beg += 3;
++		  else if (c < 0xfc)
++		    beg += 4;
++		  else if (c < 0xfe)
++		    beg += 5;
++		}
++	    }
++	  else
++	    {
++	      size_t l = mbrlen (beg, buf + size - beg, &mbs);
+ 
+- failure:
+-  ret_val = -1;
+-  goto out;
++	      last_char = beg;
++	      if (l + 2 >= 2)
++		beg += l - 1;
++	      else
++		memset (&mbs, '\0', sizeof (mbstate_t));
++	    }
++	}
++#endif /* MBS_SUPPORT */
++    }
++
++  return -1;
+ 
+  success:
++#ifdef MBS_SUPPORT
++  if (mb_cur_max > 1 && !using_utf8)
++    {
++      end = beg + len;
++      while (end < buf + size)
++	{
++	  size_t mlen = mbrlen (end, buf + size - end, &mbs);
++	  if (mlen == (size_t) -1 || mlen == (size_t) -2 || mlen == 0)
++	    {
++	      memset (&mbs, '\0', sizeof (mbstate_t));
++	      mlen = 1;
++	    }
++	  if (mlen == 1 && *end == eol)
++	    break;
++
++	  end += mlen;
++	}
++     }
++  else
++ #endif /* MBS_SUPPORT */
+   end = memchr (beg + len, eol, (buf + size) - (beg + len));
+   end++;
+   while (buf < beg && beg[-1] != eol)
+@@ -591,15 +1114,6 @@
+   *match_size = len;
+   ret_val = beg - buf;
+  out:
+-#ifdef MBS_SUPPORT
+-  if (MB_CUR_MAX > 1)
+-    {
+-      if (match_icase)
+-        free((char*)buf);
+-      if (mb_properties)
+-        free(mb_properties);
+-    }
+-#endif /* MBS_SUPPORT */
+   return ret_val;
+ }
+ #endif /* defined(GREP_PROGRAM) || defined(FGREP_PROGRAM) */

Added: trunk/grep/grep-2.5.3-upstream_fixes-1.patch
===================================================================
--- trunk/grep/grep-2.5.3-upstream_fixes-1.patch	                        (rev 0)
+++ trunk/grep/grep-2.5.3-upstream_fixes-1.patch	2008-02-19 17:27:21 UTC (rev 1905)
@@ -0,0 +1,119 @@
+Date: 2008-02-19
+Initial Package Version: 2.5.3
+Upstream Status: from upstream, with one slight change to expected test result.
+Origin: from upstream.
+Description: Various fixes, particularly for man and info pages, add a new test to foad1
+(expected results have 'standard input' label added to match the result with the debian fixes)
+and commenting out the (new) tests which are expected to fail.
+
+diff -Naur grep-2.5.3.with-deb-patches/doc/grep.1 grep-2.5.3.lfs/doc/grep.1
+--- grep-2.5.3.with-deb-patches/doc/grep.1	2006-08-18 23:00:31.000000000 +0100
++++ grep-2.5.3.lfs/doc/grep.1	2008-02-10 18:59:10.000000000 +0000
+@@ -21,7 +21,7 @@
+ . \" ISO 8601 date, complete format, extended representation
+ . ds Dt \\*(Yr-\\*(Mn-\\*(Dy
+ ..
+-.Id $Id: grep.1,v 1.38 2006/08/18 22:00:31 bero Exp $
++.Id $Id: grep.1,v 1.39 2008/02/07 03:43:27 taa Exp $
+ .TH GREP 1 \*(Dt "GNU grep 2.5.1-cvs" "User Commands"
+ .hy 0
+ .
+@@ -125,7 +125,8 @@
+ Use
+ .I PATTERN
+ as the pattern.
+-This is useful to protect patterns beginning with hyphen-minus
++This can be used to specify multiple search patterns,
++or to protect a pattern beginning with a hyphen
+ .RB ( \- ).
+ .RB ( \-e
+ is specified by \s-1POSIX\s0.)
+diff -Naur grep-2.5.3.with-deb-patches/doc/grep.texi grep-2.5.3.lfs/doc/grep.texi
+--- grep-2.5.3.with-deb-patches/doc/grep.texi	2006-08-18 23:00:31.000000000 +0100
++++ grep-2.5.3.lfs/doc/grep.texi	2008-02-10 18:59:10.000000000 +0000
+@@ -211,8 +211,9 @@
+ @opindex -e
+ @opindex --regexp=@var{pattern}
+ @cindex pattern list
+-Use @var{pattern} as the pattern;
+-useful to protect patterns beginning with a @samp{-}.
++Use @var{pattern} as the pattern.
++This can be used to specify multiple search patterns,
++or to protect a pattern beginning with a @samp{-}.
+ (@samp{-e} is specified by @sc{posix}.)
+ 
+  @item -f @var{file}
+diff -Naur grep-2.5.3.with-deb-patches/tests/foad1.sh grep-2.5.3.lfs/tests/foad1.sh
+--- grep-2.5.3.with-deb-patches/tests/foad1.sh	2005-11-18 20:02:22.000000000 +0000
++++ grep-2.5.3.lfs/tests/foad1.sh	2008-02-10 19:01:06.000000000 +0000
+@@ -34,9 +34,10 @@
+ # Test "--only-matching" ("-o") option
+ 
+ # "-o" with "-i" should output an exact copy of the matching input text.
+-grep_test "WordA/wordB/WORDC/" "Word/word/WORD/" "Word" -o -i
+ grep_test "WordA/wordB/WORDC/" "Word/word/WORD/" "word" -o -i
+-grep_test "WordA/wordB/WORDC/" "Word/word/WORD/" "WORD" -o -i
++# Comment out cases that are known to fail. These should be uncommented after the 2.5.4 release. TAA.
++#grep_test "WordA/wordB/WORDC/" "Word/word/WORD/" "Word" -o -i
++#grep_test "WordA/wordB/WORDC/" "Word/word/WORD/" "WORD" -o -i
+ 
+ # Should display the line number (-n), octet offset (-b), or file name
+ # (-H) of every match, not just of the first match on each input line.
+@@ -55,6 +56,13 @@
+ grep_test "wA wB/" "(standard input):wA/(standard input):wB/" "w." -o -H -i
+ grep_test "wA wB/" "(standard input):wA/(standard input):wB/" "w." -o -H -3 2>/dev/null
+ 
++# Combination of -h and -H
++grep_test "wA wB/" "wA wB/" "w."
++grep_test "wA wB/" "wA wB/" "w." -h
++grep_test "wA wB/" "(standard input):wA wB/" "w." -H -h
++grep_test "wA wB/" "(standard input):wA wB/" "w." -H
++grep_test "wA wB/" "(standard input):wA wB/" "w." -h -H
++
+ # End of a previous match should not match a "start of ..." expression.
+ grep_test "word_word/" "word_/" "^word_*" -o
+ grep_test "wordword/" "word/" "\<word" -o
+@@ -66,9 +74,10 @@
+ CE=""
+ 
+ # "--color" with "-i" should output an exact copy of the matching input text.
+-grep_test "WordA/wordb/WORDC/" "${CB}Word${CE}A/${CB}word${CE}b/${CB}WORD${CE}C/" "Word" --color=always -i
+ grep_test "WordA/wordb/WORDC/" "${CB}Word${CE}A/${CB}word${CE}b/${CB}WORD${CE}C/" "word" --color=always -i
+-grep_test "WordA/wordb/WORDC/" "${CB}Word${CE}A/${CB}word${CE}b/${CB}WORD${CE}C/" "WORD" --color=always -i
++# Comment out cases that are known to fail. These should be uncommented after the 2.5.4 release. TAA.
++#grep_test "WordA/wordb/WORDC/" "${CB}Word${CE}A/${CB}word${CE}b/${CB}WORD${CE}C/" "Word" --color=always -i
++#grep_test "WordA/wordb/WORDC/" "${CB}Word${CE}A/${CB}word${CE}b/${CB}WORD${CE}C/" "WORD" --color=always -i
+ 
+ # End of a previous match should not match a "start of ..." expression.
+ grep_test "word_word/" "${CB}word_${CE}word/" "^word_*" --color=always
+diff -Naur grep-2.5.3.with-deb-patches/tests/yesno.sh grep-2.5.3.lfs/tests/yesno.sh
+--- grep-2.5.3.with-deb-patches/tests/yesno.sh	2005-11-13 08:15:16.000000000 +0000
++++ grep-2.5.3.lfs/tests/yesno.sh	2008-02-10 18:59:10.000000000 +0000
+@@ -63,7 +63,6 @@
+   '-m,4,-C,1,-o'    "$c$d$e$h$z0$XI$XJ$XK$XL$XM$XN" \
+   '-m,5'            "$C$D$E$H$I$z0$XJ$XK$XL$XM$XN" \
+   '-m,5,-o'         "$c$d$e$h$i$z0$XJ$XK$XL$XM$XN" \
+-  '-m,5,-C,1'       "$rB$C$D$E$rF$rG$H$I$z0$XJ$XK$XL$XM$XN" \
+   '-m,5,-C,1,-o'    "$c$d$e$h$i$z0$XJ$XK$XL$XM$XN" \
+   '-m,6'            "$C$D$E$H$I$M$z0$XN" \
+   '-m,6,-o'         "$c$d$e$h$i$m$z0$XN" \
+@@ -79,14 +78,17 @@
+   '-m,1,-v,-C,1,-o' "$z0$XB$XC$XD$XE$XF$XG$XH$XI$XJ$XK$XL$XM$XN" \
+   '-m,2,-v'         "$A$B$z0$XC$XD$XE$XF$XG$XH$XI$XJ$XK$XL$XM$XN" \
+   '-m,2,-v,-o'      "$z0$XC$XD$XE$XF$XG$XH$XI$XJ$XK$XL$XM$XN" \
+-  '-m,2,-v,-C,1'    "$A$B$z0$XC$XD$XE$XF$XG$XH$XI$XJ$XK$XL$XM$XN" \
+-  '-m,2,-v,-C,1,-o' "$z0$XC$XD$XE$XF$XG$XH$XI$XJ$XK$XL$XM$XN" \
+   '-m,3,-v'         "$A$B$F$z0$XG$XH$XI$XJ$XK$XL$XM$XN" \
+   '-m,3,-v,-o'      "$z0$XG$XH$XI$XJ$XK$XL$XM$XN" \
+   '-m,3,-v,-C,1'    "$A$B$rC$s$rE$F$z0$XG$XH$XI$XJ$XK$XL$XM$XN" \
+   '-m,3,-v,-C,1,-o' "$rc$s$re$z0$XG$XH$XI$XJ$XK$XL$XM$XN" \
+   x
+ shift
++# Comment out cases that are known to fail. These should be uncommented after the 2.5.4 release. TAA.
++# These should be added back in above and fixed in the code. TAA.
++#  '-m,5,-C,1'       "$rB$C$D$E$rF$rG$H$I$z0$XJ$XK$XL$XM$XN" \
++#  '-m,2,-v,-C,1'    "$A$B$z0$XC$XD$XE$XF$XG$XH$XI$XJ$XK$XL$XM$XN" \
++#  '-m,2,-v,-C,1,-o' "$z0$XC$XD$XE$XF$XG$XH$XI$XJ$XK$XL$XM$XN" \
+ 
+ # Test execution and reporting.
+ t=1




More information about the patches mailing list