Modify loadSegmentLeavesInt() to correctly handle prefix searching.

The new function docListUnion() is used to accumulate a union of the hits for the matching terms, which will be merged across segments using docListMerge(). (CVS 3891) FossilOrigin-Name: 72c796307338c2751a91c30f6fb16989afbf3816
2007-05-01 17:14:59 +00:00 · 2007-05-01 17:14:59 +00:00 · cc3e986643
commit cc3e986643
parent c80f058db3
3 changed files with 170 additions and 30 deletions
--- a/ext/fts2/fts2.c
+++ b/ext/fts2/fts2.c
@ -708,6 +708,7 @@ static void docListValidate(DocListType iType, const char *pData, int nData,
 ** dlwInit - initialize to write a given type doclistto a buffer.
 ** dlwDestroy - clear the writer's memory.  Does not free buffer.
 ** dlwAppend - append raw doclist data to buffer.
+** dlwCopy - copy next doclist from reader to writer.
 ** dlwAdd - construct doclist element and append to buffer.
 **    Only apply dlwAdd() to DL_DOCIDS doclists (else use PLWriter).
 */
@ -771,6 +772,10 @@ static void dlwAppend(DLWriter *pWriter,
  }
  pWriter->iPrevDocid = iLastDocid;
 }
+static void dlwCopy(DLWriter *pWriter, DLReader *pReader){
+  dlwAppend(pWriter, dlrDocData(pReader), dlrDocDataBytes(pReader),
+            dlrDocid(pReader), dlrDocid(pReader));
+}
 static void dlwAdd(DLWriter *pWriter, sqlite_int64 iDocid){
  char c[VARINT_MAX];
  int n = putVarint(c, iDocid-pWriter->iPrevDocid);
@ -886,6 +891,7 @@ static void plrDestroy(PLReader *pReader){
 ** plwInit - init for writing a document's poslist.
 ** plwDestroy - clear a writer.
 ** plwAdd - append position and offset information.
+** plwCopy - copy next position's data from reader to writer.
 ** plwTerminate - add any necessary doclist terminator.
 **
 ** Calling plwAdd() after plwTerminate() may result in a corrupt
@ -945,6 +951,10 @@ static void plwAdd(PLWriter *pWriter, int iColumn, int iPos,
  }
  dataBufferAppend(pWriter->dlw->b, c, n);
 }
+static void plwCopy(PLWriter *pWriter, PLReader *pReader){
+  plwAdd(pWriter, plrColumn(pReader), plrPosition(pReader),
+         plrStartOffset(pReader), plrEndOffset(pReader));
+}
 static void plwInit(PLWriter *pWriter, DLWriter *dlw, sqlite_int64 iDocid){
  char c[VARINT_MAX];
  int n;
@ -1218,6 +1228,122 @@ static void docListMerge(DataBuffer *out,
  dlwDestroy(&writer);
 }

+/* Helper function for posListUnion().  Compares the current position
+** between left and right, returning as standard C idiom of <0 if
+** left<right, >0 if left>right, and 0 if left==right.  "End" always
+** compares greater.
+*/
+static int posListCmp(PLReader *pLeft, PLReader *pRight){
+  assert( pLeft->iType==pRight->iType );
+  if( pLeft->iType==DL_DOCIDS ) return 0;
+
+  if( plrAtEnd(pLeft) ) return plrAtEnd(pRight) ? 0 : 1;
+  if( plrAtEnd(pRight) ) return -1;
+
+  if( plrColumn(pLeft)<plrColumn(pRight) ) return -1;
+  if( plrColumn(pLeft)>plrColumn(pRight) ) return 1;
+
+  if( plrPosition(pLeft)<plrPosition(pRight) ) return -1;
+  if( plrPosition(pLeft)>plrPosition(pRight) ) return 1;
+  if( pLeft->iType==DL_POSITIONS ) return 0;
+
+  if( plrStartOffset(pLeft)<plrStartOffset(pRight) ) return -1;
+  if( plrStartOffset(pLeft)>plrStartOffset(pRight) ) return 1;
+
+  if( plrEndOffset(pLeft)<plrEndOffset(pRight) ) return -1;
+  if( plrEndOffset(pLeft)>plrEndOffset(pRight) ) return 1;
+
+  return 0;
+}
+
+/* Write the union of position lists in pLeft and pRight to pOut.
+** "Union" in this case meaning "All unique position tuples".  Should
+** work with any doclist type, though both inputs and the output
+** should be the same type.
+*/
+static void posListUnion(DLReader *pLeft, DLReader *pRight, DLWriter *pOut){
+  PLReader left, right;
+  PLWriter writer;
+
+  assert( dlrDocid(pLeft)==dlrDocid(pRight) );
+  assert( pLeft->iType==pRight->iType );
+  assert( pLeft->iType==pOut->iType );
+
+  plrInit(&left, pLeft);
+  plrInit(&right, pRight);
+  plwInit(&writer, pOut, dlrDocid(pLeft));
+
+  while( !plrAtEnd(&left) || !plrAtEnd(&right) ){
+    int c = posListCmp(&left, &right);
+    if( c<0 ){
+      plwCopy(&writer, &left);
+      plrStep(&left);
+    }else if( c>0 ){
+      plwCopy(&writer, &right);
+      plrStep(&right);
+    }else{
+      plwCopy(&writer, &left);
+      plrStep(&left);
+      plrStep(&right);
+    }
+  }
+
+  plwTerminate(&writer);
+  plwDestroy(&writer);
+  plrDestroy(&left);
+  plrDestroy(&right);
+}
+
+/* Write the union of doclists in pLeft and pRight to pOut.  For
+** docids in common between the inputs, the union of the position
+** lists is written.  Inputs and outputs are always type DL_DEFAULT.
+*/
+static void docListUnion(
+  const char *pLeft, int nLeft,
+  const char *pRight, int nRight,
+  DataBuffer *pOut      /* Write the combined doclist here */
+){
+  DLReader left, right;
+  DLWriter writer;
+
+  if( nLeft==0 ){
+    dataBufferAppend(pOut, pRight, nRight);
+    return;
+  }
+  if( nRight==0 ){
+    dataBufferAppend(pOut, pLeft, nLeft);
+    return;
+  }
+
+  dlrInit(&left, DL_DEFAULT, pLeft, nLeft);
+  dlrInit(&right, DL_DEFAULT, pRight, nRight);
+  dlwInit(&writer, DL_DEFAULT, pOut);
+
+  while( !dlrAtEnd(&left) || !dlrAtEnd(&right) ){
+    if( dlrAtEnd(&right) ){
+      dlwCopy(&writer, &left);
+      dlrStep(&left);
+    }else if( dlrAtEnd(&left) ){
+      dlwCopy(&writer, &right);
+      dlrStep(&right);
+    }else if( dlrDocid(&left)<dlrDocid(&right) ){
+      dlwCopy(&writer, &left);
+      dlrStep(&left);
+    }else if( dlrDocid(&left)>dlrDocid(&right) ){
+      dlwCopy(&writer, &right);
+      dlrStep(&right);
+    }else{
+      posListUnion(&left, &right, &writer);
+      dlrStep(&left);
+      dlrStep(&right);
+    }
+  }
+
+  dlrDestroy(&left);
+  dlrDestroy(&right);
+  dlwDestroy(&writer);
+}
+
 /* pLeft and pRight are DLReaders positioned to the same docid.
 **
 ** If there are no instances in pLeft or pRight where the position
@ -1230,7 +1356,8 @@ static void docListMerge(DataBuffer *out,
 ** include the positions from pRight that are one more than a
 ** position in pLeft.  In other words:  pRight.iPos==pLeft.iPos+1.
 */
-static void mergePosList(DLReader *pLeft, DLReader *pRight, DLWriter *pOut){
+static void posListPhraseMerge(DLReader *pLeft, DLReader *pRight,
+                               DLWriter *pOut){
  PLReader left, right;
  PLWriter writer;
  int match = 0;
@ -1302,7 +1429,7 @@ static void docListPhraseMerge(
    }else if( dlrDocid(&right)<dlrDocid(&left) ){
      dlrStep(&right);
    }else{
-      mergePosList(&left, &right, &writer);
+      posListPhraseMerge(&left, &right, &writer);
      dlrStep(&left);
      dlrStep(&right);
    }
@ -4757,9 +4884,11 @@ static void leafReaderStep(LeafReader *pReader){
  }
 }

-/* strcmp-style comparison of pReader's current term against pTerm. */
+/* strcmp-style comparison of pReader's current term against pTerm.
+** If isPrefix, equality means equal through nTerm bytes.
+*/
 static int leafReaderTermCmp(LeafReader *pReader,
-                             const char *pTerm, int nTerm){
+                             const char *pTerm, int nTerm, int isPrefix){
  int c, n = pReader->term.nData<nTerm ? pReader->term.nData : nTerm;
  if( n==0 ){
    if( pReader->term.nData>0 ) return -1;
@ -4769,6 +4898,7 @@ static int leafReaderTermCmp(LeafReader *pReader,

  c = memcmp(pReader->term.pData, pTerm, n);
  if( c!=0 ) return c;
+  if( isPrefix && n==nTerm ) return 0;
  return pReader->term.nData - nTerm;
 }

@ -4916,7 +5046,8 @@ static int leavesReaderTermCmp(LeavesReader *lr1, LeavesReader *lr2){
  if( leavesReaderAtEnd(lr2) ) return -1;

  return leafReaderTermCmp(&lr1->leafReader,
-                           leavesReaderTerm(lr2), leavesReaderTermBytes(lr2));
+                           leavesReaderTerm(lr2), leavesReaderTermBytes(lr2),
+                           0);
 }

 /* Similar to leavesReaderTermCmp(), with additional ordering by idx
@ -5105,7 +5236,8 @@ static int segmentMerge(fulltext_vtab *v, int iLevel){
 ** Internal function for loadSegmentLeaf().
 */
 static int loadSegmentLeavesInt(fulltext_vtab *v, LeavesReader *pReader,
-                                const char *pTerm, int nTerm, DataBuffer *out){
+                                const char *pTerm, int nTerm, int isPrefix,
+                                DataBuffer *out){
  assert( nTerm>0 );

  /* Process while the prefix matches. */
@ -5115,14 +5247,25 @@ static int loadSegmentLeavesInt(fulltext_vtab *v, LeavesReader *pReader,
    ** on a better name.  [Meanwhile, break encapsulation rather than
    ** use a confusing name.]
    */
-    int rc, c = leafReaderTermCmp(&pReader->leafReader, pTerm, nTerm);
+    int rc;
+    int c = leafReaderTermCmp(&pReader->leafReader, pTerm, nTerm, isPrefix);
    if( c==0 ){
      const char *pData = leavesReaderData(pReader);
      int nData = leavesReaderDataBytes(pReader);
-      assert( out->nData==0 );
-      dataBufferReplace(out, pData, nData);
+      if( out->nData==0 ){
+        dataBufferReplace(out, pData, nData);
+      }else{
+        DataBuffer result;
+        dataBufferInit(&result, out->nData+nData);
+        docListUnion(out->pData, out->nData, pData, nData, &result);
+        dataBufferDestroy(out);
+        *out = result;
+        /* TODO(shess) Rather than destroy out, we could retain it for
+        ** later reuse.
+        */
+      }
    }
-    if( c>=0 ) break;      /* Past any possible matches. */
+    if( c>0 ) break;      /* Past any possible matches. */

    rc = leavesReaderStep(v, pReader);
    if( rc!=SQLITE_OK ) return rc;
@ -5132,7 +5275,8 @@ static int loadSegmentLeavesInt(fulltext_vtab *v, LeavesReader *pReader,

 /* Call loadSegmentLeavesInt() with pData/nData as input. */
 static int loadSegmentLeaf(fulltext_vtab *v, const char *pData, int nData,
-                           const char *pTerm, int nTerm, DataBuffer *out){
+                           const char *pTerm, int nTerm, int isPrefix,
+                           DataBuffer *out){
  LeavesReader reader;
  int rc;

@ -5141,7 +5285,7 @@ static int loadSegmentLeaf(fulltext_vtab *v, const char *pData, int nData,
  rc = leavesReaderInit(v, 0, 0, 0, pData, nData, &reader);
  if( rc!=SQLITE_OK ) return rc;

-  rc = loadSegmentLeavesInt(v, &reader, pTerm, nTerm, out);
+  rc = loadSegmentLeavesInt(v, &reader, pTerm, nTerm, isPrefix, out);
  leavesReaderReset(&reader);
  leavesReaderDestroy(&reader);
  return rc;
@ -5153,7 +5297,8 @@ static int loadSegmentLeaf(fulltext_vtab *v, const char *pData, int nData,
 */
 static int loadSegmentLeaves(fulltext_vtab *v,
                             sqlite_int64 iStartLeaf, sqlite_int64 iEndLeaf,
-                             const char *pTerm, int nTerm, DataBuffer *out){
+                             const char *pTerm, int nTerm, int isPrefix,
+                             DataBuffer *out){
  int rc;
  LeavesReader reader;

@ -5161,7 +5306,7 @@ static int loadSegmentLeaves(fulltext_vtab *v,
  rc = leavesReaderInit(v, 0, iStartLeaf, iEndLeaf, NULL, 0, &reader);
  if( rc!=SQLITE_OK ) return rc;

-  rc = loadSegmentLeavesInt(v, &reader, pTerm, nTerm, out);
+  rc = loadSegmentLeavesInt(v, &reader, pTerm, nTerm, isPrefix, out);
  leavesReaderReset(&reader);
  leavesReaderDestroy(&reader);
  return rc;
@ -5258,8 +5403,7 @@ static int loadSegmentInt(fulltext_vtab *v, const char *pData, int nData,
                          DataBuffer *out){
  /* Special case where root is a leaf. */
  if( *pData=='\0' ){
-    assert( !isPrefix );   /* TODO(shess) Add prefix support. */
-    return loadSegmentLeaf(v, pData, nData, pTerm, nTerm, out);
+    return loadSegmentLeaf(v, pData, nData, pTerm, nTerm, isPrefix, out);
  }else{
    int rc;
    sqlite_int64 iStartChild, iEndChild;
@ -5290,8 +5434,8 @@ static int loadSegmentInt(fulltext_vtab *v, const char *pData, int nData,
    assert( iStartChild<=iLeavesEnd );
    assert( iEndChild<=iLeavesEnd );

-    assert( !isPrefix );   /* TODO(shess) Add prefix support. */
-    return loadSegmentLeaves(v, iStartChild, iEndChild, pTerm, nTerm, out);
+    return loadSegmentLeaves(v, iStartChild, iEndChild,
+                             pTerm, nTerm, isPrefix, out);
  }
 }

@ -5299,10 +5443,6 @@ static int loadSegmentInt(fulltext_vtab *v, const char *pData, int nData,
 ** merge its doclist over *out (any duplicate doclists read from the
 ** segment rooted at pData will overwrite those in *out).
 */
-/* NOTE(shess) Previous code passed out down to sub-routines for use
-** in docListMerge().  This version deoptimizes things slightly, but
-** prefix searches require a different merge function entirely.
-*/
 static int loadSegment(fulltext_vtab *v, const char *pData, int nData,
                       sqlite_int64 iLeavesEnd,
                       const char *pTerm, int nTerm, int isPrefix,
--- a/14
+++ b/14
@ -1,5 +1,5 @@
-C The\spager\stakes\sthe\ssector\ssize\sto\sbe\sthe\slarger\sof\sthe\ssector\ssize\nreported\sby\ssqlite3OsSectorSize()\sand\sthe\spage\ssize.\s(CVS\s3890)
-D 2007-05-01T16:59:49
+C Modify\sloadSegmentLeavesInt()\sto\scorrectly\shandle\sprefix\ssearching.\nThe\snew\sfunction\sdocListUnion()\sis\sused\sto\saccumulate\sa\sunion\sof\sthe\nhits\sfor\sthe\smatching\sterms,\swhich\swill\sbe\smerged\sacross\ssegments\nusing\sdocListMerge().\s(CVS\s3891)
+D 2007-05-01T17:14:59
 F Makefile.in 8cab54f7c9f5af8f22fd97ddf1ecfd1e1860de62
 F Makefile.linux-gcc 2d8574d1ba75f129aba2019f0b959db380a90935
 F README 9c4e2d6706bdcc3efdd773ce752a8cdab4f90028
@ -34,7 +34,7 @@ F ext/fts1/fulltext.h 08525a47852d1d62a0be81d3fc3fe2d23b094efd
 F ext/fts1/simple_tokenizer.c 1844d72f7194c3fd3d7e4173053911bf0661b70d
 F ext/fts1/tokenizer.h 0c53421b832366d20d720d21ea3e1f6e66a36ef9
 F ext/fts2/README.txt 8c18f41574404623b76917b9da66fcb0ab38328d
-F ext/fts2/fts2.c c750b2db623587021a402631a7aa582d81852c44
+F ext/fts2/fts2.c a6762b7a6cc173eb83a0aa9506c1b7be66f00786
 F ext/fts2/fts2.h 591916a822cfb6426518fdbf6069359119bc46eb
 F ext/fts2/fts2_hash.c b3f22116d4ef0bc8f2da6e3fdc435c86d0951a9b
 F ext/fts2/fts2_hash.h e283308156018329f042816eb09334df714e105e
@ -466,7 +466,7 @@ F www/tclsqlite.tcl bb0d1357328a42b1993d78573e587c6dcbc964b9
 F www/vdbe.tcl 87a31ace769f20d3627a64fa1fade7fed47b90d0
 F www/version3.tcl 890248cf7b70e60c383b0e84d77d5132b3ead42b
 F www/whentouse.tcl fc46eae081251c3c181bd79c5faef8195d7991a5
-P cae844a01a1d87ffb00bba8b4e7b62a92e633aa9
-R 195e13614b6e5b993d99e293e129cc37
-U drh
-Z e4fd6258dd404077dd1b05dd5e45d3e2
+P e5e6af55ccc5c1a8a9206b42f1dd7bf547cb97ca
+R 2c25fe5630cee9d287b5318018624ff2
+U shess
+Z bcafc47e1cdd98b2861c3ec5b23e7e20
--- a/manifest.uuid
+++ b/manifest.uuid
@ -1 +1 @@
-e5e6af55ccc5c1a8a9206b42f1dd7bf547cb97ca
+72c796307338c2751a91c30f6fb16989afbf3816