From 91bef41bdeb5c344055e84371770e399946188a4 Mon Sep 17 00:00:00 2001 From: Sreekanth Sivasankaran Date: Tue, 28 Jul 2020 13:54:33 +0530 Subject: [PATCH 1/3] New norm disk format for zap The current norm encoding consistently leaves a footprint of more than 5 bytes on disk, irrespective of the actual field length. This size bloat can be avoided if we encode the field length directly to disk and decode it correctly at read time. More than a 30% reduction in index size is observed with this change. The current change is also restricted to zap itself. As this fix changes the meaning of data present on disk, it mandates a zap version bump. There could be variations in the norm's precision stemming from the floating point precision loss in uint64 to float64 conversions for some higher field length values, but we are trading that off for the simple conversion logic. Any alternate suggestions are appreciated. --- build.go | 2 +- merge.go | 2 +- merge_test.go | 22 ++++++++++++++++++++++ new.go | 6 +++--- posting.go | 18 ++++++++++++++---- 5 files changed, 41 insertions(+), 9 deletions(-) diff --git a/build.go b/build.go index 7a8dce0..1b73183 100644 --- a/build.go +++ b/build.go @@ -22,7 +22,7 @@ import ( "github.com/couchbase/vellum" ) -const Version uint32 = 14 +const Version uint32 = 15 const Type string = "zap" diff --git a/merge.go b/merge.go index 805100f..58b89f9 100644 --- a/merge.go +++ b/merge.go @@ -487,7 +487,7 @@ func mergeTermFreqNormLocs(fieldsMap map[string]uint16, term []byte, postItr *Po newRoaring.Add(uint32(hitNewDocNum)) nextFreq := next.Frequency() - nextNorm := uint64(math.Float32bits(float32(next.Norm()))) + nextNorm := fieldLenFromNorm(next.Norm()) locs := next.Locations() diff --git a/merge_test.go b/merge_test.go index 4862b27..a8252cd 100644 --- a/merge_test.go +++ b/merge_test.go @@ -870,3 +870,25 @@ func TestUnder32Bits(t *testing.T) { t.Errorf("under32Bits wrong") } } + +func TestEncodeDecodeFieldLenAndNorm(t *testing.T) { + // 
verifying the floating point correctness for field + // length and norm encode and decode ops up to some + // reasonable field length of 2400. + fieldLensIn := make([]uint64, 2400) + for i := range fieldLensIn { + fieldLensIn[i] = uint64(i + 1) + } + + norms := make([]float64, 2400) + for i := range fieldLensIn { + norms[i] = normFromFieldLen(fieldLensIn[i]) + } + + for i := range norms { + if fieldLenFromNorm(norms[i]) != fieldLensIn[i] { + t.Errorf("Field length for norm: %v, expected: %d, got %d", + norms[i], fieldLensIn[i], fieldLenFromNorm(norms[i])) + } + } +} diff --git a/new.go b/new.go index 9815818..5ce12a1 100644 --- a/new.go +++ b/new.go @@ -210,7 +210,7 @@ type interimStoredField struct { type interimFreqNorm struct { freq uint64 - norm float32 + norm uint64 numLocs int } @@ -456,7 +456,7 @@ func (s *interim) processDocument(docNum uint64, // now that it's been rolled up into fieldTFs, walk that for fieldID, tfs := range fieldTFs { dict := s.Dicts[fieldID] - norm := float32(1.0 / math.Sqrt(float64(fieldLens[fieldID]))) + norm := uint64(fieldLens[fieldID]) for term, tf := range tfs { pid := dict[term] - 1 @@ -669,7 +669,7 @@ func (s *interim) writeDicts() (fdvIndexOffset uint64, dictOffsets []uint64, err err = tfEncoder.Add(docNum, encodeFreqHasLocs(freqNorm.freq, freqNorm.numLocs > 0), - uint64(math.Float32bits(freqNorm.norm))) + freqNorm.norm) if err != nil { return 0, nil, err } diff --git a/posting.go b/posting.go index 3a6ee54..34a1102 100644 --- a/posting.go +++ b/posting.go @@ -92,7 +92,7 @@ func under32Bits(x uint64) bool { const DocNum1HitFinished = math.MaxUint64 -var NormBits1Hit = uint64(math.Float32bits(float32(1))) +var NormBits1Hit = uint64(1) // PostingsList is an in-memory representation of a postings list type PostingsList struct { @@ -479,7 +479,7 @@ func (i *PostingsIterator) nextAtOrAfter(atOrAfter uint64) (segment.Posting, err return nil, err } - rv.norm = math.Float32frombits(uint32(normBits)) + rv.norm = 
normFromFieldLen(normBits) if i.includeLocs && hasLocs { // prepare locations into reused slices, where we assume @@ -722,7 +722,7 @@ func PostingsIteratorFrom1Hit(docNum1Hit uint64, type Posting struct { docNum uint64 freq uint64 - norm float32 + norm float64 locs []segment.Location } @@ -748,7 +748,7 @@ func (p *Posting) Frequency() uint64 { // Norm returns the normalization factor for this posting func (p *Posting) Norm() float64 { - return float64(p.norm) + return p.norm } // Locations returns the location information for each occurrence @@ -796,3 +796,13 @@ func (l *Location) Pos() uint64 { func (l *Location) ArrayPositions() []uint64 { return l.ap } + +func fieldLenFromNorm(norm float64) uint64 { + rv := float64(int(1000*1/norm)) / 1000 + return uint64(math.Ceil(rv * rv)) +} + +func normFromFieldLen(fieldLen uint64) float64 { + rv := float32(1.0 / math.Sqrt(float64(fieldLen))) + return float64(rv) +} From ac3f18039ef4348736c9151be80f63f91539bf8d Mon Sep 17 00:00:00 2001 From: Sreekanth Sivasankaran Date: Fri, 31 Jul 2020 13:51:36 +0530 Subject: [PATCH 2/3] removing the norm decoding logic from zap --- merge.go | 2 +- merge_test.go | 22 ---------------------- new.go | 6 +++--- posting.go | 17 +++-------------- segment_test.go | 2 +- 5 files changed, 8 insertions(+), 41 deletions(-) diff --git a/merge.go b/merge.go index 58b89f9..ffbd4c0 100644 --- a/merge.go +++ b/merge.go @@ -487,7 +487,7 @@ func mergeTermFreqNormLocs(fieldsMap map[string]uint16, term []byte, postItr *Po newRoaring.Add(uint32(hitNewDocNum)) nextFreq := next.Frequency() - nextNorm := fieldLenFromNorm(next.Norm()) + nextNorm := uint64(next.Norm()) locs := next.Locations() diff --git a/merge_test.go b/merge_test.go index a8252cd..4862b27 100644 --- a/merge_test.go +++ b/merge_test.go @@ -870,25 +870,3 @@ func TestUnder32Bits(t *testing.T) { t.Errorf("under32Bits wrong") } } - -func TestEncodeDecodeFieldLenAndNorm(t *testing.T) { - // verifying the floating point correctness for field - // length 
and norm encode and decode ops up to some - // reasonable field length of 2400. - fieldLensIn := make([]uint64, 2400) - for i := range fieldLensIn { - fieldLensIn[i] = uint64(i + 1) - } - - norms := make([]float64, 2400) - for i := range fieldLensIn { - norms[i] = normFromFieldLen(fieldLensIn[i]) - } - - for i := range norms { - if fieldLenFromNorm(norms[i]) != fieldLensIn[i] { - t.Errorf("Field length for norm: %v, expected: %d, got %d", - norms[i], fieldLensIn[i], fieldLenFromNorm(norms[i])) - } - } -} diff --git a/new.go b/new.go index 5ce12a1..c10a6a0 100644 --- a/new.go +++ b/new.go @@ -210,7 +210,7 @@ type interimStoredField struct { type interimFreqNorm struct { freq uint64 - norm uint64 + norm float32 numLocs int } @@ -456,7 +456,7 @@ func (s *interim) processDocument(docNum uint64, // now that it's been rolled up into fieldTFs, walk that for fieldID, tfs := range fieldTFs { dict := s.Dicts[fieldID] - norm := uint64(fieldLens[fieldID]) + norm := math.Float32frombits(uint32(fieldLens[fieldID])) for term, tf := range tfs { pid := dict[term] - 1 @@ -669,7 +669,7 @@ func (s *interim) writeDicts() (fdvIndexOffset uint64, dictOffsets []uint64, err err = tfEncoder.Add(docNum, encodeFreqHasLocs(freqNorm.freq, freqNorm.numLocs > 0), - freqNorm.norm) + uint64(math.Float32bits(freqNorm.norm))) if err != nil { return 0, nil, err } diff --git a/posting.go b/posting.go index 34a1102..832f090 100644 --- a/posting.go +++ b/posting.go @@ -478,8 +478,7 @@ func (i *PostingsIterator) nextAtOrAfter(atOrAfter uint64) (segment.Posting, err if err != nil { return nil, err } - - rv.norm = normFromFieldLen(normBits) + rv.norm = float32(normBits) if i.includeLocs && hasLocs { // prepare locations into reused slices, where we assume @@ -722,7 +721,7 @@ func PostingsIteratorFrom1Hit(docNum1Hit uint64, type Posting struct { docNum uint64 freq uint64 - norm float64 + norm float32 locs []segment.Location } @@ -748,7 +747,7 @@ func (p *Posting) Frequency() uint64 { // Norm returns the 
normalization factor for this posting func (p *Posting) Norm() float64 { - return p.norm + return float64(p.norm) } // Locations returns the location information for each occurrence @@ -796,13 +795,3 @@ func (l *Location) Pos() uint64 { func (l *Location) ArrayPositions() []uint64 { return l.ap } - -func fieldLenFromNorm(norm float64) uint64 { - rv := float64(int(1000*1/norm)) / 1000 - return uint64(math.Ceil(rv * rv)) -} - -func normFromFieldLen(fieldLen uint64) float64 { - rv := float32(1.0 / math.Sqrt(float64(fieldLen))) - return float64(rv) -} diff --git a/segment_test.go b/segment_test.go index cc69bd8..212a60c 100644 --- a/segment_test.go +++ b/segment_test.go @@ -216,7 +216,7 @@ func TestOpen(t *testing.T) { if nextPosting.Number() != 0 { t.Errorf("expected doc number 0, got %d", nextPosting.Number()) } - expectedNorm := float32(1.0 / math.Sqrt(float64(5))) + expectedNorm := float32(5) if nextPosting.Norm() != float64(expectedNorm) { t.Errorf("expected norm %f, got %f", expectedNorm, nextPosting.Norm()) } From e6acb6144baa67b1dfde099aef4977ecd84d96d6 Mon Sep 17 00:00:00 2001 From: Sreekanth Sivasankaran Date: Tue, 29 Sep 2020 08:28:11 +0530 Subject: [PATCH 3/3] Adding NormUint64() utility method for the internal norm value access within zap. Processed/decoded value of norm would still be available over the existing Norm() method. 
--- merge.go | 7 ++++++- posting.go | 10 ++++++++-- segment_test.go | 2 +- 3 files changed, 15 insertions(+), 4 deletions(-) diff --git a/merge.go b/merge.go index ffbd4c0..1de0deb 100644 --- a/merge.go +++ b/merge.go @@ -487,7 +487,12 @@ func mergeTermFreqNormLocs(fieldsMap map[string]uint16, term []byte, postItr *Po newRoaring.Add(uint32(hitNewDocNum)) nextFreq := next.Frequency() - nextNorm := uint64(next.Norm()) + var nextNorm uint64 + if pi, ok := next.(*Posting); ok { + nextNorm = pi.NormUint64() + } else { + return 0, 0, 0, nil, fmt.Errorf("unexpected posting type %T", next) + } locs := next.Locations() diff --git a/posting.go b/posting.go index 832f090..53521ab 100644 --- a/posting.go +++ b/posting.go @@ -478,7 +478,8 @@ func (i *PostingsIterator) nextAtOrAfter(atOrAfter uint64) (segment.Posting, err if err != nil { return nil, err } - rv.norm = float32(normBits) + + rv.norm = math.Float32frombits(uint32(normBits)) if i.includeLocs && hasLocs { // prepare locations into reused slices, where we assume @@ -747,7 +748,7 @@ func (p *Posting) Frequency() uint64 { // Norm returns the normalization factor for this posting func (p *Posting) Norm() float64 { - return float64(p.norm) + return float64(float32(1.0 / math.Sqrt(float64(math.Float32bits(p.norm))))) } // Locations returns the location information for each occurrence @@ -755,6 +756,11 @@ func (p *Posting) Locations() []segment.Location { return p.locs } +// NormUint64 returns the norm value as uint64 +func (p *Posting) NormUint64() uint64 { + return uint64(math.Float32bits(p.norm)) +} + // Location represents the location of a single occurrence type Location struct { field string diff --git a/segment_test.go b/segment_test.go index 212a60c..cc69bd8 100644 --- a/segment_test.go +++ b/segment_test.go @@ -216,7 +216,7 @@ func TestOpen(t *testing.T) { if nextPosting.Number() != 0 { t.Errorf("expected doc number 0, got %d", nextPosting.Number()) } - expectedNorm := float32(5) + expectedNorm := float32(1.0 / 
math.Sqrt(float64(5))) if nextPosting.Norm() != float64(expectedNorm) { t.Errorf("expected norm %f, got %f", expectedNorm, nextPosting.Norm()) }