diff --git a/.gitignore b/.gitignore index ab81965..1d4f234 100644 --- a/.gitignore +++ b/.gitignore @@ -1,16 +1,3 @@ -# Binaries for programs and plugins -*.exe -*.exe~ -*.dll -*.so -*.dylib - -# Test binary, built with `go test -c` -*.test - -# Output of the go coverage tool, specifically when used with LiteIDE -*.out -.idea -# Dependency directories (remove the comment below to include it) -# vendor/ -.vscode \ No newline at end of file +.vscode +work_test +testdata \ No newline at end of file diff --git a/db.go b/db.go index df86de1..b26c08c 100644 --- a/db.go +++ b/db.go @@ -1,55 +1,104 @@ package corekv import ( - "github.com/hardcore-os/corekv/iterator" + "expvar" + "fmt" + "math" + "sync" + "sync/atomic" + "time" + "github.com/hardcore-os/corekv/lsm" "github.com/hardcore-os/corekv/utils" - "github.com/hardcore-os/corekv/utils/codec" - "github.com/hardcore-os/corekv/vlog" + "github.com/pkg/errors" ) type ( // coreKV对外提供的功能集合 CoreAPI interface { - Set(data *codec.Entry) error - Get(key []byte) (*codec.Entry, error) + Set(data *utils.Entry) error + Get(key []byte) (*utils.Entry, error) Del(key []byte) error - NewIterator(opt *iterator.Options) iterator.Iterator + NewIterator(opt *utils.Options) utils.Iterator Info() *Stats Close() error } // DB 对外暴露的接口对象 全局唯一,持有各种资源句柄 DB struct { - opt *Options - lsm *lsm.LSM - vlog *vlog.VLog - stats *Stats + sync.RWMutex + opt *Options + lsm *lsm.LSM + vlog *valueLog + stats *Stats + flushChan chan flushTask // For flushing memtables. + writeCh chan *request + blockWrites int32 + vhead *utils.ValuePtr + logRotates int32 } ) -func Open(options *Options) *DB { - db := &DB{opt: options} - // 初始化LSM结构 - db.lsm = lsm.NewLSM(&lsm.Options{}) +var ( + head = []byte("!corekv!head") // For storing value offset for replay. 
+) + +/** +SSTableMaxSz: 1024, +MemTableSize: 1024, +BlockSize: 1024, +BloomFalsePositive: 0, +BaseLevelSize: 10 << 20, +LevelSizeMultiplier: 10, +BaseTableSize: 2 << 20, +TableSizeMultiplier: 2, +NumLevelZeroTables: 15, +MaxLevelNum: 7, +NumCompactors: 3, +*/ +// Open DB +// TODO 这里是不是要上一个目录锁比较好,防止多个进程打开同一个目录? +func Open(opt *Options) *DB { + c := utils.NewCloser() + db := &DB{opt: opt} // 初始化vlog结构 - db.vlog = vlog.NewVLog(&vlog.Options{}) + db.initVLog() + // 初始化LSM结构 + db.lsm = lsm.NewLSM(&lsm.Options{ + WorkDir: opt.WorkDir, + MemTableSize: opt.MemTableSize, + SSTableMaxSz: opt.SSTableMaxSz, + BlockSize: 8 * 1024, + BloomFalsePositive: 0, //0.01, + BaseLevelSize: 10 << 20, + LevelSizeMultiplier: 10, + BaseTableSize: 5 << 20, + TableSizeMultiplier: 2, + NumLevelZeroTables: 15, + MaxLevelNum: 7, + NumCompactors: 1, + DiscardStatsCh: &(db.vlog.lfDiscardStats.flushChan), + }) // 初始化统计信息 - db.stats = newStats(options) + db.stats = newStats(opt) // 启动 sstable 的合并压缩过程 - go db.lsm.StartMerge() - // 启动 vlog gc 过程 - go db.vlog.StartGC() + go db.lsm.StartCompacter() + // 准备vlog gc + c.Add(1) + db.writeCh = make(chan *request) + db.flushChan = make(chan flushTask, 16) + go db.doWrites(c) // 启动 info 统计过程 go db.stats.StartStats() return db } func (db *DB) Close() error { + db.vlog.lfDiscardStats.closer.Close() if err := db.lsm.Close(); err != nil { return err } - if err := db.vlog.Close(); err != nil { + if err := db.vlog.close(); err != nil { return err } if err := db.stats.close(); err != nil { @@ -60,48 +109,307 @@ func (db *DB) Close() error { func (db *DB) Del(key []byte) error { // 写入一个值为nil的entry 作为墓碑消息实现删除 - return db.Set(&codec.Entry{ + return db.Set(&utils.Entry{ Key: key, Value: nil, ExpiresAt: 0, }) } -func (db *DB) Set(data *codec.Entry) error { +func (db *DB) Set(data *utils.Entry) error { + if data == nil || len(data.Key) == 0 { + return utils.ErrEmptyKey + } // 做一些必要性的检查 // 如果value 大于一个阈值 则创建值指针,并将其写入vlog中 - var valuePtr *codec.ValuePtr - if 
utils.ValueSize(data.Value) > db.opt.ValueThreshold { - valuePtr = codec.NewValuePtr(data) - // 先写入vlog不会有事务问题,因为如果lsm写入失败,vlog会在GC阶段清理无效的key - if err := db.vlog.Set(data); err != nil { + var ( + vp *utils.ValuePtr + err error + ) + data.Key = utils.KeyWithTs(data.Key, math.MaxUint32) + // 如果value不应该直接写入LSM 则先写入 vlog文件,这时必须保证vlog具有重放功能 + // 以便于崩溃后恢复数据 + if !db.shouldWriteValueToLSM(data) { + if vp, err = db.vlog.newValuePtr(data); err != nil { return err } - } - // 写入LSM, 如果写值指针不空则替换值entry.value的值 - if valuePtr != nil { - data.Value = codec.ValuePtrCodec(valuePtr) + data.Meta |= utils.BitValuePointer + data.Value = vp.Encode() } return db.lsm.Set(data) } -func (db *DB) Get(key []byte) (*codec.Entry, error) { +func (db *DB) Get(key []byte) (*utils.Entry, error) { + if len(key) == 0 { + return nil, utils.ErrEmptyKey + } var ( - entry *codec.Entry + entry *utils.Entry err error ) - // 检查输入 - // 从内存表中读取数据 - if entry, err = db.lsm.Get(key); err == nil { + key = utils.KeyWithTs(key, math.MaxUint32) + // 从LSM中查询entry,这时不确定entry是不是值指针 + if entry, err = db.lsm.Get(key); err != nil { return entry, err } // 检查从lsm拿到的value是否是value ptr,是则从vlog中拿值 - if entry != nil && codec.IsValuePtr(entry) { - if entry, err = db.vlog.Get(entry); err == nil { - return entry, err + if entry != nil && utils.IsValuePtr(entry) { + var vp utils.ValuePtr + vp.Decode(entry.Value) + result, cb, err := db.vlog.read(&vp) + defer utils.RunCallback(cb) + if err != nil { + return nil, err } + entry.Value = utils.SafeCopy(nil, result) } - return nil, nil + + if isDeletedOrExpired(entry) { + return nil, utils.ErrKeyNotFound + } + return entry, nil +} + +// 判断是否过期 是可删除 +func isDeletedOrExpired(e *utils.Entry) bool { + if e.Value == nil { + return true + } + if e.ExpiresAt == 0 { + return false + } + + return e.ExpiresAt <= uint64(time.Now().Unix()) } + func (db *DB) Info() *Stats { // 读取stats结构,打包数据并返回 return db.stats } + +// RunValueLogGC triggers a value log garbage collection. 
+func (db *DB) RunValueLogGC(discardRatio float64) error { + if discardRatio >= 1.0 || discardRatio <= 0.0 { + return utils.ErrInvalidRequest + } + // Find head on disk + headKey := utils.KeyWithTs(head, math.MaxUint64) + val, err := db.lsm.Get(headKey) + if err != nil { + if err == utils.ErrKeyNotFound { + val = &utils.Entry{ + Key: headKey, + Value: []byte{}, + } + } else { + return errors.Wrap(err, "Retrieving head from on-disk LSM") + } + } + + // 内部key head 一定是value ptr 不需要检查内容 + var head utils.ValuePtr + if len(val.Value) > 0 { + head.Decode(val.Value) + } + + // Pick a log file and run GC + return db.vlog.runGC(discardRatio, &head) +} + +func (db *DB) shouldWriteValueToLSM(e *utils.Entry) bool { + return int64(len(e.Value)) < db.opt.ValueThreshold +} + +func (db *DB) sendToWriteCh(entries []*utils.Entry) (*request, error) { + if atomic.LoadInt32(&db.blockWrites) == 1 { + return nil, utils.ErrBlockedWrites + } + var count, size int64 + for _, e := range entries { + size += int64(e.EstimateSize(int(db.opt.ValueThreshold))) + count++ + } + if count >= db.opt.MaxBatchCount || size >= db.opt.MaxBatchSize { + return nil, utils.ErrTxnTooBig + } + + // TODO 尝试使用对象复用,后面entry对象也应该使用 + req := requestPool.Get().(*request) + req.reset() + req.Entries = entries + req.Wg.Add(1) + req.IncrRef() // for db write + db.writeCh <- req // Handled in doWrites. + return req, nil +} + +// Check(kv.BatchSet(entries)) +func (db *DB) batchSet(entries []*utils.Entry) error { + req, err := db.sendToWriteCh(entries) + if err != nil { + return err + } + + return req.Wait() +} + +func (db *DB) doWrites(lc *utils.Closer) { + defer lc.Done() + pendingCh := make(chan struct{}, 1) + + writeRequests := func(reqs []*request) { + if err := db.writeRequests(reqs); err != nil { + utils.Err(fmt.Errorf("writeRequests: %v", err)) + } + <-pendingCh + } + + // This variable tracks the number of pending writes. 
+ reqLen := new(expvar.Int) + + reqs := make([]*request, 0, 10) + for { + var r *request + select { + case r = <-db.writeCh: + case <-lc.CloseSignal: + goto closedCase + } + + for { + reqs = append(reqs, r) + reqLen.Set(int64(len(reqs))) + + if len(reqs) >= 3*utils.KVWriteChCapacity { + pendingCh <- struct{}{} // blocking. + goto writeCase + } + + select { + // Either push to pending, or continue to pick from writeCh. + case r = <-db.writeCh: + case pendingCh <- struct{}{}: + goto writeCase + case <-lc.CloseSignal: + goto closedCase + } + } + + closedCase: + // All the pending request are drained. + // Don't close the writeCh, because it has be used in several places. + for { + select { + case r = <-db.writeCh: + reqs = append(reqs, r) + default: + pendingCh <- struct{}{} // Push to pending before doing a write. + writeRequests(reqs) + return + } + } + + writeCase: + go writeRequests(reqs) + reqs = make([]*request, 0, 10) + reqLen.Set(0) + } +} + +// writeRequests is called serially by only one goroutine. +func (db *DB) writeRequests(reqs []*request) error { + if len(reqs) == 0 { + return nil + } + + done := func(err error) { + for _, r := range reqs { + r.Err = err + r.Wg.Done() + } + } + err := db.vlog.write(reqs) + if err != nil { + done(err) + return err + } + var count int + for _, b := range reqs { + if len(b.Entries) == 0 { + continue + } + count += len(b.Entries) + if err != nil { + done(err) + return errors.Wrap(err, "writeRequests") + } + if err := db.writeToLSM(b); err != nil { + done(err) + return errors.Wrap(err, "writeRequests") + } + db.Lock() + db.updateHead(b.Ptrs) + db.Unlock() + } + done(nil) + return nil +} +func (db *DB) writeToLSM(b *request) error { + if len(b.Ptrs) != len(b.Entries) { + return errors.Errorf("Ptrs and Entries don't match: %+v", b) + } + + for i, entry := range b.Entries { + if db.shouldWriteValueToLSM(entry) { // Will include deletion / tombstone case. 
+ entry.Meta = entry.Meta &^ utils.BitValuePointer + } else { + entry.Meta = entry.Meta | utils.BitValuePointer + entry.Value = b.Ptrs[i].Encode() + } + db.lsm.Set(entry) + } + return nil +} +func (req *request) IncrRef() { + atomic.AddInt32(&req.ref, 1) +} + +func (req *request) DecrRef() { + nRef := atomic.AddInt32(&req.ref, -1) + if nRef > 0 { + return + } + req.Entries = nil + requestPool.Put(req) +} + +func (req *request) Wait() error { + req.Wg.Wait() + err := req.Err + req.DecrRef() // DecrRef after writing to DB. + return err +} + +// 结构体 +type flushTask struct { + mt *utils.Skiplist + vptr *utils.ValuePtr + dropPrefixes [][]byte +} + +func (db *DB) pushHead(ft flushTask) error { + // Ensure we never push a zero valued head pointer. + if ft.vptr.IsZero() { + return errors.New("Head should not be zero") + } + + fmt.Printf("Storing value log head: %+v\n", ft.vptr) + val := ft.vptr.Encode() + + // Pick the max commit ts, so in case of crash, our read ts would be higher than all the + // commits. + headTs := utils.KeyWithTs(head, uint64(time.Now().Unix()/1e9)) + ft.mt.Add(&utils.Entry{ + Key: headTs, + Value: val, + }) + return nil +} diff --git a/db_test.go b/db_test.go index 878a953..a73b56f 100644 --- a/db_test.go +++ b/db_test.go @@ -1,33 +1,60 @@ +// Copyright 2021 hardcore-os Project Authors +// +// Licensed under the Apache License, Version 2.0 (the "License") +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ package corekv import ( - "github.com/hardcore-os/corekv/iterator" - "github.com/hardcore-os/corekv/utils/codec" + "fmt" "testing" "time" + + "github.com/hardcore-os/corekv/utils" ) func TestAPI(t *testing.T) { - opt := NewDefaultOptions() + clearDir() db := Open(opt) defer func() { _ = db.Close() }() // 写入 - e := codec.NewEntry([]byte("hello"), []byte("coreKV")).WithTTL(1 * time.Second) - if err := db.Set(e); err != nil { - t.Fatal(err) + for i := 0; i < 50; i++ { + key, val := fmt.Sprintf("key%d", i), fmt.Sprintf("val%d", i) + e := utils.NewEntry([]byte(key), []byte(val)).WithTTL(1000 * time.Second) + if err := db.Set(e); err != nil { + t.Fatal(err) + } + // 查询 + if entry, err := db.Get([]byte(key)); err != nil { + t.Fatal(err) + } else { + t.Logf("db.Get key=%s, value=%s, expiresAt=%d", entry.Key, entry.Value, entry.ExpiresAt) + } } - // 查询 - if entry, err := db.Get([]byte("hello")); err != nil { - t.Fatal(err) - } else { - t.Logf("db.Get key=%s, value=%s, expiresAt=%d", entry.Key, entry.Value, entry.ExpiresAt) + + for i := 0; i < 40; i++ { + key, _ := fmt.Sprintf("key%d", i), fmt.Sprintf("val%d", i) + if err := db.Del([]byte(key)); err != nil { + t.Fatal(err) + } } + // 迭代器 - iter := db.NewIterator(&iterator.Options{ + iter := db.NewIterator(&utils.Options{ Prefix: []byte("hello"), IsAsc: false, }) defer func() { _ = iter.Close() }() + defer func() { _ = iter.Close() }() for iter.Rewind(); iter.Valid(); iter.Next() { it := iter.Item() t.Logf("db.NewIterator key=%s, value=%s, expiresAt=%d", it.Entry().Key, it.Entry().Value, it.Entry().ExpiresAt) @@ -37,4 +64,19 @@ func TestAPI(t *testing.T) { if err := db.Del([]byte("hello")); err != nil { t.Fatal(err) } + + for i := 0; i < 10; i++ { + key, val := fmt.Sprintf("key%d", i), fmt.Sprintf("val%d", i) + e := utils.NewEntry([]byte(key), []byte(val)).WithTTL(1000 * time.Second) + if err := db.Set(e); err != nil { + t.Fatal(err) + } + // 查询 + if entry, err := db.Get([]byte(key)); err != nil { + t.Fatal(err) + } else { 
+ t.Logf("db.Get key=%s, value=%s, expiresAt=%d", entry.Key, entry.Value, entry.ExpiresAt) + } + } + } diff --git a/debug.sh b/debug.sh new file mode 100755 index 0000000..b6af724 --- /dev/null +++ b/debug.sh @@ -0,0 +1,18 @@ +#!/bin/bash +### + # Copyright 2021 logicrec Project Authors + # + # Licensed under the Apache License, Version 2.0 (the "License") + # you may not use this file except in compliance with the License. + # You may obtain a copy of the License at + # + # http://www.apache.org/licenses/LICENSE-2.0 + # + # Unless required by applicable law or agreed to in writing, software + # distributed under the License is distributed on an "AS IS" BASIS, + # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + # See the License for the specific language governing permissions and + # limitations under the License. +### + +dlv test -test.run=$1 \ No newline at end of file diff --git a/file/file.go b/file/file.go index f86eae2..588e014 100644 --- a/file/file.go +++ b/file/file.go @@ -1,7 +1,39 @@ +// Copyright 2021 hardcore-os Project Authors +// +// Licensed under the Apache License, Version 2.0 (the "License") +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ package file +import "io" + +// Options +type Options struct { + FID uint64 + FileName string + Dir string + Path string + Flag int + MaxSz int +} + type CoreFile interface { - Write(b []byte) (n int, err error) - Read(b []byte) (n int, err error) Close() error + Truncature(n int64) error + ReName(name string) error + NewReader(offset int) io.Reader + Bytes(off, sz int) ([]byte, error) + AllocateSlice(sz, offset int) ([]byte, int, error) + Sync() error + Delete() error + Slice(offset int) []byte } diff --git a/file/manifest.go b/file/manifest.go index f64a70c..5e3e4ae 100644 --- a/file/manifest.go +++ b/file/manifest.go @@ -1,56 +1,392 @@ +// Copyright 2021 hardcore-os Project Authors +// +// Licensed under the Apache License, Version 2.0 (the "License") +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ package file import ( "bufio" - "encoding/csv" + "bytes" + "encoding/binary" + "fmt" + "hash/crc32" "io" + "os" + "path/filepath" + "sync" + "github.com/hardcore-os/corekv/pb" "github.com/hardcore-os/corekv/utils" + "github.com/pkg/errors" ) +// ManifestFile 维护sst文件元信息的文件 +// manifest 比较特殊,不能使用mmap,需要保证实时的写入 +type ManifestFile struct { + opt *Options + f *os.File + lock sync.Mutex + deletionsRewriteThreshold int + manifest *Manifest +} + +// Manifest corekv 元数据状态维护 type Manifest struct { - f CoreFile - tables [][]string // l0-l7 的sst file name + Levels []levelManifest + Tables map[uint64]TableManifest + Creations int + Deletions int +} + +// TableManifest 包含sst的基本信息 +type TableManifest struct { + Level uint8 + Checksum []byte // 方便今后扩展 +} +type levelManifest struct { + Tables map[uint64]struct{} // Set of table id's +} + +//TableMeta sst 的一些元信息 +type TableMeta struct { + ID uint64 + Checksum []byte +} + +// OpenManifestFile 打开manifest文件 +func OpenManifestFile(opt *Options) (*ManifestFile, error) { + path := filepath.Join(opt.Dir, utils.ManifestFilename) + mf := &ManifestFile{lock: sync.Mutex{}, opt: opt} + f, err := os.OpenFile(path, os.O_RDWR, 0) + // 如果打开失败 则尝试创建一个新的 manifest file + if err != nil { + if !os.IsNotExist(err) { + return mf, err + } + m := createManifest() + fp, netCreations, err := helpRewrite(opt.Dir, m) + utils.CondPanic(netCreations == 0, errors.Wrap(err, utils.ErrReWriteFailure.Error())) + if err != nil { + return mf, err + } + mf.f = fp + f = fp + mf.manifest = m + return mf, nil + } + + // 如果打开 则对manifest文件重放 + manifest, truncOffset, err := ReplayManifestFile(f) + if err != nil { + _ = f.Close() + return mf, err + } + // Truncate file so we don't have a half-written entry at the end. 
+ if err := f.Truncate(truncOffset); err != nil { + _ = f.Close() + return mf, err + } + if _, err = f.Seek(0, io.SeekEnd); err != nil { + _ = f.Close() + return mf, err + } + mf.f = f + mf.manifest = manifest + return mf, nil +} + +// ReplayManifestFile 对已经存在的manifest文件重新应用所有状态变更 +func ReplayManifestFile(fp *os.File) (ret *Manifest, truncOffset int64, err error) { + r := &bufReader{reader: bufio.NewReader(fp)} + var magicBuf [8]byte + if _, err := io.ReadFull(r, magicBuf[:]); err != nil { + return &Manifest{}, 0, utils.ErrBadMagic + } + if !bytes.Equal(magicBuf[0:4], utils.MagicText[:]) { + return &Manifest{}, 0, utils.ErrBadMagic + } + version := binary.BigEndian.Uint32(magicBuf[4:8]) + if version != uint32(utils.MagicVersion) { + return &Manifest{}, 0, + fmt.Errorf("manifest has unsupported version: %d (we support %d)", version, utils.MagicVersion) + } + + build := createManifest() + var offset int64 + for { + offset = r.count + var lenCrcBuf [8]byte + _, err := io.ReadFull(r, lenCrcBuf[:]) + if err != nil { + if err == io.EOF || err == io.ErrUnexpectedEOF { + break + } + return &Manifest{}, 0, err + } + length := binary.BigEndian.Uint32(lenCrcBuf[0:4]) + var buf = make([]byte, length) + if _, err := io.ReadFull(r, buf); err != nil { + if err == io.EOF || err == io.ErrUnexpectedEOF { + break + } + return &Manifest{}, 0, err + } + if crc32.Checksum(buf, utils.CastagnoliCrcTable) != binary.BigEndian.Uint32(lenCrcBuf[4:8]) { + return &Manifest{}, 0, utils.ErrBadChecksum + } + + var changeSet pb.ManifestChangeSet + if err := changeSet.Unmarshal(buf); err != nil { + return &Manifest{}, 0, err + } + + if err := applyChangeSet(build, &changeSet); err != nil { + return &Manifest{}, 0, err + } + } + + return build, offset, err +} + +// This is not a "recoverable" error -- opening the KV store fails because the MANIFEST file is +// just plain broken. 
+func applyChangeSet(build *Manifest, changeSet *pb.ManifestChangeSet) error { + for _, change := range changeSet.Changes { + if err := applyManifestChange(build, change); err != nil { + return err + } + } + return nil +} + +func applyManifestChange(build *Manifest, tc *pb.ManifestChange) error { + switch tc.Op { + case pb.ManifestChange_CREATE: + if _, ok := build.Tables[tc.Id]; ok { + return fmt.Errorf("MANIFEST invalid, table %d exists", tc.Id) + } + build.Tables[tc.Id] = TableManifest{ + Level: uint8(tc.Level), + Checksum: append([]byte{}, tc.Checksum...), + } + for len(build.Levels) <= int(tc.Level) { + build.Levels = append(build.Levels, levelManifest{make(map[uint64]struct{})}) + } + build.Levels[tc.Level].Tables[tc.Id] = struct{}{} + build.Creations++ + case pb.ManifestChange_DELETE: + tm, ok := build.Tables[tc.Id] + if !ok { + return fmt.Errorf("MANIFEST removes non-existing table %d", tc.Id) + } + delete(build.Levels[tm.Level].Tables, tc.Id) + delete(build.Tables, tc.Id) + build.Deletions++ + default: + return fmt.Errorf("MANIFEST file has invalid manifestChange op") + } + return nil +} + +func createManifest() *Manifest { + levels := make([]levelManifest, 0) + return &Manifest{ + Levels: levels, + Tables: make(map[uint64]TableManifest), + } +} + +type bufReader struct { + reader *bufio.Reader + count int64 +} + +func (r *bufReader) Read(p []byte) (n int, err error) { + n, err = r.reader.Read(p) + r.count += int64(n) + return +} + +// asChanges returns a sequence of changes that could be used to recreate the Manifest in its +// present state. 
+func (m *Manifest) asChanges() []*pb.ManifestChange { + changes := make([]*pb.ManifestChange, 0, len(m.Tables)) + for id, tm := range m.Tables { + changes = append(changes, newCreateChange(id, int(tm.Level), tm.Checksum)) + } + return changes +} +func newCreateChange(id uint64, level int, checksum []byte) *pb.ManifestChange { + return &pb.ManifestChange{ + Id: id, + Op: pb.ManifestChange_CREATE, + Level: uint32(level), + Checksum: checksum, + } +} + +// Must be called while appendLock is held. +func (mf *ManifestFile) rewrite() error { + // In Windows the files should be closed before doing a Rename. + if err := mf.f.Close(); err != nil { + return err + } + fp, nextCreations, err := helpRewrite(mf.opt.Dir, mf.manifest) + if err != nil { + return err + } + mf.manifest.Creations = nextCreations + mf.manifest.Deletions = 0 + mf.f = fp + return nil +} + +func helpRewrite(dir string, m *Manifest) (*os.File, int, error) { + rewritePath := filepath.Join(dir, utils.ManifestRewriteFilename) + // We explicitly sync. + fp, err := os.OpenFile(rewritePath, utils.DefaultFileFlag, utils.DefaultFileMode) + if err != nil { + return nil, 0, err + } + + buf := make([]byte, 8) + copy(buf[0:4], utils.MagicText[:]) + binary.BigEndian.PutUint32(buf[4:8], uint32(utils.MagicVersion)) + + netCreations := len(m.Tables) + changes := m.asChanges() + set := pb.ManifestChangeSet{Changes: changes} + + changeBuf, err := set.Marshal() + if err != nil { + fp.Close() + return nil, 0, err + } + var lenCrcBuf [8]byte + binary.BigEndian.PutUint32(lenCrcBuf[0:4], uint32(len(changeBuf))) + binary.BigEndian.PutUint32(lenCrcBuf[4:8], crc32.Checksum(changeBuf, utils.CastagnoliCrcTable)) + buf = append(buf, lenCrcBuf[:]...) + buf = append(buf, changeBuf...) + if _, err := fp.Write(buf); err != nil { + fp.Close() + return nil, 0, err + } + if err := fp.Sync(); err != nil { + fp.Close() + return nil, 0, err + } + + // In Windows the files should be closed before doing a Rename. 
+ if err = fp.Close(); err != nil { + return nil, 0, err + } + manifestPath := filepath.Join(dir, utils.ManifestFilename) + if err := os.Rename(rewritePath, manifestPath); err != nil { + return nil, 0, err + } + fp, err = os.OpenFile(manifestPath, utils.DefaultFileFlag, utils.DefaultFileMode) + if err != nil { + return nil, 0, err + } + if _, err := fp.Seek(0, io.SeekEnd); err != nil { + fp.Close() + return nil, 0, err + } + if err := utils.SyncDir(dir); err != nil { + fp.Close() + return nil, 0, err + } + + return fp, netCreations, nil } -// WalFile -func (mf *Manifest) Close() error { +// Close 关闭文件 +func (mf *ManifestFile) Close() error { if err := mf.f.Close(); err != nil { return err } return nil } -// Tables 获取table的list -func (mf *Manifest) Tables() [][]string { - return mf.tables +// AddChanges 对外暴露的写比那更丰富 +func (mf *ManifestFile) AddChanges(changesParam []*pb.ManifestChange) error { + return mf.addChanges(changesParam) } +func (mf *ManifestFile) addChanges(changesParam []*pb.ManifestChange) error { + changes := pb.ManifestChangeSet{Changes: changesParam} + buf, err := changes.Marshal() + if err != nil { + return err + } -// OpenManifest -func OpenManifest(opt *Options) *Manifest { - mf := &Manifest{ - f: OpenMockFile(opt), - tables: make([][]string, utils.MaxLevelNum), + // TODO 锁粒度可以优化 + mf.lock.Lock() + defer mf.lock.Unlock() + if err := applyChangeSet(mf.manifest, &changes); err != nil { + return err } - reader := csv.NewReader(bufio.NewReader(mf.f)) - level := 0 - for { - if level > utils.MaxLevelNum { - break + // Rewrite manifest if it'd shrink by 1/10 and it's big enough to care + if mf.manifest.Deletions > utils.ManifestDeletionsRewriteThreshold && + mf.manifest.Deletions > utils.ManifestDeletionsRatio*(mf.manifest.Creations-mf.manifest.Deletions) { + if err := mf.rewrite(); err != nil { + return err } - line, err := reader.Read() - if err == io.EOF { - break - } else if err != nil { - panic(err) + } else { + var lenCrcBuf [8]byte + 
binary.BigEndian.PutUint32(lenCrcBuf[0:4], uint32(len(buf))) + binary.BigEndian.PutUint32(lenCrcBuf[4:8], crc32.Checksum(buf, utils.CastagnoliCrcTable)) + buf = append(lenCrcBuf[:], buf...) + if _, err := mf.f.Write(buf); err != nil { + return err } - if len(mf.tables[level]) == 0 { - mf.tables[level] = make([]string, len(line)) + } + err = mf.f.Sync() + return err +} + +// AddTableMeta 存储level表到manifest的level中 +func (mf *ManifestFile) AddTableMeta(levelNum int, t *TableMeta) (err error) { + mf.addChanges([]*pb.ManifestChange{ + newCreateChange(t.ID, levelNum, t.Checksum), + }) + return err +} + +// RevertToManifest checks that all necessary table files exist and removes all table files not +// referenced by the manifest. idMap is a set of table file id's that were read from the directory +// listing. +func (mf *ManifestFile) RevertToManifest(idMap map[uint64]struct{}) error { + // 1. Check all files in manifest exist. + for id := range mf.manifest.Tables { + if _, ok := idMap[id]; !ok { + return fmt.Errorf("file does not exist for table %d", id) } - for j, tableName := range line { - mf.tables[level][j] = tableName + } + + // 2. Delete files that shouldn't exist. 
+ for id := range idMap { + if _, ok := mf.manifest.Tables[id]; !ok { + utils.Err(fmt.Errorf("Table file %d not referenced in MANIFEST", id)) + filename := utils.FileNameSSTable(mf.opt.Dir, id) + if err := os.Remove(filename); err != nil { + return errors.Wrapf(err, "While removing table %d", id) + } } - level++ } - return mf + return nil +} + +// GetManifest manifest +func (mf *ManifestFile) GetManifest() *Manifest { + return mf.manifest } diff --git a/file/mmap_darwin.go b/file/mmap_darwin.go new file mode 100644 index 0000000..4ff3a10 --- /dev/null +++ b/file/mmap_darwin.go @@ -0,0 +1,254 @@ +//go:build darwin +// +build darwin + +// Copyright 2021 hardcore-os Project Authors +// +// Licensed under the Apache License, Version 2.0 (the "License") +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package file + +import ( + "encoding/binary" + "fmt" + "io" + "os" + "path/filepath" + + "github.com/hardcore-os/corekv/utils/mmap" + "github.com/pkg/errors" +) + +// MmapFile represents an mmapd file and includes both the buffer to the data and the file descriptor. +type MmapFile struct { + Data []byte + Fd *os.File +} + +// OpenMmapFileUsing os +func OpenMmapFileUsing(fd *os.File, sz int, writable bool) (*MmapFile, error) { + filename := fd.Name() + fi, err := fd.Stat() + if err != nil { + return nil, errors.Wrapf(err, "cannot stat file: %s", filename) + } + + var rerr error + fileSize := fi.Size() + if sz > 0 && fileSize == 0 { + // If file is empty, truncate it to sz. 
+ if err := fd.Truncate(int64(sz)); err != nil { + return nil, errors.Wrapf(err, "error while truncation") + } + fileSize = int64(sz) + } + + // fmt.Printf("Mmaping file: %s with writable: %v filesize: %d\n", fd.Name(), writable, fileSize) + buf, err := mmap.Mmap(fd, writable, fileSize) // Mmap up to file size. + if err != nil { + return nil, errors.Wrapf(err, "while mmapping %s with size: %d", fd.Name(), fileSize) + } + + if fileSize == 0 { + dir, _ := filepath.Split(filename) + go SyncDir(dir) + } + return &MmapFile{ + Data: buf, + Fd: fd, + }, rerr +} + +// OpenMmapFile opens an existing file or creates a new file. If the file is +// created, it would truncate the file to maxSz. In both cases, it would mmap +// the file to maxSz and returned it. In case the file is created, z.NewFile is +// returned. +func OpenMmapFile(filename string, flag int, maxSz int) (*MmapFile, error) { + // fmt.Printf("opening file %s with flag: %v\n", filename, flag) + fd, err := os.OpenFile(filename, flag, 0666) + if err != nil { + return nil, errors.Wrapf(err, "unable to open: %s", filename) + } + writable := true + if flag == os.O_RDONLY { + writable = false + } + return OpenMmapFileUsing(fd, maxSz, writable) +} + +type mmapReader struct { + Data []byte + offset int +} + +func (mr *mmapReader) Read(buf []byte) (int, error) { + if mr.offset > len(mr.Data) { + return 0, io.EOF + } + n := copy(buf, mr.Data[mr.offset:]) + mr.offset += n + if n < len(buf) { + return n, io.EOF + } + return n, nil +} + +func (m *MmapFile) NewReader(offset int) io.Reader { + return &mmapReader{ + Data: m.Data, + offset: offset, + } +} + +// Bytes returns data starting from offset off of size sz. If there's not enough data, it would +// return nil slice and io.EOF. +func (m *MmapFile) Bytes(off, sz int) ([]byte, error) { + if len(m.Data[off:]) < sz { + return nil, io.EOF + } + return m.Data[off : off+sz], nil +} + +// Slice returns the slice at the given offset. 
+func (m *MmapFile) Slice(offset int) []byte { + sz := binary.BigEndian.Uint32(m.Data[offset:]) + start := offset + 4 + next := start + int(sz) + if next > len(m.Data) { + return []byte{} + } + res := m.Data[start:next] + return res +} + +// AllocateSlice allocates a slice of the given size at the given offset. +func (m *MmapFile) AllocateSlice(sz, offset int) ([]byte, int, error) { + start := offset + 4 + + // If the file is too small, double its size or increase it by 1GB, whichever is smaller. + if start+sz > len(m.Data) { + const oneGB = 1 << 30 + growBy := len(m.Data) + if growBy > oneGB { + growBy = oneGB + } + if growBy < sz+4 { + growBy = sz + 4 + } + if err := m.Truncature(int64(len(m.Data) + growBy)); err != nil { + return nil, 0, err + } + } + + binary.BigEndian.PutUint32(m.Data[offset:], uint32(sz)) + return m.Data[start : start+sz], start + sz, nil +} + +const oneGB = 1 << 30 + +// AppendBuffer 向内存中追加一个buffer,如果空间不足则重新映射,扩大空间 +func (m *MmapFile) AppendBuffer(offset uint32, buf []byte) error { + size := len(m.Data) + needSize := len(buf) + end := int(offset) + needSize + if end > size { + growBy := size + if growBy > oneGB { + growBy = oneGB + } + if growBy < needSize { + growBy = needSize + } + if err := m.Truncature(int64(end)); err != nil { + return err + } + } + dLen := copy(m.Data[offset:end], buf) + if dLen != needSize { + return errors.Errorf("dLen != needSize AppendBuffer failed") + } + return nil +} + +func (m *MmapFile) Sync() error { + if m == nil { + return nil + } + return mmap.Msync(m.Data) +} + +func (m *MmapFile) Delete() error { + if m.Fd == nil { + return nil + } + + if err := mmap.Munmap(m.Data); err != nil { + return fmt.Errorf("while munmap file: %s, error: %v\n", m.Fd.Name(), err) + } + m.Data = nil + if err := m.Fd.Truncate(0); err != nil { + return fmt.Errorf("while truncate file: %s, error: %v\n", m.Fd.Name(), err) + } + if err := m.Fd.Close(); err != nil { + return fmt.Errorf("while close file: %s, error: %v\n", m.Fd.Name(), 
err) + } + return os.Remove(m.Fd.Name()) +} + +// Close would close the file. It would also truncate the file if maxSz >= 0. +func (m *MmapFile) Close() error { + if m.Fd == nil { + return nil + } + if err := m.Sync(); err != nil { + return fmt.Errorf("while sync file: %s, error: %v\n", m.Fd.Name(), err) + } + if err := mmap.Munmap(m.Data); err != nil { + return fmt.Errorf("while munmap file: %s, error: %v\n", m.Fd.Name(), err) + } + return m.Fd.Close() +} + +func SyncDir(dir string) error { + df, err := os.Open(dir) + if err != nil { + return errors.Wrapf(err, "while opening %s", dir) + } + if err := df.Sync(); err != nil { + return errors.Wrapf(err, "while syncing %s", dir) + } + if err := df.Close(); err != nil { + return errors.Wrapf(err, "while closing %s", dir) + } + return nil +} + +// Truncature 兼容接口 +func (m *MmapFile) Truncature(maxSz int64) error { + if err := m.Sync(); err != nil { + return fmt.Errorf("while sync file: %s, error: %v\n", m.Fd.Name(), err) + } + if err := mmap.Munmap(m.Data); err != nil { + return fmt.Errorf("while munmap file: %s, error: %v\n", m.Fd.Name(), err) + } + if err := m.Fd.Truncate(maxSz); err != nil { + return fmt.Errorf("while truncate file: %s, error: %v\n", m.Fd.Name(), err) + } + var err error + m.Data, err = mmap.Mmap(m.Fd, true, maxSz) // Mmap up to max size. + return err +} + +// ReName 兼容接口 +func (m *MmapFile) ReName(name string) error { + return nil +} diff --git a/file/mmap_linux.go b/file/mmap_linux.go new file mode 100644 index 0000000..fb2f1b3 --- /dev/null +++ b/file/mmap_linux.go @@ -0,0 +1,255 @@ +// +build linux + +// Copyright 2021 hardcore-os Project Authors +// +// Licensed under the Apache License, Version 2.0 (the "License") +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package file + +import ( + "encoding/binary" + "fmt" + "io" + "os" + "path/filepath" + + "github.com/hardcore-os/corekv/utils/mmap" + "github.com/pkg/errors" +) + +// MmapFile represents an mmapd file and includes both the buffer to the data and the file descriptor. +type MmapFile struct { + Data []byte + Fd *os.File +} + +// OpenMmapFileUsing os +func OpenMmapFileUsing(fd *os.File, sz int, writable bool) (*MmapFile, error) { + filename := fd.Name() + fi, err := fd.Stat() + if err != nil { + return nil, errors.Wrapf(err, "cannot stat file: %s", filename) + } + + var rerr error + fileSize := fi.Size() + if sz > 0 && fileSize == 0 { + // If file is empty, truncate it to sz. + if err := fd.Truncate(int64(sz)); err != nil { + return nil, errors.Wrapf(err, "error while truncation") + } + fileSize = int64(sz) + } + + // fmt.Printf("Mmaping file: %s with writable: %v filesize: %d\n", fd.Name(), writable, fileSize) + buf, err := mmap.Mmap(fd, writable, fileSize) // Mmap up to file size. + if err != nil { + return nil, errors.Wrapf(err, "while mmapping %s with size: %d", fd.Name(), fileSize) + } + + if fileSize == 0 { + dir, _ := filepath.Split(filename) + go SyncDir(dir) + } + return &MmapFile{ + Data: buf, + Fd: fd, + }, rerr +} + +// OpenMmapFile opens an existing file or creates a new file. If the file is +// created, it would truncate the file to maxSz. In both cases, it would mmap +// the file to maxSz and returned it. In case the file is created, z.NewFile is +// returned. 
+func OpenMmapFile(filename string, flag int, maxSz int) (*MmapFile, error) { + // fmt.Printf("opening file %s with flag: %v\n", filename, flag) + fd, err := os.OpenFile(filename, flag, 0666) + if err != nil { + return nil, errors.Wrapf(err, "unable to open: %s", filename) + } + writable := true + if flag == os.O_RDONLY { + writable = false + } + // 如果 sst文件层被打开过,则使用其文件原来的大小 + if fileInfo, err := fd.Stat(); err == nil && fileInfo != nil && fileInfo.Size() > 0 { + maxSz = int(fileInfo.Size()) + } + return OpenMmapFileUsing(fd, maxSz, writable) +} + +type mmapReader struct { + Data []byte + offset int +} + +func (mr *mmapReader) Read(buf []byte) (int, error) { + if mr.offset > len(mr.Data) { + return 0, io.EOF + } + n := copy(buf, mr.Data[mr.offset:]) + mr.offset += n + if n < len(buf) { + return n, io.EOF + } + return n, nil +} + +func (m *MmapFile) NewReader(offset int) io.Reader { + return &mmapReader{ + Data: m.Data, + offset: offset, + } +} + +// Bytes returns data starting from offset off of size sz. If there's not enough data, it would +// return nil slice and io.EOF. +func (m *MmapFile) Bytes(off, sz int) ([]byte, error) { + if len(m.Data[off:]) < sz { + return nil, io.EOF + } + return m.Data[off : off+sz], nil +} + +// Slice returns the slice at the given offset. +func (m *MmapFile) Slice(offset int) []byte { + sz := binary.BigEndian.Uint32(m.Data[offset:]) + start := offset + 4 + next := start + int(sz) + if next > len(m.Data) { + return []byte{} + } + res := m.Data[start:next] + return res +} + +// AllocateSlice allocates a slice of the given size at the given offset. +func (m *MmapFile) AllocateSlice(sz, offset int) ([]byte, int, error) { + start := offset + 4 + + // If the file is too small, double its size or increase it by 1GB, whichever is smaller. 
+ if start+sz > len(m.Data) { + const oneGB = 1 << 30 + growBy := len(m.Data) + if growBy > oneGB { + growBy = oneGB + } + if growBy < sz+4 { + growBy = sz + 4 + } + if err := m.Truncature(int64(len(m.Data) + growBy)); err != nil { + return nil, 0, err + } + } + + binary.BigEndian.PutUint32(m.Data[offset:], uint32(sz)) + return m.Data[start : start+sz], start + sz, nil +} + +const oneGB = 1 << 30 + +// AppendBuffer 向内存中追加一个buffer,如果空间不足则重新映射,扩大空间 +func (m *MmapFile) AppendBuffer(offset uint32, buf []byte) error { + size := len(m.Data) + needSize := len(buf) + end := int(offset) + needSize + if end > size { + growBy := size + if growBy > oneGB { + growBy = oneGB + } + if growBy < needSize { + growBy = needSize + } + if err := m.Truncature(int64(end)); err != nil { + return err + } + } + dLen := copy(m.Data[offset:end], buf) + if dLen != needSize { + return errors.Errorf("dLen != needSize AppendBuffer failed") + } + return nil +} + +func (m *MmapFile) Sync() error { + if m == nil { + return nil + } + return mmap.Msync(m.Data) +} + +func (m *MmapFile) Delete() error { + if m.Fd == nil { + return nil + } + + if err := mmap.Munmap(m.Data); err != nil { + return fmt.Errorf("while munmap file: %s, error: %v\n", m.Fd.Name(), err) + } + m.Data = nil + if err := m.Fd.Truncate(0); err != nil { + return fmt.Errorf("while truncate file: %s, error: %v\n", m.Fd.Name(), err) + } + if err := m.Fd.Close(); err != nil { + return fmt.Errorf("while close file: %s, error: %v\n", m.Fd.Name(), err) + } + return os.Remove(m.Fd.Name()) +} + +// Close would close the file. It would also truncate the file if maxSz >= 0. 
+func (m *MmapFile) Close() error { + if m.Fd == nil { + return nil + } + if err := m.Sync(); err != nil { + return fmt.Errorf("while sync file: %s, error: %v\n", m.Fd.Name(), err) + } + if err := mmap.Munmap(m.Data); err != nil { + return fmt.Errorf("while munmap file: %s, error: %v\n", m.Fd.Name(), err) + } + return m.Fd.Close() +} + +func SyncDir(dir string) error { + df, err := os.Open(dir) + if err != nil { + return errors.Wrapf(err, "while opening %s", dir) + } + if err := df.Sync(); err != nil { + return errors.Wrapf(err, "while syncing %s", dir) + } + if err := df.Close(); err != nil { + return errors.Wrapf(err, "while closing %s", dir) + } + return nil +} + +// Truncature 兼容接口 +func (m *MmapFile) Truncature(maxSz int64) error { + if err := m.Sync(); err != nil { + return fmt.Errorf("while sync file: %s, error: %v\n", m.Fd.Name(), err) + } + if err := m.Fd.Truncate(maxSz); err != nil { + return fmt.Errorf("while truncate file: %s, error: %v\n", m.Fd.Name(), err) + } + + var err error + m.Data, err = mmap.Mremap(m.Data, int(maxSz)) // Mmap up to max size. 
+ return err +} + +// ReName 兼容接口 +func (m *MmapFile) ReName(name string) error { + return nil +} diff --git a/file/mock.go b/file/mock.go deleted file mode 100644 index 10509aa..0000000 --- a/file/mock.go +++ /dev/null @@ -1,43 +0,0 @@ -package file - -import ( - "fmt" - "os" - - "github.com/hardcore-os/corekv/utils" -) - -// MockFile -type MockFile struct { - f *os.File -} - -// Close -func (lf *MockFile) Close() error { - if err := lf.f.Close(); err != nil { - return err - } - return nil -} - -func (lf *MockFile) Write(bytes []byte) (int, error) { - return lf.f.Write(bytes) -} -func (lf *MockFile) Read(bytes []byte) (int, error) { - return lf.f.Read(bytes) -} - -// Options -type Options struct { - Name string - Dir string -} - -// OpenMockFile mock 文件 -func OpenMockFile(opt *Options) *MockFile { - var err error - lf := &MockFile{} - lf.f, err = os.Open(fmt.Sprintf("%s/%s", opt.Dir, opt.Name)) - utils.Panic(err) - return lf -} diff --git a/file/sstable.go b/file/sstable.go deleted file mode 100644 index 2a5f5aa..0000000 --- a/file/sstable.go +++ /dev/null @@ -1,41 +0,0 @@ -package file - -import ( - "encoding/json" - "io/ioutil" - - "github.com/hardcore-os/corekv/utils" -) - -// SSTable 文件的内存封装 -type SSTable struct { - f *MockFile - indexs []byte - fid string -} - -// OpenSStable 打开一个 sst文件 -func OpenSStable(opt *Options) *SSTable { - return &SSTable{f: OpenMockFile(opt), fid: utils.FID(opt.Name)} -} - -// Indexs 获取sst文件索引 -func (ss *SSTable) Indexs() []byte { - if len(ss.indexs) == 0 { - bv, _ := ioutil.ReadAll(ss.f) - m := make(map[string]interface{}, 0) - json.Unmarshal(bv, &m) - if idx, ok := m["idx"]; !ok { - panic("sst idx is nil") - } else { - dataStr, _ := idx.(string) // hello,0 - ss.indexs = []byte(dataStr) - } - } - return ss.indexs -} - -// FID 获取fid -func (ss *SSTable) FID() string { - return ss.fid -} diff --git a/file/sstable_darwin.go b/file/sstable_darwin.go new file mode 100644 index 0000000..2540e4e --- /dev/null +++ b/file/sstable_darwin.go @@ 
-0,0 +1,196 @@ +// +build darwin + +// Copyright 2021 hardcore-os Project Authors +// +// Licensed under the Apache License, Version 2.0 (the "License") +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package file + +import ( + "io" + "os" + "sync" + "syscall" + "time" + + "github.com/golang/protobuf/proto" + "github.com/hardcore-os/corekv/pb" + "github.com/hardcore-os/corekv/utils" + "github.com/pkg/errors" +) + +// SSTable 文件的内存封装 +type SSTable struct { + lock *sync.RWMutex + f *MmapFile + maxKey []byte + minKey []byte + idxTables *pb.TableIndex + hasBloomFilter bool + idxLen int + idxStart int + fid uint64 + createdAt time.Time +} + +// OpenSStable 打开一个 sst文件 +func OpenSStable(opt *Options) *SSTable { + omf, err := OpenMmapFile(opt.FileName, os.O_CREATE|os.O_RDWR, opt.MaxSz) + utils.Err(err) + return &SSTable{f: omf, fid: opt.FID, lock: &sync.RWMutex{}} +} + +// Init 初始化 +func (ss *SSTable) Init() error { + var ko *pb.BlockOffset + var err error + if ko, err = ss.initTable(); err != nil { + return err + } + // 从文件中获取创建时间 + stat, _ := ss.f.Fd.Stat() + statType := stat.Sys().(*syscall.Stat_t) + ss.createdAt = time.Unix(statType.Atimespec.Sec, statType.Atimespec.Nsec) + // init min key + keyBytes := ko.GetKey() + minKey := make([]byte, len(keyBytes)) + copy(minKey, keyBytes) + ss.minKey = minKey + ss.maxKey = minKey + return nil +} + +// SetMaxKey max 需要使用table的迭代器,来获取最后一个block的最后一个key +func (ss *SSTable) SetMaxKey(maxKey []byte) { + ss.maxKey = maxKey +} +func (ss *SSTable) initTable() (bo *pb.BlockOffset, err error) { + 
readPos := len(ss.f.Data) + + // Read checksum len from the last 4 bytes. + readPos -= 4 + buf := ss.readCheckError(readPos, 4) + checksumLen := int(utils.BytesToU32(buf)) + if checksumLen < 0 { + return nil, errors.New("checksum length less than zero. Data corrupted") + } + + // Read checksum. + readPos -= checksumLen + expectedChk := ss.readCheckError(readPos, checksumLen) + + // Read index size from the footer. + readPos -= 4 + buf = ss.readCheckError(readPos, 4) + ss.idxLen = int(utils.BytesToU32(buf)) + + // Read index. + readPos -= ss.idxLen + ss.idxStart = readPos + data := ss.readCheckError(readPos, ss.idxLen) + if err := utils.VerifyChecksum(data, expectedChk); err != nil { + return nil, errors.Wrapf(err, "failed to verify checksum for table: %s", ss.f.Fd.Name()) + } + indexTable := &pb.TableIndex{} + if err := proto.Unmarshal(data, indexTable); err != nil { + return nil, err + } + ss.idxTables = indexTable + + ss.hasBloomFilter = len(indexTable.BloomFilter) > 0 + if len(indexTable.GetOffsets()) > 0 { + return indexTable.GetOffsets()[0], nil + } + return nil, errors.New("read index fail, offset is nil") +} + +// Close 关闭 +func (ss *SSTable) Close() error { + return ss.f.Close() +} + +// Indexs _ +func (ss *SSTable) Indexs() *pb.TableIndex { + return ss.idxTables +} + +// MaxKey 当前最大的key +func (ss *SSTable) MaxKey() []byte { + return ss.maxKey +} + +// MinKey 当前最小的key +func (ss *SSTable) MinKey() []byte { + return ss.minKey +} + +// FID 获取fid +func (ss *SSTable) FID() uint64 { + return ss.fid +} + +// HasBloomFilter _ +func (ss *SSTable) HasBloomFilter() bool { + return ss.hasBloomFilter +} + +func (ss *SSTable) read(off, sz int) ([]byte, error) { + if len(ss.f.Data) > 0 { + if len(ss.f.Data[off:]) < sz { + return nil, io.EOF + } + return ss.f.Data[off : off+sz], nil + } + + res := make([]byte, sz) + _, err := ss.f.Fd.ReadAt(res, int64(off)) + return res, err +} +func (ss *SSTable) readCheckError(off, sz int) []byte { + buf, err := ss.read(off, sz) + 
utils.Panic(err) + return buf +} + +// Bytes returns data starting from offset off of size sz. If there's not enough data, it would +// return nil slice and io.EOF. +func (ss *SSTable) Bytes(off, sz int) ([]byte, error) { + return ss.f.Bytes(off, sz) +} + +// Size 返回底层文件的尺寸 +func (ss *SSTable) Size() int64 { + fileStats, err := ss.f.Fd.Stat() + utils.Panic(err) + return fileStats.Size() +} + +// GetCreatedAt _ +func (ss *SSTable) GetCreatedAt() *time.Time { + return &ss.createdAt +} + +// SetCreatedAt _ +func (ss *SSTable) SetCreatedAt(t *time.Time) { + ss.createdAt = *t +} + +// Detele _ +func (ss *SSTable) Detele() error { + return ss.f.Delete() +} + +// Truncature _ +func (ss *SSTable) Truncature(size int64) error { + return ss.f.Truncature(size) +} diff --git a/file/sstable_linux.go b/file/sstable_linux.go new file mode 100644 index 0000000..fd63f9e --- /dev/null +++ b/file/sstable_linux.go @@ -0,0 +1,196 @@ +// +build linux + +// Copyright 2021 hardcore-os Project Authors +// +// Licensed under the Apache License, Version 2.0 (the "License") +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package file + +import ( + "io" + "os" + "sync" + "syscall" + "time" + + "github.com/golang/protobuf/proto" + "github.com/hardcore-os/corekv/pb" + "github.com/hardcore-os/corekv/utils" + "github.com/pkg/errors" +) + +// SSTable 文件的内存封装 +type SSTable struct { + lock *sync.RWMutex + f *MmapFile + maxKey []byte + minKey []byte + idxTables *pb.TableIndex + hasBloomFilter bool + idxLen int + idxStart int + fid uint64 + createdAt time.Time +} + +// OpenSStable 打开一个 sst文件 +func OpenSStable(opt *Options) *SSTable { + omf, err := OpenMmapFile(opt.FileName, os.O_CREATE|os.O_RDWR, opt.MaxSz) + utils.Err(err) + return &SSTable{f: omf, fid: opt.FID, lock: &sync.RWMutex{}} +} + +// Init 初始化 +func (ss *SSTable) Init() error { + var ko *pb.BlockOffset + var err error + if ko, err = ss.initTable(); err != nil { + return err + } + // 从文件中获取创建时间 + stat, _ := ss.f.Fd.Stat() + statType := stat.Sys().(*syscall.Stat_t) + ss.createdAt = time.Unix(statType.Ctim.Sec, statType.Ctim.Nsec) + // init min key + keyBytes := ko.GetKey() + minKey := make([]byte, len(keyBytes)) + copy(minKey, keyBytes) + ss.minKey = minKey + ss.maxKey = minKey + return nil +} + +// SetMaxKey max 需要使用table的迭代器,来获取最后一个block的最后一个key +func (ss *SSTable) SetMaxKey(maxKey []byte) { + ss.maxKey = maxKey +} +func (ss *SSTable) initTable() (bo *pb.BlockOffset, err error) { + readPos := len(ss.f.Data) + + // Read checksum len from the last 4 bytes. + readPos -= 4 + buf := ss.readCheckError(readPos, 4) + checksumLen := int(utils.BytesToU32(buf)) + if checksumLen < 0 { + return nil, errors.New("checksum length less than zero. Data corrupted") + } + + // Read checksum. + readPos -= checksumLen + expectedChk := ss.readCheckError(readPos, checksumLen) + + // Read index size from the footer. + readPos -= 4 + buf = ss.readCheckError(readPos, 4) + ss.idxLen = int(utils.BytesToU32(buf)) + + // Read index. 
+ readPos -= ss.idxLen + ss.idxStart = readPos + data := ss.readCheckError(readPos, ss.idxLen) + if err := utils.VerifyChecksum(data, expectedChk); err != nil { + return nil, errors.Wrapf(err, "failed to verify checksum for table: %s", ss.f.Fd.Name()) + } + indexTable := &pb.TableIndex{} + if err := proto.Unmarshal(data, indexTable); err != nil { + return nil, err + } + ss.idxTables = indexTable + + ss.hasBloomFilter = len(indexTable.BloomFilter) > 0 + if len(indexTable.GetOffsets()) > 0 { + return indexTable.GetOffsets()[0], nil + } + return nil, errors.New("read index fail, offset is nil") +} + +// Close 关闭 +func (ss *SSTable) Close() error { + return ss.f.Close() +} + +// Indexs _ +func (ss *SSTable) Indexs() *pb.TableIndex { + return ss.idxTables +} + +// MaxKey 当前最大的key +func (ss *SSTable) MaxKey() []byte { + return ss.maxKey +} + +// MinKey 当前最小的key +func (ss *SSTable) MinKey() []byte { + return ss.minKey +} + +// FID 获取fid +func (ss *SSTable) FID() uint64 { + return ss.fid +} + +// HasBloomFilter _ +func (ss *SSTable) HasBloomFilter() bool { + return ss.hasBloomFilter +} + +func (ss *SSTable) read(off, sz int) ([]byte, error) { + if len(ss.f.Data) > 0 { + if len(ss.f.Data[off:]) < sz { + return nil, io.EOF + } + return ss.f.Data[off : off+sz], nil + } + + res := make([]byte, sz) + _, err := ss.f.Fd.ReadAt(res, int64(off)) + return res, err +} +func (ss *SSTable) readCheckError(off, sz int) []byte { + buf, err := ss.read(off, sz) + utils.Panic(err) + return buf +} + +// Bytes returns data starting from offset off of size sz. If there's not enough data, it would +// return nil slice and io.EOF. 
+func (ss *SSTable) Bytes(off, sz int) ([]byte, error) { + return ss.f.Bytes(off, sz) +} + +// Size 返回底层文件的尺寸 +func (ss *SSTable) Size() int64 { + fileStats, err := ss.f.Fd.Stat() + utils.Panic(err) + return fileStats.Size() +} + +// GetCreatedAt _ +func (ss *SSTable) GetCreatedAt() *time.Time { + return &ss.createdAt +} + +// SetCreatedAt _ +func (ss *SSTable) SetCreatedAt(t *time.Time) { + ss.createdAt = *t +} + +// Detele _ +func (ss *SSTable) Detele() error { + return ss.f.Delete() +} + +// Truncature _ +func (ss *SSTable) Truncature(size int64) error { + return ss.f.Truncature(size) +} diff --git a/file/vlog.go b/file/vlog.go index b691ba5..e134f32 100644 --- a/file/vlog.go +++ b/file/vlog.go @@ -1 +1,187 @@ package file + +import ( + "bytes" + "encoding/binary" + "fmt" + "hash/crc32" + "io" + "math" + "os" + "sync" + "sync/atomic" + + "github.com/hardcore-os/corekv/utils" + "github.com/pkg/errors" +) + +type LogFile struct { + Lock sync.RWMutex + FID uint32 + size uint32 + f *MmapFile +} + +func (lf *LogFile) Open(opt *Options) error { + var err error + lf.FID = uint32(opt.FID) + lf.Lock = sync.RWMutex{} + lf.f, err = OpenMmapFile(opt.FileName, os.O_CREATE|os.O_RDWR, opt.MaxSz) + utils.Panic2(nil, err) + fi, err := lf.f.Fd.Stat() + if err != nil { + return utils.WarpErr("Unable to run file.Stat", err) + } + // 获取文件尺寸 + sz := fi.Size() + utils.CondPanic(sz > math.MaxUint32, fmt.Errorf("file size: %d greater than %d", + uint32(sz), uint32(math.MaxUint32))) + lf.size = uint32(sz) + // TODO 是否要在这里弄一个header放一些元数据呢? + return nil +} + +// Acquire lock on mmap/file if you are calling this +func (lf *LogFile) Read(p *utils.ValuePtr) (buf []byte, err error) { + offset := p.Offset + // Do not convert size to uint32, because the lf.fmap can be of size + // 4GB, which overflows the uint32 during conversion to make the size 0, + // causing the read to fail with ErrEOF. See issue #585. 
+ size := int64(len(lf.f.Data)) + valsz := p.Len + lfsz := atomic.LoadUint32(&lf.size) + if int64(offset) >= size || int64(offset+valsz) > size || + // Ensure that the read is within the file's actual size. It might be possible that + // the offset+valsz length is beyond the file's actual size. This could happen when + // dropAll and iterations are running simultaneously. + int64(offset+valsz) > int64(lfsz) { + err = io.EOF + } else { + buf, err = lf.f.Bytes(int(offset), int(valsz)) + } + return buf, err +} + +func (lf *LogFile) DoneWriting(offset uint32) error { + // Sync before acquiring lock. (We call this from write() and thus know we have shared access + // to the fd.) + if err := lf.f.Sync(); err != nil { + return errors.Wrapf(err, "Unable to sync value log: %q", lf.FileName()) + } + + // 写嘛 总是要锁一下的 + lf.Lock.Lock() + defer lf.Lock.Unlock() + + // TODO: Confirm if we need to run a file sync after truncation. + // Truncation must run after unmapping, otherwise Windows would crap itself. + if err := lf.f.Truncature(int64(offset)); err != nil { + return errors.Wrapf(err, "Unable to truncate file: %q", lf.FileName()) + } + + // Reinitialize the log file. This will mmap the entire file. + if err := lf.Init(); err != nil { + return errors.Wrapf(err, "failed to initialize file %s", lf.FileName()) + } + + // Previously we used to close the file after it was written and reopen it in read-only mode. + // We no longer open files in read-only mode. We keep all vlog files open in read-write mode. 
+ return nil +} +func (lf *LogFile) Write(offset uint32, buf []byte) (err error) { + return lf.f.AppendBuffer(offset, buf) +} +func (lf *LogFile) Truncate(offset int64) error { + return lf.f.Truncature(offset) +} +func (lf *LogFile) Close() error { + return lf.f.Close() +} + +func (lf *LogFile) Size() int64 { + return int64(atomic.LoadUint32(&lf.size)) +} +func (lf *LogFile) AddSize(offset uint32) { + atomic.StoreUint32(&lf.size, offset) +} + +// 完成log文件的初始化 +func (lf *LogFile) Bootstrap() error { + // TODO 是否需要初始化一些内容给vlog文件? + return nil +} + +func (lf *LogFile) Init() error { + fstat, err := lf.f.Fd.Stat() + if err != nil { + return errors.Wrapf(err, "Unable to check stat for %q", lf.FileName()) + } + sz := fstat.Size() + if sz == 0 { + // File is empty. We don't need to mmap it. Return. + return nil + } + utils.CondPanic(sz > math.MaxUint32, fmt.Errorf("[LogFile.Init] sz > math.MaxUint32")) + lf.size = uint32(sz) + return nil +} +func (lf *LogFile) FileName() string { + return lf.f.Fd.Name() +} + +func (lf *LogFile) Seek(offset int64, whence int) (ret int64, err error) { + return lf.f.Fd.Seek(offset, whence) +} + +func (lf *LogFile) FD() *os.File { + return lf.f.Fd +} + +// You must hold lf.lock to sync() +func (lf *LogFile) Sync() error { + return lf.f.Sync() +} + +// encodeEntry will encode entry to the buf +// layout of entry +// +--------+-----+-------+-------+ +// | header | key | value | crc32 | +// +--------+-----+-------+-------+ +func (lf *LogFile) EncodeEntry(e *utils.Entry, buf *bytes.Buffer, offset uint32) (int, error) { + h := utils.Header{ + KLen: uint32(len(e.Key)), + VLen: uint32(len(e.Value)), + ExpiresAt: e.ExpiresAt, + Meta: e.Meta, + } + + hash := crc32.New(utils.CastagnoliCrcTable) + writer := io.MultiWriter(buf, hash) + + // encode header. + var headerEnc [utils.MaxHeaderSize]byte + sz := h.Encode(headerEnc[:]) + utils.Panic2(writer.Write(headerEnc[:sz])) + // Encryption is disabled so writing directly to the buffer. 
+ utils.Panic2(writer.Write(e.Key)) + utils.Panic2(writer.Write(e.Value)) + // write crc32 hash. + var crcBuf [crc32.Size]byte + binary.BigEndian.PutUint32(crcBuf[:], hash.Sum32()) + utils.Panic2(buf.Write(crcBuf[:])) + // return encoded length. + return len(headerEnc[:sz]) + len(e.Key) + len(e.Value) + len(crcBuf), nil +} +func (lf *LogFile) DecodeEntry(buf []byte, offset uint32) (*utils.Entry, error) { + var h utils.Header + hlen := h.Decode(buf) + kv := buf[hlen:] + e := &utils.Entry{ + Meta: h.Meta, + ExpiresAt: h.ExpiresAt, + Offset: offset, + Key: kv[:h.KLen], + Value: kv[h.KLen : h.KLen+h.VLen], + } + return e, nil +} diff --git a/file/wal.go b/file/wal.go index 70d745b..b4123ba 100644 --- a/file/wal.go +++ b/file/wal.go @@ -1,24 +1,194 @@ +// Copyright 2021 hardcore-os Project Authors +// +// Licensed under the Apache License, Version 2.0 (the "License") +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ package file -import "github.com/hardcore-os/corekv/utils/codec" +import ( + "bufio" + "bytes" + "fmt" + "hash/crc32" + "io" + "os" + "sync" + + "github.com/hardcore-os/corekv/utils" + "github.com/pkg/errors" +) +// WalFile _ type WalFile struct { - f *MockFile + lock *sync.RWMutex + f *MmapFile + opts *Options + buf *bytes.Buffer + size uint32 + writeAt uint32 } -// WalFile +// Fid _ +func (wf *WalFile) Fid() uint64 { + return wf.opts.FID +} + +// Close _ func (wf *WalFile) Close() error { + fileName := wf.f.Fd.Name() if err := wf.f.Close(); err != nil { return err } - return nil + return os.Remove(fileName) +} + +// Name _ +func (wf *WalFile) Name() string { + return wf.f.Fd.Name() +} + +// Size 当前已经被写入的数据 +func (wf *WalFile) Size() uint32 { + return wf.writeAt } -func OpenWalFile(opt *Options) *WalFile { return &WalFile{f: OpenMockFile(opt)} } -func (wf *WalFile) Write(entry *codec.Entry) error { +// OpenWalFile _ +func OpenWalFile(opt *Options) *WalFile { + omf, err := OpenMmapFile(opt.FileName, os.O_CREATE|os.O_RDWR, opt.MaxSz) + wf := &WalFile{f: omf, lock: &sync.RWMutex{}, opts: opt} + wf.buf = &bytes.Buffer{} + wf.size = uint32(len(wf.f.Data)) + utils.Err(err) + return wf +} + +func (wf *WalFile) Write(entry *utils.Entry) error { // 落预写日志简单的同步写即可 // 序列化为磁盘结构 - walData := codec.WalCodec(entry) - _, err := wf.f.Write(walData) - return err + wf.lock.Lock() + plen := utils.WalCodec(wf.buf, entry) + buf := wf.buf.Bytes() + utils.Panic(wf.f.AppendBuffer(wf.writeAt, buf)) + wf.writeAt += uint32(plen) + wf.lock.Unlock() + return nil +} + +// Iterate 从磁盘中遍历wal,获得数据 +func (wf *WalFile) Iterate(readOnly bool, offset uint32, fn utils.LogEntry) (uint32, error) { + // For now, read directly from file, because it allows + reader := bufio.NewReader(wf.f.NewReader(int(offset))) + read := SafeRead{ + K: make([]byte, 10), + V: make([]byte, 10), + RecordOffset: offset, + LF: wf, + } + var validEndOffset uint32 = offset +loop: + for { + e, err := read.MakeEntry(reader) + 
switch { + case err == io.EOF: + break loop + case err == io.ErrUnexpectedEOF || err == utils.ErrTruncate: + break loop + case err != nil: + return 0, err + case e.IsZero(): + break loop + } + + var vp utils.ValuePtr // 给kv分离的设计留下扩展,可以不用考虑其作用 + size := uint32(int(e.LogHeaderLen()) + len(e.Key) + len(e.Value) + crc32.Size) + read.RecordOffset += size + validEndOffset = read.RecordOffset + if err := fn(e, &vp); err != nil { + if err == utils.ErrStop { + break + } + return 0, errors.WithMessage(err, "Iteration function") + } + } + return validEndOffset, nil +} + +// Truncate _ +// TODO Truncate 函数 +func (wf *WalFile) Truncate(end int64) error { + if end <= 0 { + return nil + } + if fi, err := wf.f.Fd.Stat(); err != nil { + return fmt.Errorf("while file.stat on file: %s, error: %v\n", wf.Name(), err) + } else if fi.Size() == end { + return nil + } + wf.size = uint32(end) + return wf.f.Truncature(end) +} + +// 封装kv分离的读操作 +type SafeRead struct { + K []byte + V []byte + + RecordOffset uint32 + LF *WalFile +} + +// MakeEntry _ +func (r *SafeRead) MakeEntry(reader io.Reader) (*utils.Entry, error) { + tee := utils.NewHashReader(reader) + var h utils.WalHeader + hlen, err := h.Decode(tee) + if err != nil { + return nil, err + } + if h.KeyLen > uint32(1<<16) { // Key length must be below uint16. 
+ return nil, utils.ErrTruncate + } + kl := int(h.KeyLen) + if cap(r.K) < kl { + r.K = make([]byte, 2*kl) + } + vl := int(h.ValueLen) + if cap(r.V) < vl { + r.V = make([]byte, 2*vl) + } + + e := &utils.Entry{} + e.Offset = r.RecordOffset + e.Hlen = hlen + buf := make([]byte, h.KeyLen+h.ValueLen) + if _, err := io.ReadFull(tee, buf[:]); err != nil { + if err == io.EOF { + err = utils.ErrTruncate + } + return nil, err + } + e.Key = buf[:h.KeyLen] + e.Value = buf[h.KeyLen:] + var crcBuf [crc32.Size]byte + if _, err := io.ReadFull(reader, crcBuf[:]); err != nil { + if err == io.EOF { + err = utils.ErrTruncate + } + return nil, err + } + crc := utils.BytesToU32(crcBuf[:]) + if crc != tee.Sum32() { + return nil, utils.ErrTruncate + } + e.ExpiresAt = h.ExpiresAt + return e, nil } diff --git a/gen.sh b/gen.sh new file mode 100755 index 0000000..0b5f449 --- /dev/null +++ b/gen.sh @@ -0,0 +1,17 @@ +#!/usr/bin/env bash +### Copyright hardcore-os Project Authors +### + # Licensed under the Apache License, Version 2.0 (the "License") + # you may not use this file except in compliance with the License. + # You may obtain a copy of the License at + # + # http://www.apache.org/licenses/LICENSE-2.0 + # + # Unless required by applicable law or agreed to in writing, software + # distributed under the License is distributed on an "AS IS" BASIS, + # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + # See the License for the specific language governing permissions and + # limitations under the License. 
+protoDir="pb" +outDir="pb" +protoc -I ${protoDir}/ ${protoDir}/pb.proto --gofast_out=plugins=grpc:${outDir} \ No newline at end of file diff --git a/go.mod b/go.mod index 87c7369..4fd9730 100644 --- a/go.mod +++ b/go.mod @@ -2,4 +2,13 @@ module github.com/hardcore-os/corekv go 1.16 -require github.com/stretchr/testify v1.7.0 +require ( + github.com/cespare/xxhash/v2 v2.1.2 + github.com/davecgh/go-spew v1.1.1 // indirect + github.com/golang/protobuf v1.5.2 + github.com/pkg/errors v0.9.1 + github.com/stretchr/testify v1.7.0 + golang.org/x/sys v0.0.0-20210910150752-751e447fb3d0 + golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1 // indirect + google.golang.org/protobuf v1.27.1 // indirect +) diff --git a/go.sum b/go.sum index b380ae4..14b8728 100644 --- a/go.sum +++ b/go.sum @@ -1,10 +1,30 @@ -github.com/davecgh/go-spew v1.1.0 h1:ZDRjVQ15GmhC3fiQ8ni8+OwkZQO4DARzQgrnXU1Liz8= +github.com/cespare/xxhash/v2 v2.1.2 h1:YRXhKfTDauu4ajMg1TPgFO5jnlC2HCbmLXMcTG5cbYE= +github.com/cespare/xxhash/v2 v2.1.2/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/golang/protobuf v1.5.0/go.mod h1:FsONVRAS9T7sI+LIUmWTfcYkHO4aIWwzhcaSAoJOfIk= +github.com/golang/protobuf v1.5.2 h1:ROPKBNFfQgOUMifHyP+KYbvpjbdoFNs+aK7DXlji0Tw= +github.com/golang/protobuf v1.5.2/go.mod h1:XVQd3VNwM+JqD3oG2Ue2ip4fOMUkwXdXDdiuN0vRsmY= +github.com/google/go-cmp v0.5.5 h1:Khx7svrCpmxxtHBq5j2mp/xVjsi8hQMfNLvJFAlrGgU= +github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= +github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= +github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= github.com/pmezard/go-difflib v1.0.0 
h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/testify v1.7.0 h1:nwc3DEeHmmLAfoZucVR881uASk0Mfjw8xYJ99tb5CcY= github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= +golang.org/x/sys v0.0.0-20210910150752-751e447fb3d0 h1:xrCZDmdtoloIiooiA9q0OQb9r8HejIHYoHGhGCe1pGg= +golang.org/x/sys v0.0.0-20210910150752-751e447fb3d0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1 h1:go1bK/D/BFZV2I8cIQd1NKEZ+0owSTG1fDTci4IqFcE= +golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +google.golang.org/protobuf v1.26.0-rc.1/go.mod h1:jlhhOSvTdKEhbULTjvd4ARK9grFBp09yW+WbY/TyQbw= +google.golang.org/protobuf v1.26.0/go.mod h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQnmE0givc= +google.golang.org/protobuf v1.27.1 h1:SnqbnDw1V7RiZcXPx5MEeqPv2s79L9i7BJUlG/+RurQ= +google.golang.org/protobuf v1.27.1/go.mod h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQnmE0givc= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c h1:dUUwHk2QECo/6vqA44rthZ8ie2QXMNeKRTHCNY2nXvo= gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= diff --git a/iterator.go b/iterator.go index d13ace2..465de15 100644 --- a/iterator.go +++ b/iterator.go @@ -1,39 +1,93 @@ +// Copyright 2021 logicrec Project Authors +// +// Licensed under the Apache License, Version 2.0 (the "License") +// you may not use this file except in 
compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + package corekv import ( - "github.com/hardcore-os/corekv/iterator" - "github.com/hardcore-os/corekv/utils/codec" + "github.com/hardcore-os/corekv/lsm" + "github.com/hardcore-os/corekv/utils" ) type DBIterator struct { - iters []iterator.Iterator + iitr utils.Iterator + vlog *valueLog } type Item struct { - e *codec.Entry + e *utils.Entry } -func (it *Item) Entry() *codec.Entry { +func (it *Item) Entry() *utils.Entry { return it.e } -func (db *DB) NewIterator(opt *iterator.Options) iterator.Iterator { - dbIter := &DBIterator{} - dbIter.iters = make([]iterator.Iterator, 0) - dbIter.iters = append(dbIter.iters, db.lsm.NewIterator(opt)) - return dbIter +func (db *DB) NewIterator(opt *utils.Options) utils.Iterator { + iters := make([]utils.Iterator, 0) + iters = append(iters, db.lsm.NewIterators(opt)...) 
+
+	res := &DBIterator{
+		vlog: db.vlog,
+		iitr: lsm.NewMergeIterator(iters, opt.IsAsc),
+	}
+	return res
 }
 
 func (iter *DBIterator) Next() {
-	iter.iters[0].Next()
+	iter.iitr.Next()
+	for ; iter.Valid() && iter.Item() == nil; iter.iitr.Next() {
+	}
 }
 func (iter *DBIterator) Valid() bool {
-	return iter.iters[0].Valid()
+	return iter.iitr.Valid()
 }
 func (iter *DBIterator) Rewind() {
-	iter.iters[0].Rewind()
+	iter.iitr.Rewind()
+	for ; iter.Valid() && iter.Item() == nil; iter.iitr.Next() {
+	}
 }
-func (iter *DBIterator) Item() iterator.Item {
-	return iter.iters[0].Item()
+func (iter *DBIterator) Item() utils.Item {
+	// If the entry from the LSM holds a value pointer, fetch the real value from the vlog.
+	e := iter.iitr.Item().Entry()
+	var value []byte
+
+	if e != nil && utils.IsValuePtr(e) {
+		var vp utils.ValuePtr
+		vp.Decode(e.Value)
+		result, cb, err := iter.vlog.read(&vp)
+		defer utils.RunCallback(cb)
+		if err != nil {
+			return nil
+		}
+		value = utils.SafeCopy(nil, result)
+	}
+
+	if e == nil || e.IsDeletedOrExpired() || value == nil { // nil guard: merge iterator may yield no entry; avoid nil deref below
+		return nil
+	}
+
+	res := &utils.Entry{
+		Key:          e.Key,
+		Value:        value,
+		ExpiresAt:    e.ExpiresAt,
+		Meta:         e.Meta,
+		Version:      e.Version,
+		Offset:       e.Offset,
+		Hlen:         e.Hlen,
+		ValThreshold: e.ValThreshold,
+	}
+	return res
 }
 
 func (iter *DBIterator) Close() error {
-	return nil
+	return iter.iitr.Close()
+}
+func (iter *DBIterator) Seek(key []byte) {
 }
diff --git a/iterator/iterator.go b/iterator/iterator.go
deleted file mode 100644
index f6c710a..0000000
--- a/iterator/iterator.go
+++ /dev/null
@@ -1,19 +0,0 @@
-package iterator
-
-import "github.com/hardcore-os/corekv/utils/codec"
-
-// 迭代器
-type Iterator interface {
-	Next()
-	Valid() bool
-	Rewind()
-	Item() Item
-	Close() error
-}
-type Item interface {
-	Entry() *codec.Entry
-}
-type Options struct {
-	Prefix []byte
-	IsAsc  bool
-}
diff --git a/lsm/builder.go b/lsm/builder.go
new file mode 100644
index 0000000..a6d9c80
--- /dev/null
+++ b/lsm/builder.go
@@ -0,0 +1,479 @@
+// Copyright 2021 hardcore-os Project Authors
+//
+// 
Licensed under the Apache License, Version 2.0 (the "License") +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +package lsm + +import ( + "bytes" + "errors" + "fmt" + "io" + "math" + "os" + "sort" + "unsafe" + + "github.com/hardcore-os/corekv/file" + "github.com/hardcore-os/corekv/pb" + "github.com/hardcore-os/corekv/utils" +) + +type tableBuilder struct { + sstSize int64 + curBlock *block + opt *Options + blockList []*block + keyCount uint32 + keyHashes []uint32 + maxVersion uint64 + baseKey []byte + staleDataSize int + estimateSz int64 +} +type buildData struct { + blockList []*block + index []byte + checksum []byte + size int +} +type block struct { + offset int //当前block的offset 首地址 + checksum []byte + entriesIndexStart int + chkLen int + data []byte + baseKey []byte + entryOffsets []uint32 + end int + estimateSz int64 +} + +type header struct { + overlap uint16 // Overlap with base key. + diff uint16 // Length of the diff. +} + +const headerSize = uint16(unsafe.Sizeof(header{})) + +// Decode decodes the header. +func (h *header) decode(buf []byte) { + copy(((*[headerSize]byte)(unsafe.Pointer(h))[:]), buf[:headerSize]) +} + +func (h header) encode() []byte { + var b [4]byte + *(*header)(unsafe.Pointer(&b[0])) = h + return b[:] +} + +func (tb *tableBuilder) add(e *utils.Entry, isStale bool) { + key := e.Key + val := utils.ValueStruct{ + Meta: e.Meta, + Value: e.Value, + ExpiresAt: e.ExpiresAt, + } + // 检查是否需要分配一个新的 block + if tb.tryFinishBlock(e) { + if isStale { + // This key will be added to tableIndex and it is stale. 
+ tb.staleDataSize += len(key) + 4 /* len */ + 4 /* offset */ + } + tb.finishBlock() + // Create a new block and start writing. + tb.curBlock = &block{ + data: make([]byte, tb.opt.BlockSize), // TODO 加密block后块的大小会增加,需要预留一些填充位置 + } + } + tb.keyHashes = append(tb.keyHashes, utils.Hash(utils.ParseKey(key))) + + if version := utils.ParseTs(key); version > tb.maxVersion { + tb.maxVersion = version + } + + var diffKey []byte + if len(tb.curBlock.baseKey) == 0 { + tb.curBlock.baseKey = append(tb.curBlock.baseKey[:0], key...) + diffKey = key + } else { + diffKey = tb.keyDiff(key) + } + utils.CondPanic(!(len(key)-len(diffKey) <= math.MaxUint16), fmt.Errorf("tableBuilder.add: len(key)-len(diffKey) <= math.MaxUint16")) + utils.CondPanic(!(len(diffKey) <= math.MaxUint16), fmt.Errorf("tableBuilder.add: len(diffKey) <= math.MaxUint16")) + + h := header{ + overlap: uint16(len(key) - len(diffKey)), + diff: uint16(len(diffKey)), + } + + tb.curBlock.entryOffsets = append(tb.curBlock.entryOffsets, uint32(tb.curBlock.end)) + + tb.append(h.encode()) + tb.append(diffKey) + + dst := tb.allocate(int(val.EncodedSize())) + val.EncodeValue(dst) +} +func newTableBuilerWithSSTSize(opt *Options, size int64) *tableBuilder { + return &tableBuilder{ + opt: opt, + sstSize: size, + } +} +func newTableBuiler(opt *Options) *tableBuilder { + return &tableBuilder{ + opt: opt, + sstSize: opt.SSTableMaxSz, + } +} + +// Empty returns whether it's empty. 
+func (tb *tableBuilder) empty() bool { return len(tb.keyHashes) == 0 }
+
+func (tb *tableBuilder) finish() []byte {
+	bd := tb.done()
+	buf := make([]byte, bd.size)
+	written := bd.Copy(buf)
+	utils.CondPanic(written != len(buf), fmt.Errorf("tableBuilder.finish written != len(buf)")) // panic on short copy, matching the check in flush()
+	return buf
+}
+func (tb *tableBuilder) tryFinishBlock(e *utils.Entry) bool {
+	if tb.curBlock == nil {
+		return true
+	}
+
+	if len(tb.curBlock.entryOffsets) <= 0 {
+		return false
+	}
+	utils.CondPanic(!((uint32(len(tb.curBlock.entryOffsets))+1)*4+4+8+4 < math.MaxUint32), errors.New("Integer overflow"))
+	entriesOffsetsSize := int64((len(tb.curBlock.entryOffsets)+1)*4 +
+		4 + // size of list
+		8 + // Sum64 in checksum proto
+		4) // checksum length
+	tb.curBlock.estimateSz = int64(tb.curBlock.end) + int64(6 /*header size for entry*/) +
+		int64(len(e.Key)) + int64(e.EncodedSize()) + entriesOffsetsSize
+
+	// Integer overflow check for table size.
+	utils.CondPanic(!(uint64(tb.curBlock.end)+uint64(tb.curBlock.estimateSz) < math.MaxUint32), errors.New("Integer overflow"))
+
+	return tb.curBlock.estimateSz > int64(tb.opt.BlockSize)
+}
+
+// AddStaleKey records the space occupied by a stale key, used for compaction decisions.
+func (tb *tableBuilder) AddStaleKey(e *utils.Entry) {
+	// Rough estimate based on how much space it will occupy in the SST.
+	tb.staleDataSize += len(e.Key) + len(e.Value) + 4 /* entry offset */ + 4 /* header size */
+	tb.add(e, true)
+}
+
+// AddKey _
+func (tb *tableBuilder) AddKey(e *utils.Entry) {
+	tb.add(e, false)
+}
+
+// Close closes the TableBuilder.
+func (tb *tableBuilder) Close() {
+	// TODO: integrate with a memory allocator
+}
+func (tb *tableBuilder) finishBlock() {
+	if tb.curBlock == nil || len(tb.curBlock.entryOffsets) == 0 {
+		return
+	}
+	// Append the entryOffsets and its length.
+	tb.append(utils.U32SliceToBytes(tb.curBlock.entryOffsets))
+	tb.append(utils.U32ToBytes(uint32(len(tb.curBlock.entryOffsets))))
+
+	checksum := tb.calculateChecksum(tb.curBlock.data[:tb.curBlock.end])
+
+	// Append the block checksum and its length. 
+ tb.append(checksum) + tb.append(utils.U32ToBytes(uint32(len(checksum)))) + tb.estimateSz += tb.curBlock.estimateSz + tb.blockList = append(tb.blockList, tb.curBlock) + // TODO: 预估整理builder写入磁盘后,sst文件的大小 + tb.keyCount += uint32(len(tb.curBlock.entryOffsets)) + tb.curBlock = nil // 表示当前block 已经被序列化到内存 + return +} + +// append appends to curBlock.data +func (tb *tableBuilder) append(data []byte) { + dst := tb.allocate(len(data)) + utils.CondPanic(len(data) != copy(dst, data), errors.New("tableBuilder.append data")) +} + +func (tb *tableBuilder) allocate(need int) []byte { + bb := tb.curBlock + if len(bb.data[bb.end:]) < need { + // We need to reallocate. + sz := 2 * len(bb.data) + if bb.end+need > sz { + sz = bb.end + need + } + tmp := make([]byte, sz) // todo 这里可以使用内存分配器来提升性能 + copy(tmp, bb.data) + bb.data = tmp + } + bb.end += need + return bb.data[bb.end-need : bb.end] +} + +func (tb *tableBuilder) calculateChecksum(data []byte) []byte { + checkSum := utils.CalculateChecksum(data) + return utils.U64ToBytes(checkSum) +} + +func (tb *tableBuilder) keyDiff(newKey []byte) []byte { + var i int + for i = 0; i < len(newKey) && i < len(tb.curBlock.baseKey); i++ { + if newKey[i] != tb.curBlock.baseKey[i] { + break + } + } + return newKey[i:] +} + +// TODO: 这里存在多次的用户空间拷贝过程,需要优化 +func (tb *tableBuilder) flush(lm *levelManager, tableName string) (t *table, err error) { + bd := tb.done() + t = &table{lm: lm, fid: utils.FID(tableName)} + // 如果没有builder 则创打开一个已经存在的sst文件 + t.ss = file.OpenSStable(&file.Options{ + FileName: tableName, + Dir: lm.opt.WorkDir, + Flag: os.O_CREATE | os.O_RDWR, + MaxSz: int(bd.size)}) + buf := make([]byte, bd.size) + written := bd.Copy(buf) + utils.CondPanic(written != len(buf), fmt.Errorf("tableBuilder.flush written != len(buf)")) + dst, err := t.ss.Bytes(0, bd.size) + if err != nil { + return nil, err + } + copy(dst, buf) + return t, nil +} + +func (bd *buildData) Copy(dst []byte) int { + var written int + for _, bl := range bd.blockList { + written 
+= copy(dst[written:], bl.data[:bl.end]) + } + written += copy(dst[written:], bd.index) + written += copy(dst[written:], utils.U32ToBytes(uint32(len(bd.index)))) + + written += copy(dst[written:], bd.checksum) + written += copy(dst[written:], utils.U32ToBytes(uint32(len(bd.checksum)))) + return written +} + +func (tb *tableBuilder) done() buildData { + tb.finishBlock() + if len(tb.blockList) == 0 { + return buildData{} + } + bd := buildData{ + blockList: tb.blockList, + } + + var f utils.Filter + if tb.opt.BloomFalsePositive > 0 { + bits := utils.BloomBitsPerKey(len(tb.keyHashes), tb.opt.BloomFalsePositive) + f = utils.NewFilter(tb.keyHashes, bits) + } + // TODO 构建 sst的索引 + index, dataSize := tb.buildIndex(f) + checksum := tb.calculateChecksum(index) + bd.index = index + bd.checksum = checksum + bd.size = int(dataSize) + len(index) + len(checksum) + 4 + 4 + return bd +} + +func (tb *tableBuilder) buildIndex(bloom []byte) ([]byte, uint32) { + tableIndex := &pb.TableIndex{} + if len(bloom) > 0 { + tableIndex.BloomFilter = bloom + } + tableIndex.KeyCount = tb.keyCount + tableIndex.MaxVersion = tb.maxVersion + tableIndex.Offsets = tb.writeBlockOffsets(tableIndex) + var dataSize uint32 + for i := range tb.blockList { + dataSize += uint32(tb.blockList[i].end) + } + data, err := tableIndex.Marshal() + utils.Panic(err) + return data, dataSize +} + +func (tb *tableBuilder) writeBlockOffsets(tableIndex *pb.TableIndex) []*pb.BlockOffset { + var startOffset uint32 + var offsets []*pb.BlockOffset + for _, bl := range tb.blockList { + offset := tb.writeBlockOffset(bl, startOffset) + offsets = append(offsets, offset) + startOffset += uint32(bl.end) + } + return offsets +} + +func (b *tableBuilder) writeBlockOffset(bl *block, startOffset uint32) *pb.BlockOffset { + offset := &pb.BlockOffset{} + offset.Key = bl.baseKey + offset.Len = uint32(bl.end) + offset.Offset = startOffset + return offset +} + +// TODO: 如何能更好的预估builder的长度呢? 
+func (b *tableBuilder) ReachedCapacity() bool { + return b.estimateSz > b.sstSize +} + +func (b block) verifyCheckSum() error { + return utils.VerifyChecksum(b.data, b.checksum) +} + +type blockIterator struct { + data []byte + idx int + err error + baseKey []byte + key []byte + val []byte + entryOffsets []uint32 + block *block + + tableID uint64 + blockID int + + prevOverlap uint16 + + it utils.Item +} + +func (itr *blockIterator) setBlock(b *block) { + itr.block = b + itr.err = nil + itr.idx = 0 + itr.baseKey = itr.baseKey[:0] + itr.prevOverlap = 0 + itr.key = itr.key[:0] + itr.val = itr.val[:0] + // Drop the index from the block. We don't need it anymore. + itr.data = b.data[:b.entriesIndexStart] + itr.entryOffsets = b.entryOffsets +} + +// seekToFirst brings us to the first element. +func (itr *blockIterator) seekToFirst() { + itr.setIdx(0) +} +func (itr *blockIterator) seekToLast() { + itr.setIdx(len(itr.entryOffsets) - 1) +} +func (itr *blockIterator) seek(key []byte) { + itr.err = nil + startIndex := 0 // This tells from which index we should start binary search. + + foundEntryIdx := sort.Search(len(itr.entryOffsets), func(idx int) bool { + // If idx is less than start index then just return false. + if idx < startIndex { + return false + } + itr.setIdx(idx) + return utils.CompareKeys(itr.key, key) >= 0 + }) + itr.setIdx(foundEntryIdx) +} + +func (itr *blockIterator) setIdx(i int) { + itr.idx = i + if i >= len(itr.entryOffsets) || i < 0 { + itr.err = io.EOF + return + } + itr.err = nil + startOffset := int(itr.entryOffsets[i]) + + // Set base key. + if len(itr.baseKey) == 0 { + var baseHeader header + baseHeader.decode(itr.data) + itr.baseKey = itr.data[headerSize : headerSize+baseHeader.diff] + } + + var endOffset int + // idx points to the last entry in the block. + if itr.idx+1 == len(itr.entryOffsets) { + endOffset = len(itr.data) + } else { + // idx point to some entry other than the last one in the block. 
+ // EndOffset of the current entry is the start offset of the next entry. + endOffset = int(itr.entryOffsets[itr.idx+1]) + } + defer func() { + if r := recover(); r != nil { + var debugBuf bytes.Buffer + fmt.Fprintf(&debugBuf, "==== Recovered====\n") + fmt.Fprintf(&debugBuf, "Table ID: %d\nBlock ID: %d\nEntry Idx: %d\nData len: %d\n"+ + "StartOffset: %d\nEndOffset: %d\nEntryOffsets len: %d\nEntryOffsets: %v\n", + itr.tableID, itr.blockID, itr.idx, len(itr.data), startOffset, endOffset, + len(itr.entryOffsets), itr.entryOffsets) + panic(debugBuf.String()) + } + }() + + entryData := itr.data[startOffset:endOffset] + var h header + h.decode(entryData) + if h.overlap > itr.prevOverlap { + itr.key = append(itr.key[:itr.prevOverlap], itr.baseKey[itr.prevOverlap:h.overlap]...) + } + + itr.prevOverlap = h.overlap + valueOff := headerSize + h.diff + diffKey := entryData[headerSize:valueOff] + itr.key = append(itr.key[:h.overlap], diffKey...) + e := &utils.Entry{Key: itr.key} + val := &utils.ValueStruct{} + val.DecodeValue(entryData[valueOff:]) + itr.val = val.Value + e.Value = val.Value + e.ExpiresAt = val.ExpiresAt + e.Meta = val.Meta + itr.it = &Item{e: e} +} + +func (itr *blockIterator) Error() error { + return itr.err +} + +func (itr *blockIterator) Next() { + itr.setIdx(itr.idx + 1) +} + +func (itr *blockIterator) Valid() bool { + return itr.err != io.EOF // TODO 这里用err比较好 +} +func (itr *blockIterator) Rewind() bool { + itr.setIdx(0) + return true +} +func (itr *blockIterator) Item() utils.Item { + return itr.it +} +func (itr *blockIterator) Close() error { + return nil +} diff --git a/lsm/cache.go b/lsm/cache.go index 24b54b7..4a83c41 100644 --- a/lsm/cache.go +++ b/lsm/cache.go @@ -1,15 +1,34 @@ +// Copyright 2021 hardcore-os Project Authors +// +// Licensed under the Apache License, Version 2.0 (the "License") +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + package lsm -import "github.com/hardcore-os/corekv/utils" +import ( + coreCache "github.com/hardcore-os/corekv/utils/cache" +) type cache struct { - indexs *utils.CoreMap // key fid, value table - blocks *utils.CoreMap // key cacheID_blockOffset value block []byte + indexs *coreCache.Cache // key fid, value table + blocks *coreCache.Cache // key fid_blockOffset value block []byte } + type blockBuffer struct { b []byte } +const defaultCacheSize = 1024 + // close func (c *cache) close() error { return nil @@ -17,11 +36,10 @@ func (c *cache) close() error { // newCache func newCache(opt *Options) *cache { - return &cache{indexs: utils.NewMap(), blocks: utils.NewMap()} + return &cache{indexs: coreCache.NewCache(defaultCacheSize), blocks: coreCache.NewCache(defaultCacheSize)} } - // TODO fid 使用字符串是不是会有性能损耗 -func (c *cache) addIndex(fid string, t *table) { +func (c *cache) addIndex(fid uint64, t *table) { c.indexs.Set(fid, t) } diff --git a/lsm/compact.go b/lsm/compact.go new file mode 100644 index 0000000..1e9c33e --- /dev/null +++ b/lsm/compact.go @@ -0,0 +1,1164 @@ +// Copyright 2021 hardcore-os Project Authors +// +// Licensed under the Apache License, Version 2.0 (the "License") +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package lsm + +import ( + "bytes" + "errors" + "fmt" + "log" + "math" + "math/rand" + "sort" + "strings" + "sync" + "sync/atomic" + "time" + + "github.com/hardcore-os/corekv/pb" + "github.com/hardcore-os/corekv/utils" +) + +// 归并优先级 +type compactionPriority struct { + level int + score float64 + adjusted float64 + dropPrefixes [][]byte + t targets +} + +// 归并目标 +type targets struct { + baseLevel int + targetSz []int64 + fileSz []int64 +} +type compactDef struct { + compactorId int + t targets + p compactionPriority + thisLevel *levelHandler + nextLevel *levelHandler + + top []*table + bot []*table + + thisRange keyRange + nextRange keyRange + splits []keyRange + + thisSize int64 + + dropPrefixes [][]byte +} + +func (cd *compactDef) lockLevels() { + cd.thisLevel.RLock() + cd.nextLevel.RLock() +} + +func (cd *compactDef) unlockLevels() { + cd.nextLevel.RUnlock() + cd.thisLevel.RUnlock() +} + +// runCompacter 启动一个compacter +func (lm *levelManager) runCompacter(id int) { + defer lm.lsm.closer.Done() + randomDelay := time.NewTimer(time.Duration(rand.Int31n(1000)) * time.Millisecond) + select { + case <-randomDelay.C: + case <-lm.lsm.closer.CloseSignal: + randomDelay.Stop() + return + } + //TODO 这个值有待验证 + ticker := time.NewTicker(50000 * time.Millisecond) + defer ticker.Stop() + for { + select { + // Can add a done channel or other stuff. 
+ case <-ticker.C: + lm.runOnce(id) + case <-lm.lsm.closer.CloseSignal: + return + } + } +} + +// runOnce +func (lm *levelManager) runOnce(id int) bool { + prios := lm.pickCompactLevels() + if id == 0 { + // 0号协程 总是倾向于压缩l0层 + prios = moveL0toFront(prios) + } + for _, p := range prios { + if id == 0 && p.level == 0 { + // 对于l0 无论得分多少都要运行 + } else if p.adjusted < 1.0 { + // 对于其他level 如果等分小于 则不执行 + break + } + if lm.run(id, p) { + return true + } + } + return false +} +func moveL0toFront(prios []compactionPriority) []compactionPriority { + idx := -1 + for i, p := range prios { + if p.level == 0 { + idx = i + break + } + } + // If idx == -1, we didn't find L0. + // If idx == 0, then we don't need to do anything. L0 is already at the front. + if idx > 0 { + out := append([]compactionPriority{}, prios[idx]) + out = append(out, prios[:idx]...) + out = append(out, prios[idx+1:]...) + return out + } + return prios +} + +// run 执行一个优先级指定的合并任务 +func (lm *levelManager) run(id int, p compactionPriority) bool { + err := lm.doCompact(id, p) + switch err { + case nil: + return true + case utils.ErrFillTables: + // 什么也不做,此时合并过程被忽略 + default: + log.Printf("[taskID:%d] While running doCompact: %v\n ", id, err) + } + return false +} + +// doCompact 选择level的某些表合并到目标level +func (lm *levelManager) doCompact(id int, p compactionPriority) error { + l := p.level + utils.CondPanic(l >= lm.opt.MaxLevelNum, errors.New("[doCompact] Sanity check. l >= lm.opt.MaxLevelNum")) // Sanity check. 
+ if p.t.baseLevel == 0 { + p.t = lm.levelTargets() + } + // 创建真正的压缩计划 + cd := compactDef{ + compactorId: id, + p: p, + t: p.t, + thisLevel: lm.levels[l], + dropPrefixes: p.dropPrefixes, + } + + // 如果是第0层 对齐单独填充处理 + if l == 0 { + cd.nextLevel = lm.levels[p.t.baseLevel] + if !lm.fillTablesL0(&cd) { + return utils.ErrFillTables + } + } else { + cd.nextLevel = cd.thisLevel + // 如果不是最后一层,则压缩到下一层即可 + if !cd.thisLevel.isLastLevel() { + cd.nextLevel = lm.levels[l+1] + } + if !lm.fillTables(&cd) { + return utils.ErrFillTables + } + } + // 完成合并后 从合并状态中删除 + defer lm.compactState.delete(cd) // Remove the ranges from compaction status. + + // 执行合并计划 + if err := lm.runCompactDef(id, l, cd); err != nil { + // This compaction couldn't be done successfully. + log.Printf("[Compactor: %d] LOG Compact FAILED with error: %+v: %+v", id, err, cd) + return err + } + + log.Printf("[Compactor: %d] Compaction for level: %d DONE", id, cd.thisLevel.levelNum) + return nil +} + +// pickCompactLevel 选择合适的level执行合并,返回判断的优先级 +func (lm *levelManager) pickCompactLevels() (prios []compactionPriority) { + t := lm.levelTargets() + addPriority := func(level int, score float64) { + pri := compactionPriority{ + level: level, + score: score, + adjusted: score, + t: t, + } + prios = append(prios, pri) + } + + // 根据l0表的table数量来对压缩提权 + addPriority(0, float64(lm.levels[0].numTables())/float64(lm.opt.NumLevelZeroTables)) + + // 非l0 层都根据大小计算优先级 + for i := 1; i < len(lm.levels); i++ { + // 处于压缩状态的sst 不能计算在内 + delSize := lm.compactState.delSize(i) + l := lm.levels[i] + sz := l.getTotalSize() - delSize + // score的计算是 扣除正在合并的表后的尺寸与目标sz的比值 + addPriority(i, float64(sz)/float64(t.targetSz[i])) + } + utils.CondPanic(len(prios) != len(lm.levels), errors.New("[pickCompactLevels] len(prios) != len(lm.levels)")) + + // 调整得分 + var prevLevel int + for level := t.baseLevel; level < len(lm.levels); level++ { + if prios[prevLevel].adjusted >= 1 { + // 避免过大的得分 + const minScore = 0.01 + if prios[level].score >= minScore { + 
prios[prevLevel].adjusted /= prios[level].adjusted + } else { + prios[prevLevel].adjusted /= minScore + } + } + prevLevel = level + } + + // 仅选择得分大于1的压缩内容,并且允许l0到l0的特殊压缩,为了提升查询性能允许l0层独自压缩 + out := prios[:0] + for _, p := range prios[:len(prios)-1] { + if p.score >= 1.0 { + out = append(out, p) + } + } + prios = out + + // 按优先级排序 + sort.Slice(prios, func(i, j int) bool { + return prios[i].adjusted > prios[j].adjusted + }) + return prios +} +func (lm *levelManager) lastLevel() *levelHandler { + return lm.levels[len(lm.levels)-1] +} + +// levelTargets +func (lm *levelManager) levelTargets() targets { + adjust := func(sz int64) int64 { + if sz < lm.opt.BaseLevelSize { + return lm.opt.BaseLevelSize + } + return sz + } + + // 初始化默认都是最大层级 + t := targets{ + targetSz: make([]int64, len(lm.levels)), + fileSz: make([]int64, len(lm.levels)), + } + // 从最后一个level开始计算 + dbSize := lm.lastLevel().getTotalSize() + for i := len(lm.levels) - 1; i > 0; i-- { + leveTargetSize := adjust(dbSize) + t.targetSz[i] = leveTargetSize + // 如果当前的level没有达到合并的要求 + if t.baseLevel == 0 && leveTargetSize <= lm.opt.BaseLevelSize { + t.baseLevel = i + } + dbSize /= int64(lm.opt.LevelSizeMultiplier) + } + + tsz := lm.opt.BaseTableSize + for i := 0; i < len(lm.levels); i++ { + if i == 0 { + // l0选择memtable的size作为文件的尺寸 + t.fileSz[i] = lm.opt.MemTableSize + } else if i <= t.baseLevel { + t.fileSz[i] = tsz + } else { + tsz *= int64(lm.opt.TableSizeMultiplier) + t.fileSz[i] = tsz + } + } + + // 找到最后一个空level作为目标level实现跨level归并,减少写放大 + for i := t.baseLevel + 1; i < len(lm.levels)-1; i++ { + if lm.levels[i].getTotalSize() > 0 { + break + } + t.baseLevel = i + } + + // 如果存在断层,则目标level++ + b := t.baseLevel + lvl := lm.levels + if b < len(lvl)-1 && lvl[b].getTotalSize() == 0 && lvl[b+1].getTotalSize() < t.targetSz[b+1] { + t.baseLevel++ + } + return t +} + +type thisAndNextLevelRLocked struct{} + +func (lm *levelManager) fillTables(cd *compactDef) bool { + cd.lockLevels() + defer cd.unlockLevels() + + tables := 
make([]*table, cd.thisLevel.numTables()) + copy(tables, cd.thisLevel.tables) + if len(tables) == 0 { + return false + } + // We're doing a maxLevel to maxLevel compaction. Pick tables based on the stale data size. + if cd.thisLevel.isLastLevel() { + return lm.fillMaxLevelTables(tables, cd) + } + // We pick tables, so we compact older tables first. This is similar to + // kOldestLargestSeqFirst in RocksDB. + lm.sortByHeuristic(tables, cd) + + for _, t := range tables { + cd.thisSize = t.Size() + cd.thisRange = getKeyRange(t) + // 如果被压缩过了,则什么都不需要做 + if lm.compactState.overlapsWith(cd.thisLevel.levelNum, cd.thisRange) { + continue + } + cd.top = []*table{t} + left, right := cd.nextLevel.overlappingTables(levelHandlerRLocked{}, cd.thisRange) + + cd.bot = make([]*table, right-left) + copy(cd.bot, cd.nextLevel.tables[left:right]) + + if len(cd.bot) == 0 { + cd.bot = []*table{} + cd.nextRange = cd.thisRange + if !lm.compactState.compareAndAdd(thisAndNextLevelRLocked{}, *cd) { + continue + } + return true + } + cd.nextRange = getKeyRange(cd.bot...) + + if lm.compactState.overlapsWith(cd.nextLevel.levelNum, cd.nextRange) { + continue + } + if !lm.compactState.compareAndAdd(thisAndNextLevelRLocked{}, *cd) { + continue + } + return true + } + return false +} + +// compact older tables first. +func (lm *levelManager) sortByHeuristic(tables []*table, cd *compactDef) { + if len(tables) == 0 || cd.nextLevel == nil { + return + } + + // Sort tables by max version. This is what RocksDB does. + sort.Slice(tables, func(i, j int) bool { + return tables[i].ss.Indexs().MaxVersion < tables[j].ss.Indexs().MaxVersion + }) +} +func (lm *levelManager) runCompactDef(id, l int, cd compactDef) (err error) { + if len(cd.t.fileSz) == 0 { + return errors.New("Filesizes cannot be zero. 
Targets are not set") + } + timeStart := time.Now() + + thisLevel := cd.thisLevel + nextLevel := cd.nextLevel + + utils.CondPanic(len(cd.splits) != 0, errors.New("len(cd.splits) != 0")) + if thisLevel == nextLevel { + // l0 to l0 和 lmax to lmax 不做特殊处理 + } else { + lm.addSplits(&cd) + } + // 追加一个空的 + if len(cd.splits) == 0 { + cd.splits = append(cd.splits, keyRange{}) + } + + newTables, decr, err := lm.compactBuildTables(l, cd) + if err != nil { + return err + } + defer func() { + // Only assign to err, if it's not already nil. + if decErr := decr(); err == nil { + err = decErr + } + }() + changeSet := buildChangeSet(&cd, newTables) + + // 删除之前先更新manifest文件 + if err := lm.manifestFile.AddChanges(changeSet.Changes); err != nil { + return err + } + + if err := nextLevel.replaceTables(cd.bot, newTables); err != nil { + return err + } + defer decrRefs(cd.top) + if err := thisLevel.deleteTables(cd.top); err != nil { + return err + } + + from := append(tablesToString(cd.top), tablesToString(cd.bot)...) 
+ to := tablesToString(newTables) + if dur := time.Since(timeStart); dur > 2*time.Second { + var expensive string + if dur > time.Second { + expensive = " [E]" + } + fmt.Printf("[%d]%s LOG Compact %d->%d (%d, %d -> %d tables with %d splits)."+ + " [%s] -> [%s], took %v\n", + id, expensive, thisLevel.levelNum, nextLevel.levelNum, len(cd.top), len(cd.bot), + len(newTables), len(cd.splits), strings.Join(from, " "), strings.Join(to, " "), + dur.Round(time.Millisecond)) + } + return nil +} + +// tablesToString +func tablesToString(tables []*table) []string { + var res []string + for _, t := range tables { + res = append(res, fmt.Sprintf("%05d", t.fid)) + } + res = append(res, ".") + return res +} + +// buildChangeSet _ +func buildChangeSet(cd *compactDef, newTables []*table) pb.ManifestChangeSet { + changes := []*pb.ManifestChange{} + for _, table := range newTables { + changes = append(changes, newCreateChange(table.fid, cd.nextLevel.levelNum)) + } + for _, table := range cd.top { + changes = append(changes, newDeleteChange(table.fid)) + } + for _, table := range cd.bot { + changes = append(changes, newDeleteChange(table.fid)) + } + return pb.ManifestChangeSet{Changes: changes} +} + +// +func newDeleteChange(id uint64) *pb.ManifestChange { + return &pb.ManifestChange{ + Id: id, + Op: pb.ManifestChange_DELETE, + } +} + +// newCreateChange +func newCreateChange(id uint64, level int) *pb.ManifestChange { + return &pb.ManifestChange{ + Id: id, + Op: pb.ManifestChange_CREATE, + Level: uint32(level), + } +} + +// compactBuildTables 合并两个层的sst文件 +func (lm *levelManager) compactBuildTables(lev int, cd compactDef) ([]*table, func() error, error) { + + topTables := cd.top + botTables := cd.bot + iterOpt := &utils.Options{ + IsAsc: true, + } + //numTables := int64(len(topTables) + len(botTables)) + newIterator := func() []utils.Iterator { + // Create iterators across all the tables involved first. 
+ var iters []utils.Iterator + switch { + case lev == 0: + iters = append(iters, iteratorsReversed(topTables, iterOpt)...) + case len(topTables) > 0: + iters = []utils.Iterator{topTables[0].NewIterator(iterOpt)} + } + return append(iters, NewConcatIterator(botTables, iterOpt)) + } + + // 开始并行执行压缩过程 + res := make(chan *table, 3) + inflightBuilders := utils.NewThrottle(8 + len(cd.splits)) + for _, kr := range cd.splits { + // Initiate Do here so we can register the goroutines for buildTables too. + if err := inflightBuilders.Do(); err != nil { + return nil, nil, fmt.Errorf("cannot start subcompaction: %+v", err) + } + // 开启一个协程去处理子压缩 + go func(kr keyRange) { + defer inflightBuilders.Done(nil) + it := NewMergeIterator(newIterator(), false) + defer it.Close() + lm.subcompact(it, kr, cd, inflightBuilders, res) + }(kr) + } + + // mapreduce的方式收集table的句柄 + var newTables []*table + var wg sync.WaitGroup + wg.Add(1) + go func() { + defer wg.Done() + for t := range res { + newTables = append(newTables, t) + } + }() + + // 在这里等待所有的压缩过程完成 + err := inflightBuilders.Finish() + // channel 资源回收 + close(res) + // 等待所有的builder刷到磁盘 + wg.Wait() + + if err == nil { + // 同步刷盘,保证数据一定落盘 + err = utils.SyncDir(lm.opt.WorkDir) + } + + if err != nil { + // 如果出现错误,则删除索引新创建的文件 + _ = decrRefs(newTables) + return nil, nil, fmt.Errorf("while running compactions for: %+v, %v", cd, err) + } + + sort.Slice(newTables, func(i, j int) bool { + return utils.CompareKeys(newTables[i].ss.MaxKey(), newTables[j].ss.MaxKey()) < 0 + }) + return newTables, func() error { return decrRefs(newTables) }, nil +} + +// 并行的运行子压缩情况 +func (lm *levelManager) addSplits(cd *compactDef) { + cd.splits = cd.splits[:0] + + // Let's say we have 10 tables in cd.bot and min width = 3. Then, we'll pick + // 0, 1, 2 (pick), 3, 4, 5 (pick), 6, 7, 8 (pick), 9 (pick, because last table). + // This gives us 4 picks for 10 tables. + // In an edge case, 142 tables in bottom led to 48 splits. 
That's too many splits, because it + // then uses up a lot of memory for table builder. + // We should keep it so we have at max 5 splits. + width := int(math.Ceil(float64(len(cd.bot)) / 5.0)) + if width < 3 { + width = 3 + } + skr := cd.thisRange + skr.extend(cd.nextRange) + + addRange := func(right []byte) { + skr.right = utils.Copy(right) + cd.splits = append(cd.splits, skr) + skr.left = skr.right + } + + for i, t := range cd.bot { + // last entry in bottom table. + if i == len(cd.bot)-1 { + addRange([]byte{}) + return + } + if i%width == width-1 { + // 设置最大值为右区间 + right := utils.KeyWithTs(utils.ParseKey(t.ss.MaxKey()), math.MaxUint64) + addRange(right) + } + } +} + +// sortByStaleData 对表中陈旧数据的数量对sst文件进行排序 +func (lm *levelManager) sortByStaleDataSize(tables []*table, cd *compactDef) { + if len(tables) == 0 || cd.nextLevel == nil { + return + } + // TODO 统计一个 sst文件中陈旧数据的数量,涉及对存储格式的修改 + sort.Slice(tables, func(i, j int) bool { + return tables[i].StaleDataSize() > tables[j].StaleDataSize() + }) +} + +// max level 和 max level 的压缩 +func (lm *levelManager) fillMaxLevelTables(tables []*table, cd *compactDef) bool { + sortedTables := make([]*table, len(tables)) + copy(sortedTables, tables) + lm.sortByStaleDataSize(sortedTables, cd) + + if len(sortedTables) > 0 && sortedTables[0].StaleDataSize() == 0 { + // This is a maxLevel to maxLevel compaction and we don't have any stale data. + return false + } + cd.bot = []*table{} + collectBotTables := func(t *table, needSz int64) { + totalSize := t.Size() + + j := sort.Search(len(tables), func(i int) bool { + return utils.CompareKeys(tables[i].ss.MinKey(), t.ss.MinKey()) >= 0 + }) + utils.CondPanic(tables[j].fid != t.fid, errors.New("tables[j].ID() != t.ID()")) + j++ + // Collect tables until we reach the the required size. 
+ for j < len(tables) { + newT := tables[j] + totalSize += newT.Size() + + if totalSize >= needSz { + break + } + cd.bot = append(cd.bot, newT) + cd.nextRange.extend(getKeyRange(newT)) + j++ + } + } + now := time.Now() + for _, t := range sortedTables { + if now.Sub(*t.GetCreatedAt()) < time.Hour { + // Just created it an hour ago. Don't pick for compaction. + continue + } + // If the stale data size is less than 10 MB, it might not be worth + // rewriting the table. Skip it. + if t.StaleDataSize() < 10<<20 { + continue + } + + cd.thisSize = t.Size() + cd.thisRange = getKeyRange(t) + // Set the next range as the same as the current range. If we don't do + // this, we won't be able to run more than one max level compactions. + cd.nextRange = cd.thisRange + // If we're already compacting this range, don't do anything. + if lm.compactState.overlapsWith(cd.thisLevel.levelNum, cd.thisRange) { + continue + } + + // Found a valid table! + cd.top = []*table{t} + + needFileSz := cd.t.fileSz[cd.thisLevel.levelNum] + // 如果合并的sst size需要的文件尺寸直接终止 + if t.Size() >= needFileSz { + break + } + // TableSize is less than what we want. Collect more tables for compaction. + // If the level has multiple small tables, we collect all of them + // together to form a bigger table. 
+ collectBotTables(t, needFileSz) + if !lm.compactState.compareAndAdd(thisAndNextLevelRLocked{}, *cd) { + cd.bot = cd.bot[:0] + cd.nextRange = keyRange{} + continue + } + return true + } + if len(cd.top) == 0 { + return false + } + + return lm.compactState.compareAndAdd(thisAndNextLevelRLocked{}, *cd) +} + +// fillTablesL0 先尝试从l0 到lbase的压缩,如果失败则对l0自己压缩 +func (lm *levelManager) fillTablesL0(cd *compactDef) bool { + if ok := lm.fillTablesL0ToLbase(cd); ok { + return true + } + return lm.fillTablesL0ToL0(cd) +} + +func (lm *levelManager) fillTablesL0ToLbase(cd *compactDef) bool { + if cd.nextLevel.levelNum == 0 { + utils.Panic(errors.New("base level can be zero")) + } + // 如果优先级低于1 则不执行 + if cd.p.adjusted > 0.0 && cd.p.adjusted < 1.0 { + // Do not compact to Lbase if adjusted score is less than 1.0. + return false + } + cd.lockLevels() + defer cd.unlockLevels() + + top := cd.thisLevel.tables + if len(top) == 0 { + return false + } + + var out []*table + var kr keyRange + // cd.top[0] 是最老的文件,从最老的文件开始 + for _, t := range top { + dkr := getKeyRange(t) + if kr.overlapsWith(dkr) { + out = append(out, t) + kr.extend(dkr) + } else { + // 如果有任何一个不重合的区间存在则直接终止 + break + } + } + // 获取目标range list 的全局 range 对象 + cd.thisRange = getKeyRange(out...) + cd.top = out + + left, right := cd.nextLevel.overlappingTables(levelHandlerRLocked{}, cd.thisRange) + cd.bot = make([]*table, right-left) + copy(cd.bot, cd.nextLevel.tables[left:right]) + + if len(cd.bot) == 0 { + cd.nextRange = cd.thisRange + } else { + cd.nextRange = getKeyRange(cd.bot...) + } + return lm.compactState.compareAndAdd(thisAndNextLevelRLocked{}, *cd) +} + +// fillTablesL0ToL0 l0到l0压缩 +func (lm *levelManager) fillTablesL0ToL0(cd *compactDef) bool { + if cd.compactorId != 0 { + // 只要0号压缩处理器可以执行,避免l0tol0的资源竞争 + return false + } + + cd.nextLevel = lm.levels[0] + cd.nextRange = keyRange{} + cd.bot = nil + + // TODO 这里是否会导致死锁? 
+ utils.CondPanic(cd.thisLevel.levelNum != 0, errors.New("cd.thisLevel.levelNum != 0")) + utils.CondPanic(cd.nextLevel.levelNum != 0, errors.New("cd.nextLevel.levelNum != 0")) + lm.levels[0].RLock() + defer lm.levels[0].RUnlock() + + lm.compactState.Lock() + defer lm.compactState.Unlock() + + top := cd.thisLevel.tables + var out []*table + now := time.Now() + for _, t := range top { + if t.Size() >= 2*cd.t.fileSz[0] { + // 在L0 to L0 的压缩过程中,不要对过大的sst文件压缩,这会造成性能抖动 + continue + } + if now.Sub(*t.GetCreatedAt()) < 10*time.Second { + // 如果sst的创建时间不足10s 也不要回收 + continue + } + // 如果当前的sst 已经在压缩状态 也应该忽略 + if _, beingCompacted := lm.compactState.tables[t.fid]; beingCompacted { + continue + } + out = append(out, t) + } + + if len(out) < 4 { + // 满足条件的sst小于4个那就不压缩了 + return false + } + cd.thisRange = infRange + cd.top = out + + // 在这个过程中避免任何l0到其他层的合并 + thisLevel := lm.compactState.levels[cd.thisLevel.levelNum] + thisLevel.ranges = append(thisLevel.ranges, infRange) + for _, t := range out { + lm.compactState.tables[t.fid] = struct{}{} + } + + // l0 to l0的压缩最终都会压缩为一个文件,这大大减少了l0层文件数量,减少了读放大 + cd.t.fileSz[0] = math.MaxUint32 + return true +} + +// getKeyRange 返回一组sst的区间合并后的最大与最小值 +func getKeyRange(tables ...*table) keyRange { + if len(tables) == 0 { + return keyRange{} + } + minKey := tables[0].ss.MinKey() + maxKey := tables[0].ss.MaxKey() + for i := 1; i < len(tables); i++ { + if utils.CompareKeys(tables[i].ss.MinKey(), minKey) < 0 { + minKey = tables[i].ss.MinKey() + } + if utils.CompareKeys(tables[i].ss.MaxKey(), maxKey) > 0 { + maxKey = tables[i].ss.MaxKey() + } + } + + // We pick all the versions of the smallest and the biggest key. Note that version zero would + // be the rightmost key, considering versions are default sorted in descending order. 
+ return keyRange{ + left: utils.KeyWithTs(utils.ParseKey(minKey), math.MaxUint64), + right: utils.KeyWithTs(utils.ParseKey(maxKey), 0), + } +} + +func iteratorsReversed(th []*table, opt *utils.Options) []utils.Iterator { + out := make([]utils.Iterator, 0, len(th)) + for i := len(th) - 1; i >= 0; i-- { + // This will increment the reference of the table handler. + out = append(out, th[i].NewIterator(opt)) + } + return out +} +func (lm *levelManager) updateDiscardStats(discardStats map[uint32]int64) { + select { + case *lm.lsm.option.DiscardStatsCh <- discardStats: + default: + } +} + +// 真正执行并行压缩的子压缩文件 +func (lm *levelManager) subcompact(it utils.Iterator, kr keyRange, cd compactDef, + inflightBuilders *utils.Throttle, res chan<- *table) { + var lastKey []byte + // 更新 discardStats + discardStats := make(map[uint32]int64) + defer func() { + lm.updateDiscardStats(discardStats) + }() + updateStats := func(e *utils.Entry) { + if e.Meta&utils.BitValuePointer > 0 { + var vp utils.ValuePtr + vp.Decode(e.Value) + discardStats[vp.Fid] += int64(vp.Len) + } + } + addKeys := func(builder *tableBuilder) { + var tableKr keyRange + for ; it.Valid(); it.Next() { + key := it.Item().Entry().Key + //version := utils.ParseTs(key) + isExpired := isDeletedOrExpired(0, it.Item().Entry().ExpiresAt) + if !utils.SameKey(key, lastKey) { + // 如果迭代器返回的key大于当前key的范围就不用执行了 + if len(kr.right) > 0 && utils.CompareKeys(key, kr.right) >= 0 { + break + } + if builder.ReachedCapacity() { + // 如果超过预估的sst文件大小,则直接结束 + break + } + // 把当前的key变为 lastKey + lastKey = utils.SafeCopy(lastKey, key) + //umVersions = 0 + // 如果左边界没有,则当前key给到左边界 + if len(tableKr.left) == 0 { + tableKr.left = utils.SafeCopy(tableKr.left, key) + } + // 更新右边界 + tableKr.right = lastKey + } + // TODO 这里要区分值的指针 + // 判断是否是过期内容,是的话就删除 + switch { + case isExpired: + updateStats(it.Item().Entry()) + builder.AddStaleKey(it.Item().Entry()) + default: + builder.AddKey(it.Item().Entry()) + } + } + } // End of function: addKeys + + //如果 key range 
left还存在 则seek到这里 说明遍历中途停止了 + if len(kr.left) > 0 { + it.Seek(kr.left) + } else { + // + it.Rewind() + } + for it.Valid() { + key := it.Item().Entry().Key + if len(kr.right) > 0 && utils.CompareKeys(key, kr.right) >= 0 { + break + } + // 拼装table创建的参数 + // TODO 这里可能要大改,对open table的参数复制一份opt + builder := newTableBuilerWithSSTSize(lm.opt, cd.t.fileSz[cd.nextLevel.levelNum]) + + // This would do the iteration and add keys to builder. + addKeys(builder) + + // It was true that it.Valid() at least once in the loop above, which means we + // called Add() at least once, and builder is not Empty(). + if builder.empty() { + // Cleanup builder resources: + builder.finish() + builder.Close() + continue + } + if err := inflightBuilders.Do(); err != nil { + // Can't return from here, until I decrRef all the tables that I built so far. + break + } + // 充分发挥 ssd的并行 写入特性 + go func(builder *tableBuilder) { + defer inflightBuilders.Done(nil) + defer builder.Close() + var tbl *table + newFID := atomic.AddUint64(&lm.maxFID, 1) // compact的时候是没有memtable的,这里自增maxFID即可。 + // TODO 这里的sst文件需要根据level大小变化 + sstName := utils.FileNameSSTable(lm.opt.WorkDir, newFID) + tbl = openTable(lm, sstName, builder) + if tbl == nil { + return + } + res <- tbl + }(builder) + } +} + +// checkOverlap 检查是否与下一层存在重合 +func (lm *levelManager) checkOverlap(tables []*table, lev int) bool { + kr := getKeyRange(tables...) + for i, lh := range lm.levels { + if i < lev { // Skip upper levels. 
+ continue + } + lh.RLock() + left, right := lh.overlappingTables(levelHandlerRLocked{}, kr) + lh.RUnlock() + if right-left > 0 { + return true + } + } + return false +} + +// 判断是否过期 是可删除 +func isDeletedOrExpired(meta byte, expiresAt uint64) bool { + if expiresAt == 0 { + return false + } + return expiresAt <= uint64(time.Now().Unix()) +} + +// compactStatus +type compactStatus struct { + sync.RWMutex + levels []*levelCompactStatus + tables map[uint64]struct{} +} + +func (lsm *LSM) newCompactStatus() *compactStatus { + cs := &compactStatus{ + levels: make([]*levelCompactStatus, 0), + tables: make(map[uint64]struct{}), + } + for i := 0; i < lsm.option.MaxLevelNum; i++ { + cs.levels = append(cs.levels, &levelCompactStatus{}) + } + return cs +} + +func (cs *compactStatus) overlapsWith(level int, this keyRange) bool { + cs.RLock() + defer cs.RUnlock() + + thisLevel := cs.levels[level] + return thisLevel.overlapsWith(this) +} + +func (cs *compactStatus) delSize(l int) int64 { + cs.RLock() + defer cs.RUnlock() + return cs.levels[l].delSize +} + +func (cs *compactStatus) delete(cd compactDef) { + cs.Lock() + defer cs.Unlock() + + tl := cd.thisLevel.levelNum + + thisLevel := cs.levels[cd.thisLevel.levelNum] + nextLevel := cs.levels[cd.nextLevel.levelNum] + + thisLevel.delSize -= cd.thisSize + found := thisLevel.remove(cd.thisRange) + // The following check makes sense only if we're compacting more than one + // table. In case of the max level, we might rewrite a single table to + // remove stale data. 
+ if cd.thisLevel != cd.nextLevel && !cd.nextRange.isEmpty() { + found = nextLevel.remove(cd.nextRange) && found + } + + if !found { + this := cd.thisRange + next := cd.nextRange + fmt.Printf("Looking for: %s in this level %d.\n", this, tl) + fmt.Printf("This Level:\n%s\n", thisLevel.debug()) + fmt.Println() + fmt.Printf("Looking for: %s in next level %d.\n", next, cd.nextLevel.levelNum) + fmt.Printf("Next Level:\n%s\n", nextLevel.debug()) + log.Fatal("keyRange not found") + } + for _, t := range append(cd.top, cd.bot...) { + _, ok := cs.tables[t.fid] + utils.CondPanic(!ok, fmt.Errorf("cs.tables is nil")) + delete(cs.tables, t.fid) + } +} + +func (cs *compactStatus) compareAndAdd(_ thisAndNextLevelRLocked, cd compactDef) bool { + cs.Lock() + defer cs.Unlock() + + tl := cd.thisLevel.levelNum + utils.CondPanic(tl >= len(cs.levels), fmt.Errorf("Got level %d. Max levels: %d", tl, len(cs.levels))) + thisLevel := cs.levels[cd.thisLevel.levelNum] + nextLevel := cs.levels[cd.nextLevel.levelNum] + + if thisLevel.overlapsWith(cd.thisRange) { + return false + } + if nextLevel.overlapsWith(cd.nextRange) { + return false + } + // Check whether this level really needs compaction or not. Otherwise, we'll end up + // running parallel compactions for the same level. + // Update: We should not be checking size here. Compaction priority already did the size checks. + // Here we should just be executing the wish of others. + + thisLevel.ranges = append(thisLevel.ranges, cd.thisRange) + nextLevel.ranges = append(nextLevel.ranges, cd.nextRange) + thisLevel.delSize += cd.thisSize + for _, t := range append(cd.top, cd.bot...) 
{ + cs.tables[t.fid] = struct{}{} + } + return true +} + +// levelCompactStatus +type levelCompactStatus struct { + ranges []keyRange + delSize int64 +} + +func (lcs *levelCompactStatus) overlapsWith(dst keyRange) bool { + for _, r := range lcs.ranges { + if r.overlapsWith(dst) { + return true + } + } + return false +} +func (lcs *levelCompactStatus) remove(dst keyRange) bool { + final := lcs.ranges[:0] + var found bool + for _, r := range lcs.ranges { + if !r.equals(dst) { + final = append(final, r) + } else { + found = true + } + } + lcs.ranges = final + return found +} + +func (lcs *levelCompactStatus) debug() string { + var b bytes.Buffer + for _, r := range lcs.ranges { + b.WriteString(r.String()) + } + return b.String() +} + +// keyRange +type keyRange struct { + left []byte + right []byte + inf bool + size int64 // size is used for Key splits. +} + +func (r keyRange) isEmpty() bool { + return len(r.left) == 0 && len(r.right) == 0 && !r.inf +} + +var infRange = keyRange{inf: true} + +func (r keyRange) String() string { + return fmt.Sprintf("[left=%x, right=%x, inf=%v]", r.left, r.right, r.inf) +} + +func (r keyRange) equals(dst keyRange) bool { + return bytes.Equal(r.left, dst.left) && + bytes.Equal(r.right, dst.right) && + r.inf == dst.inf +} + +func (r *keyRange) extend(kr keyRange) { + // TODO(ibrahim): Is this needed? + if kr.isEmpty() { + return + } + if r.isEmpty() { + *r = kr + } + if len(r.left) == 0 || utils.CompareKeys(kr.left, r.left) < 0 { + r.left = kr.left + } + if len(r.right) == 0 || utils.CompareKeys(kr.right, r.right) > 0 { + r.right = kr.right + } + if kr.inf { + r.inf = true + } +} + +func (r keyRange) overlapsWith(dst keyRange) bool { + // Empty keyRange always overlaps. + if r.isEmpty() { + return true + } + // TODO(ibrahim): Do you need this? + // Empty dst doesn't overlap with anything. + if dst.isEmpty() { + return false + } + if r.inf || dst.inf { + return true + } + + // [dst.left, dst.right] ... 
[r.left, r.right] + // If my left is greater than dst right, we have no overlap. + if utils.CompareKeys(r.left, dst.right) > 0 { + return false + } + // [r.left, r.right] ... [dst.left, dst.right] + // If my right is less than dst left, we have no overlap. + if utils.CompareKeys(r.right, dst.left) < 0 { + return false + } + // We have overlap. + return true +} diff --git a/lsm/iterator.go b/lsm/iterator.go index e6010ec..dcb515a 100644 --- a/lsm/iterator.go +++ b/lsm/iterator.go @@ -1,32 +1,49 @@ +// Copyright 2021 logicrec Project Authors +// +// Licensed under the Apache License, Version 2.0 (the "License") +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ package lsm import ( - "github.com/hardcore-os/corekv/iterator" - "github.com/hardcore-os/corekv/utils/codec" + "bytes" + "fmt" + "sort" + + "github.com/hardcore-os/corekv/utils" ) type Iterator struct { - it iterator.Item - iters []iterator.Iterator + it Item + iters []utils.Iterator } type Item struct { - e *codec.Entry + e *utils.Entry } -func (it *Item) Entry() *codec.Entry { +func (it *Item) Entry() *utils.Entry { return it.e } // 创建迭代器 -func (lsm *LSM) NewIterator(opt *iterator.Options) iterator.Iterator { +func (lsm *LSM) NewIterators(opt *utils.Options) []utils.Iterator { iter := &Iterator{} - iter.iters = make([]iterator.Iterator, 0) + iter.iters = make([]utils.Iterator, 0) iter.iters = append(iter.iters, lsm.memTable.NewIterator(opt)) for _, imm := range lsm.immutables { iter.iters = append(iter.iters, imm.NewIterator(opt)) } - iter.iters = append(iter.iters, lsm.levels.NewIterator(opt)) - return iter + iter.iters = append(iter.iters, lsm.levels.iterators()...) + return iter.iters } func (iter *Iterator) Next() { iter.iters[0].Next() @@ -37,19 +54,22 @@ func (iter *Iterator) Valid() bool { func (iter *Iterator) Rewind() { iter.iters[0].Rewind() } -func (iter *Iterator) Item() iterator.Item { +func (iter *Iterator) Item() utils.Item { return iter.iters[0].Item() } func (iter *Iterator) Close() error { return nil } +func (iter *Iterator) Seek(key []byte) { +} + // 内存表迭代器 type memIterator struct { - innerIter iterator.Iterator + innerIter utils.Iterator } -func (m *memTable) NewIterator(opt *iterator.Options) iterator.Iterator { +func (m *memTable) NewIterator(opt *utils.Options) utils.Iterator { return &memIterator{innerIter: m.sl.NewSkipListIterator()} } func (iter *memIterator) Next() { @@ -61,21 +81,23 @@ func (iter *memIterator) Valid() bool { func (iter *memIterator) Rewind() { iter.innerIter.Rewind() } -func (iter *memIterator) Item() iterator.Item { +func (iter *memIterator) Item() utils.Item { return iter.innerIter.Item() } func (iter 
*memIterator) Close() error { return iter.innerIter.Close() } +func (iter *memIterator) Seek(key []byte) { +} // levelManager上的迭代器 type levelIterator struct { - it *iterator.Item + it *utils.Item iters []*Iterator } -func (lm *levelManager) NewIterator(options *iterator.Options) iterator.Iterator { - return &levelIterator{} +func (lm *levelManager) NewIterators(options *utils.Options) []utils.Iterator { + return lm.iterators() } func (iter *levelIterator) Next() { } @@ -85,9 +107,333 @@ func (iter *levelIterator) Valid() bool { func (iter *levelIterator) Rewind() { } -func (iter *levelIterator) Item() iterator.Item { +func (iter *levelIterator) Item() utils.Item { return &Item{} } func (iter *levelIterator) Close() error { return nil } + +func (iter *levelIterator) Seek(key []byte) { +} + +// ConcatIterator 将table 数组链接成一个迭代器,这样迭代效率更高 +type ConcatIterator struct { + idx int // Which iterator is active now. + cur utils.Iterator + iters []utils.Iterator // Corresponds to tables. + tables []*table // Disregarding reversed, this is in ascending order. + options *utils.Options // Valid options are REVERSED and NOCACHE. +} + +// NewConcatIterator creates a new concatenated iterator +func NewConcatIterator(tbls []*table, opt *utils.Options) *ConcatIterator { + iters := make([]utils.Iterator, len(tbls)) + return &ConcatIterator{ + options: opt, + iters: iters, + tables: tbls, + idx: -1, // Not really necessary because s.it.Valid()=false, but good to have. 
+ } +} + +func (s *ConcatIterator) setIdx(idx int) { + s.idx = idx + if idx < 0 || idx >= len(s.iters) { + s.cur = nil + return + } + if s.iters[idx] == nil { + s.iters[idx] = s.tables[idx].NewIterator(s.options) + } + s.cur = s.iters[s.idx] +} + +// Rewind implements Interface +func (s *ConcatIterator) Rewind() { + if len(s.iters) == 0 { + return + } + if !s.options.IsAsc { + s.setIdx(0) + } else { + s.setIdx(len(s.iters) - 1) + } + s.cur.Rewind() +} + +// Valid implements y.Interface +func (s *ConcatIterator) Valid() bool { + return s.cur != nil && s.cur.Valid() +} + +// Item _ +func (s *ConcatIterator) Item() utils.Item { + return s.cur.Item() +} + +// Seek brings us to element >= key if reversed is false. Otherwise, <= key. +func (s *ConcatIterator) Seek(key []byte) { + var idx int + if s.options.IsAsc { + idx = sort.Search(len(s.tables), func(i int) bool { + return utils.CompareKeys(s.tables[i].ss.MaxKey(), key) >= 0 + }) + } else { + n := len(s.tables) + idx = n - 1 - sort.Search(n, func(i int) bool { + return utils.CompareKeys(s.tables[n-1-i].ss.MinKey(), key) <= 0 + }) + } + if idx >= len(s.tables) || idx < 0 { + s.setIdx(-1) + return + } + // For reversed=false, we know s.tables[i-1].Biggest() < key. Thus, the + // previous table cannot possibly contain key. + s.setIdx(idx) + s.cur.Seek(key) +} + +// Next advances our concat iterator. +func (s *ConcatIterator) Next() { + s.cur.Next() + if s.cur.Valid() { + // Nothing to do. Just stay with the current table. + return + } + for { // In case there are empty tables. + if !s.options.IsAsc { + s.setIdx(s.idx + 1) + } else { + s.setIdx(s.idx - 1) + } + if s.cur == nil { + // End of list. Valid will become false. + return + } + s.cur.Rewind() + if s.cur.Valid() { + break + } + } +} + +// Close implements y.Interface. 
+func (s *ConcatIterator) Close() error { + for _, it := range s.iters { + if it == nil { + continue + } + if err := it.Close(); err != nil { + return fmt.Errorf("ConcatIterator:%+v", err) + } + } + return nil +} + +// MergeIterator 多路合并迭代器 +// NOTE: MergeIterator owns the array of iterators and is responsible for closing them. +type MergeIterator struct { + left node + right node + small *node + + curKey []byte + reverse bool +} + +type node struct { + valid bool + entry *utils.Entry + iter utils.Iterator + + // The two iterators are type asserted from `y.Iterator`, used to inline more function calls. + // Calling functions on concrete types is much faster (about 25-30%) than calling the + // interface's function. + merge *MergeIterator + concat *ConcatIterator +} + +func (n *node) setIterator(iter utils.Iterator) { + n.iter = iter + // It's okay if the type assertion below fails and n.merge/n.concat are set to nil. + // We handle the nil values of merge and concat in all the methods. + n.merge, _ = iter.(*MergeIterator) + n.concat, _ = iter.(*ConcatIterator) +} + +func (n *node) setKey() { + switch { + case n.merge != nil: + n.valid = n.merge.small.valid + if n.valid { + n.entry = n.merge.small.entry + } + case n.concat != nil: + n.valid = n.concat.Valid() + if n.valid { + n.entry = n.concat.Item().Entry() + } + default: + n.valid = n.iter.Valid() + if n.valid { + n.entry = n.iter.Item().Entry() + } + } +} + +func (n *node) next() { + switch { + case n.merge != nil: + n.merge.Next() + case n.concat != nil: + n.concat.Next() + default: + n.iter.Next() + } + n.setKey() +} + +func (n *node) rewind() { + n.iter.Rewind() + n.setKey() +} + +func (n *node) seek(key []byte) { + n.iter.Seek(key) + n.setKey() +} + +func (mi *MergeIterator) fix() { + if !mi.bigger().valid { + return + } + if !mi.small.valid { + mi.swapSmall() + return + } + cmp := utils.CompareKeys(mi.small.entry.Key, mi.bigger().entry.Key) + switch { + case cmp == 0: // Both the keys are equal. 
+ // In case of same keys, move the right iterator ahead. + mi.right.next() + if &mi.right == mi.small { + mi.swapSmall() + } + return + case cmp < 0: // Small is less than bigger(). + if mi.reverse { + mi.swapSmall() + } else { + // we don't need to do anything. Small already points to the smallest. + } + return + default: // bigger() is less than small. + if mi.reverse { + // Do nothing since we're iterating in reverse. Small currently points to + // the bigger key and that's okay in reverse iteration. + } else { + mi.swapSmall() + } + return + } +} + +func (mi *MergeIterator) bigger() *node { + if mi.small == &mi.left { + return &mi.right + } + return &mi.left +} + +func (mi *MergeIterator) swapSmall() { + if mi.small == &mi.left { + mi.small = &mi.right + return + } + if mi.small == &mi.right { + mi.small = &mi.left + return + } +} + +// Next returns the next element. If it is the same as the current key, ignore it. +func (mi *MergeIterator) Next() { + for mi.Valid() { + if !bytes.Equal(mi.small.entry.Key, mi.curKey) { + break + } + mi.small.next() + mi.fix() + } + mi.setCurrent() +} + +func (mi *MergeIterator) setCurrent() { + utils.CondPanic(mi.small.entry == nil && mi.small.valid == true, fmt.Errorf("mi.small.entry is nil")) + if mi.small.valid { + mi.curKey = append(mi.curKey[:0], mi.small.entry.Key...) + } +} + +// Rewind seeks to first element (or last element for reverse iterator). +func (mi *MergeIterator) Rewind() { + mi.left.rewind() + mi.right.rewind() + mi.fix() + mi.setCurrent() +} + +// Seek brings us to element with key >= given key. +func (mi *MergeIterator) Seek(key []byte) { + mi.left.seek(key) + mi.right.seek(key) + mi.fix() + mi.setCurrent() +} + +// Valid returns whether the MergeIterator is at a valid element. +func (mi *MergeIterator) Valid() bool { + return mi.small.valid +} + +// Key returns the key associated with the current iterator. 
+func (mi *MergeIterator) Item() utils.Item { + return mi.small.iter.Item() +} + +// Close implements Iterator. +func (mi *MergeIterator) Close() error { + err1 := mi.left.iter.Close() + err2 := mi.right.iter.Close() + if err1 != nil { + return utils.WarpErr("MergeIterator", err1) + } + return utils.WarpErr("MergeIterator", err2) +} + +// NewMergeIterator creates a merge iterator. +func NewMergeIterator(iters []utils.Iterator, reverse bool) utils.Iterator { + switch len(iters) { + case 0: + return &Iterator{} + case 1: + return iters[0] + case 2: + mi := &MergeIterator{ + reverse: reverse, + } + mi.left.setIterator(iters[0]) + mi.right.setIterator(iters[1]) + // Assign left iterator randomly. This will be fixed when user calls rewind/seek. + mi.small = &mi.left + return mi + } + mid := len(iters) / 2 + return NewMergeIterator( + []utils.Iterator{ + NewMergeIterator(iters[:mid], reverse), + NewMergeIterator(iters[mid:], reverse), + }, reverse) +} diff --git a/lsm/levels.go b/lsm/levels.go index 9125ee7..4f8bf41 100644 --- a/lsm/levels.go +++ b/lsm/levels.go @@ -1,41 +1,43 @@ package lsm import ( + "bytes" + "sort" + "sync" + "sync/atomic" + "github.com/hardcore-os/corekv/file" "github.com/hardcore-os/corekv/utils" - "github.com/hardcore-os/corekv/utils/codec" ) -type levelManager struct { - opt *Options - cache *cache - manifest *file.Manifest - levels []*levelHandler -} - -type levelHandler struct { - levelNum int - tables []*table +// initLevelManager 初始化函数 +func (lsm *LSM) initLevelManager(opt *Options) *levelManager { + lm := &levelManager{lsm: lsm} // 反引用 + lm.compactState = lsm.newCompactStatus() + lm.opt = opt + // 读取manifest文件构建管理器 + if err := lm.loadManifest(); err != nil { + panic(err) + } + lm.build() + return lm } -func (lh *levelHandler) close() error { - return nil +type levelManager struct { + maxFID uint64 // 已经分配出去的最大fid,只要创建了memtable 就算已分配 + opt *Options + cache *cache + manifestFile *file.ManifestFile + levels []*levelHandler + lsm *LSM + 
compactState *compactStatus } -func (lh *levelHandler) Get(key []byte) (*codec.Entry, error) { - // 如果是第0层文件则进行特殊处理 - if lh.levelNum == 0 { - // logic... - } else { - // logic... - } - return nil, nil -} func (lm *levelManager) close() error { if err := lm.cache.close(); err != nil { return err } - if err := lm.manifest.Close(); err != nil { + if err := lm.manifestFile.Close(); err != nil { return err } for i := range lm.levels { @@ -46,51 +48,18 @@ func (lm *levelManager) close() error { return nil } -func newLevelManager(opt *Options) *levelManager { - lm := &levelManager{} - lm.opt = opt - // 读取manifest文件构建管理器 - lm.loadManifest() - lm.build() - return lm -} -func (lm *levelManager) loadCache() { - lm.cache = newCache(lm.opt) - // 添加 idx cache +func (lm *levelManager) iterators() []utils.Iterator { + + itrs := make([]utils.Iterator, 0, len(lm.levels)) for _, level := range lm.levels { - for _, table := range level.tables { - lm.cache.addIndex(table.ss.FID(), table) - } - } -} -func (lm *levelManager) loadManifest() { - lm.manifest = file.OpenManifest(&file.Options{Name: "manifest", Dir: lm.opt.WorkDir}) -} -func (lm *levelManager) build() { - // 如果manifest文件是空的 则进行初始化 - lm.levels = make([]*levelHandler, utils.MaxLevelNum) - tables := lm.manifest.Tables() - for num := 0; num < utils.MaxLevelNum; num++ { - lm.levels[num] = &levelHandler{levelNum: num} - lm.levels[num].tables = make([]*table, len(tables[num])) - for i := range tables[num] { - lm.levels[num].tables[i] = openTable(lm.opt, tables[num][i]) - } + itrs = append(itrs, level.iterators()...) 
} - // 逐一加载sstable 的index block 构建cache - lm.loadCache() + return itrs } -// 向L0层flush一个sstable -func (lm *levelManager) flush(immutable *memTable) error { - // flush 跳表中的数据转化为sst文件 - // 删除wal文件并创建一个新的wal文件 - return nil -} - -func (lm *levelManager) Get(key []byte) (*codec.Entry, error) { +func (lm *levelManager) Get(key []byte) (*utils.Entry, error) { var ( - entry *codec.Entry + entry *utils.Entry err error ) // L0层查询 @@ -98,11 +67,290 @@ func (lm *levelManager) Get(key []byte) (*codec.Entry, error) { return entry, err } // L1-7层查询 - for level := 1; level < utils.MaxLevelNum; level++ { + for level := 1; level < lm.opt.MaxLevelNum; level++ { ld := lm.levels[level] if entry, err = ld.Get(key); entry != nil { return entry, err } } - return entry, nil + return entry, utils.ErrKeyNotFound +} + +func (lm *levelManager) loadCache() { + +} +func (lm *levelManager) loadManifest() (err error) { + lm.manifestFile, err = file.OpenManifestFile(&file.Options{Dir: lm.opt.WorkDir}) + return err +} +func (lm *levelManager) build() error { + lm.levels = make([]*levelHandler, 0, lm.opt.MaxLevelNum) + for i := 0; i < lm.opt.MaxLevelNum; i++ { + lm.levels = append(lm.levels, &levelHandler{ + levelNum: i, + tables: make([]*table, 0), + lm: lm, + }) + } + + manifest := lm.manifestFile.GetManifest() + // 对比manifest 文件的正确性 + if err := lm.manifestFile.RevertToManifest(utils.LoadIDMap(lm.opt.WorkDir)); err != nil { + return err + } + // 逐一加载sstable 的index block 构建cache + lm.cache = newCache(lm.opt) + // TODO 初始化的时候index 结构放在了table中,相当于全部加载到了内存,减少了一次读磁盘,但增加了内存消耗 + var maxFID uint64 + for fID, tableInfo := range manifest.Tables { + fileName := utils.FileNameSSTable(lm.opt.WorkDir, fID) + if fID > maxFID { + maxFID = fID + } + t := openTable(lm, fileName, nil) + lm.levels[tableInfo.Level].add(t) + lm.levels[tableInfo.Level].addSize(t) // 记录一个level的文件总大小 + } + // 对每一层进行排序 + for i := 0; i < lm.opt.MaxLevelNum; i++ { + lm.levels[i].Sort() + } + // 得到最大的fid值 + atomic.AddUint64(&lm.maxFID, maxFID) 
+ return nil +} + +// 向L0层flush一个sstable +func (lm *levelManager) flush(immutable *memTable) (err error) { + // 分配一个fid + fid := immutable.wal.Fid() + sstName := utils.FileNameSSTable(lm.opt.WorkDir, fid) + + // 构建一个 builder + builder := newTableBuiler(lm.opt) + iter := immutable.sl.NewSkipListIterator() + for iter.Rewind(); iter.Valid(); iter.Next() { + entry := iter.Item().Entry() + builder.add(entry, false) + } + // 创建一个 table 对象 + table := openTable(lm, sstName, builder) + err = lm.manifestFile.AddTableMeta(0, &file.TableMeta{ + ID: fid, + Checksum: []byte{'m', 'o', 'c', 'k'}, + }) + // manifest写入失败直接panic + utils.Panic(err) + // 更新manifest文件 + lm.levels[0].add(table) + return +} + +//--------- level处理器 ------- +type levelHandler struct { + sync.RWMutex + levelNum int + tables []*table + totalSize int64 + totalStaleSize int64 + lm *levelManager +} + +func (lh *levelHandler) close() error { + for i := range lh.tables { + if err := lh.tables[i].ss.Close(); err != nil { + return err + } + } + return nil +} +func (lh *levelHandler) add(t *table) { + lh.Lock() + defer lh.Unlock() + lh.tables = append(lh.tables, t) +} +func (lh *levelHandler) addBatch(ts []*table) { + lh.Lock() + defer lh.Unlock() + lh.tables = append(lh.tables, ts...) +} + +func (lh *levelHandler) getTotalSize() int64 { + lh.RLock() + defer lh.RUnlock() + return lh.totalSize +} + +func (lh *levelHandler) addSize(t *table) { + lh.totalSize += t.Size() + lh.totalStaleSize += int64(t.StaleDataSize()) +} + +func (lh *levelHandler) subtractSize(t *table) { + lh.totalSize -= t.Size() + lh.totalStaleSize -= int64(t.StaleDataSize()) +} + +func (lh *levelHandler) numTables() int { + lh.RLock() + defer lh.RUnlock() + return len(lh.tables) +} + +func (lh *levelHandler) Get(key []byte) (*utils.Entry, error) { + // 如果是第0层文件则进行特殊处理 + if lh.levelNum == 0 { + // TODO: logic... + // 获取可能存在key的sst + return lh.searchL0SST(key) + } else { + // TODO: logic... 
+		return lh.searchLNSST(key)
+	}
+}
+
+func (lh *levelHandler) Sort() {
+	lh.Lock()
+	defer lh.Unlock()
+	if lh.levelNum == 0 {
+		// Key range will overlap. Just sort by fileID in ascending order
+		// because newer tables are at the end of level 0.
+		sort.Slice(lh.tables, func(i, j int) bool {
+			return lh.tables[i].fid < lh.tables[j].fid
+		})
+	} else {
+		// Sort tables by keys.
+		sort.Slice(lh.tables, func(i, j int) bool {
+			return utils.CompareKeys(lh.tables[i].ss.MinKey(), lh.tables[j].ss.MinKey()) < 0
+		})
+	}
+}
+
+func (lh *levelHandler) searchL0SST(key []byte) (*utils.Entry, error) {
+	var version uint64
+	for i := len(lh.tables) - 1; i >= 0; i-- { // newest tables sit at the end of L0 (see Sort): search newest-first
+		if entry, err := lh.tables[i].Serach(key, &version); err == nil {
+			return entry, nil
+		}
+	}
+	return nil, utils.ErrKeyNotFound
+}
+func (lh *levelHandler) searchLNSST(key []byte) (*utils.Entry, error) {
+	table := lh.getTable(key)
+	var version uint64
+	if table == nil {
+		return nil, utils.ErrKeyNotFound
+	}
+	if entry, err := table.Serach(key, &version); err == nil {
+		return entry, nil
+	}
+	return nil, utils.ErrKeyNotFound
+}
+func (lh *levelHandler) getTable(key []byte) *table {
+	for i := len(lh.tables) - 1; i >= 0; i-- {
+		if bytes.Compare(key, lh.tables[i].ss.MinKey()) > -1 &&
+			bytes.Compare(key, lh.tables[i].ss.MaxKey()) < 1 {
+			return lh.tables[i]
+		}
+	}
+	return nil
+}
+func (lh *levelHandler) isLastLevel() bool {
+	return lh.levelNum == lh.lm.opt.MaxLevelNum-1
+}
+
+type levelHandlerRLocked struct{}
+
+// overlappingTables returns the tables that intersect with key range. Returns a half-interval.
+// This function should already have acquired a read lock, and this is so important the caller must
+// pass an empty parameter declaring such.
+func (lh *levelHandler) overlappingTables(_ levelHandlerRLocked, kr keyRange) (int, int) {
+	if len(kr.left) == 0 || len(kr.right) == 0 {
+		return 0, 0
+	}
+	left := sort.Search(len(lh.tables), func(i int) bool {
+		return utils.CompareKeys(kr.left, lh.tables[i].ss.MaxKey()) <= 0
+	})
+	right := sort.Search(len(lh.tables), func(i int) bool {
+		return utils.CompareKeys(kr.right, lh.tables[i].ss.MaxKey()) < 0
+	})
+	return left, right
+}
+
+// replaceTables will replace tables[left:right] with newTables. Note this EXCLUDES tables[right].
+// You must call decr() to delete the old tables _after_ writing the update to the manifest.
+func (lh *levelHandler) replaceTables(toDel, toAdd []*table) error {
+	// Need to re-search the range of tables in this level to be replaced as other goroutines might
+	// be changing it as well. (They can't touch our tables, but if they add/remove other tables,
+	// the indices get shifted around.)
+	lh.Lock() // We s.Unlock() below.
+
+	toDelMap := make(map[uint64]struct{})
+	for _, t := range toDel {
+		toDelMap[t.fid] = struct{}{}
+	}
+	var newTables []*table
+	for _, t := range lh.tables {
+		_, found := toDelMap[t.fid]
+		if !found {
+			newTables = append(newTables, t)
+			continue
+		}
+		lh.subtractSize(t)
+	}
+
+	// Increase totalSize first.
+	for _, t := range toAdd {
+		lh.addSize(t)
+		t.IncrRef()
+		newTables = append(newTables, t)
+	}
+
+	// Assign tables.
+	lh.tables = newTables
+	sort.Slice(lh.tables, func(i, j int) bool {
+		return utils.CompareKeys(lh.tables[i].ss.MinKey(), lh.tables[j].ss.MinKey()) < 0 // compare i vs j, not i vs i
+	})
+	lh.Unlock() // s.Unlock before we DecrRef tables -- that can be slow.
+	return decrRefs(toDel)
+}
+
+// deleteTables remove tables idx0, ..., idx1-1.
+func (lh *levelHandler) deleteTables(toDel []*table) error {
+	lh.Lock() // s.Unlock() below
+
+	toDelMap := make(map[uint64]struct{})
+	for _, t := range toDel {
+		toDelMap[t.fid] = struct{}{}
+	}
+
+	// Make a copy as iterators might be keeping a slice of tables.
+ var newTables []*table + for _, t := range lh.tables { + _, found := toDelMap[t.fid] + if !found { + newTables = append(newTables, t) + continue + } + lh.subtractSize(t) + } + lh.tables = newTables + + lh.Unlock() // Unlock s _before_ we DecrRef our tables, which can be slow. + + return decrRefs(toDel) +} + +func (lh *levelHandler) iterators() []utils.Iterator { + lh.RLock() + defer lh.RUnlock() + topt := &utils.Options{IsAsc: true} + if lh.levelNum == 0 { + return iteratorsReversed(lh.tables, topt) + } + + if len(lh.tables) == 0 { + return nil + } + return []utils.Iterator{NewConcatIterator(lh.tables, topt)} } diff --git a/lsm/lsm.go b/lsm/lsm.go index 7cec49b..ba13ecc 100644 --- a/lsm/lsm.go +++ b/lsm/lsm.go @@ -2,27 +2,50 @@ package lsm import ( "github.com/hardcore-os/corekv/utils" - "github.com/hardcore-os/corekv/utils/codec" ) +// LSM _ type LSM struct { memTable *memTable immutables []*memTable levels *levelManager option *Options closer *utils.Closer + maxMemFID uint32 } -//Options +//Options _ type Options struct { WorkDir string MemTableSize int64 + SSTableMaxSz int64 + // BlockSize is the size of each block inside SSTable in bytes. + BlockSize int + // BloomFalsePositive is the false positive probabiltiy of bloom filter. 
+ BloomFalsePositive float64 + + // compact + NumCompactors int + BaseLevelSize int64 + LevelSizeMultiplier int // 决定level之间期望的size比例 + TableSizeMultiplier int + BaseTableSize int64 + NumLevelZeroTables int + MaxLevelNum int + + DiscardStatsCh *chan map[uint32]int64 } -// 关闭lsm +// Close _ func (lsm *LSM) Close() error { - if err := lsm.memTable.close(); err != nil { - return err + // 等待全部合并过程的结束 + // 等待全部api调用过程结束 + lsm.closer.Close() + // TODO 需要加锁保证并发安全 + if lsm.memTable != nil { + if err := lsm.memTable.close(); err != nil { + return err + } } for i := range lsm.immutables { if err := lsm.immutables[i].close(); err != nil { @@ -32,70 +55,102 @@ func (lsm *LSM) Close() error { if err := lsm.levels.close(); err != nil { return err } - // 等待合并过程的结束 - lsm.closer.Close() return nil } -// NewLSM +// NewLSM _ func NewLSM(opt *Options) *LSM { lsm := &LSM{option: opt} - // 启动DB恢复过程加载wal,如果没有恢复内容则创建新的内存表 - lsm.memTable, lsm.immutables = recovery(opt) // 初始化levelManager - lsm.levels = newLevelManager(opt) + lsm.levels = lsm.initLevelManager(opt) + // 启动DB恢复过程加载wal,如果没有恢复内容则创建新的内存表 + lsm.memTable, lsm.immutables = lsm.recovery() // 初始化closer 用于资源回收的信号控制 - lsm.closer = utils.NewCloser(1) + lsm.closer = utils.NewCloser() return lsm } -// StartMerge -func (lsm *LSM) StartMerge() { - defer lsm.closer.Done() - for { - select { - case <-lsm.closer.Wait(): - } - // 处理并发的合并过程 +// StartCompacter _ +func (lsm *LSM) StartCompacter() { + n := lsm.option.NumCompactors + lsm.closer.Add(n) + for i := 0; i < n; i++ { + go lsm.levels.runCompacter(i) } } -func (lsm *LSM) Set(entry *codec.Entry) (err error) { +// Set _ +func (lsm *LSM) Set(entry *utils.Entry) (err error) { + if entry == nil || len(entry.Key) == 0 { + return utils.ErrEmptyKey + } + // 优雅关闭 + lsm.closer.Add(1) + defer lsm.closer.Done() // 检查当前memtable是否写满,是的话创建新的memtable,并将当前内存表写到immutables中 // 否则写入当前memtable中 - if lsm.memTable.Size() > lsm.option.MemTableSize { - lsm.immutables = append(lsm.immutables, lsm.memTable) - if 
lsm.memTable, err = NewMemtable(); err != nil { - return err - } + if int64(lsm.memTable.wal.Size())+ + int64(utils.EstimateWalCodecSize(entry)) > lsm.option.MemTableSize { + lsm.Rotate() } - if err := lsm.memTable.set(entry); err != nil { + if err = lsm.memTable.set(entry); err != nil { return err } // 检查是否存在immutable需要刷盘, for _, immutable := range lsm.immutables { - if err := lsm.levels.flush(immutable); err != nil { + if err = lsm.levels.flush(immutable); err != nil { return err } + // TODO 这里问题很大,应该是用引用计数的方式回收 + err = immutable.close() + utils.Panic(err) } - return nil + if len(lsm.immutables) != 0 { + // TODO 将lsm的immutables队列置空,这里可以优化一下节省内存空间,还可以限制一下immut table的大小为固定值 + lsm.immutables = make([]*memTable, 0) + } + return err } -func (lsm *LSM) Get(key []byte) (*codec.Entry, error) { +// Get _ +func (lsm *LSM) Get(key []byte) (*utils.Entry, error) { + if len(key) == 0 { + return nil, utils.ErrEmptyKey + } + lsm.closer.Add(1) + defer lsm.closer.Done() var ( - entry *codec.Entry + entry *utils.Entry err error ) // 从内存表中查询,先查活跃表,在查不变表 - if entry, err = lsm.memTable.Get(key); entry != nil { + if entry, err = lsm.memTable.Get(key); entry != nil && entry.Value != nil { return entry, err } - for _, imm := range lsm.immutables { - if entry, err = imm.Get(key); entry != nil { + + for i := len(lsm.immutables) - 1; i >= 0; i-- { + if entry, err = lsm.immutables[i].Get(key); entry != nil && entry.Value != nil { return entry, err } } // 从level manger查询 return lsm.levels.Get(key) } + +func (lsm *LSM) MemSize() int64 { + return lsm.memTable.Size() +} + +func (lsm *LSM) MemTableIsNil() bool { + return lsm.memTable == nil +} + +func (lsm *LSM) GetSkipListFromMemTable() *utils.Skiplist { + return lsm.memTable.sl +} + +func (lsm *LSM) Rotate() { + lsm.immutables = append(lsm.immutables, lsm.memTable) + lsm.memTable = lsm.NewMemtable() +} diff --git a/lsm/lsm_test.go b/lsm/lsm_test.go index ed5eb1b..4f4c5e0 100644 --- a/lsm/lsm_test.go +++ b/lsm/lsm_test.go @@ -1,56 +1,344 @@ +// 
Copyright 2021 hardcore-os Project Authors +// +// Licensed under the Apache License, Version 2.0 (the "License") +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + package lsm import ( + "bytes" + "fmt" + "os" "testing" + "time" - "github.com/hardcore-os/corekv/file" "github.com/hardcore-os/corekv/utils" - "github.com/hardcore-os/corekv/utils/codec" - "github.com/stretchr/testify/assert" ) -// 对level 管理器的功能测试 -func TestLevels(t *testing.T) { - entrys := []*codec.Entry{ - {Key: []byte("hello0"), Value: []byte("world0"), ExpiresAt: uint64(0)}, - {Key: []byte("hello1"), Value: []byte("world1"), ExpiresAt: uint64(0)}, - {Key: []byte("hello2"), Value: []byte("world2"), ExpiresAt: uint64(0)}, - {Key: []byte("hello3"), Value: []byte("world3"), ExpiresAt: uint64(0)}, - {Key: []byte("hello4"), Value: []byte("world4"), ExpiresAt: uint64(0)}, - {Key: []byte("hello5"), Value: []byte("world5"), ExpiresAt: uint64(0)}, - {Key: []byte("hello6"), Value: []byte("world6"), ExpiresAt: uint64(0)}, - {Key: []byte("hello7"), Value: []byte("world"), ExpiresAt: uint64(0)}, - } +var ( // 初始化opt - opt := &Options{ - "../work_test", - } - levelLive := func() { - // 初始化 - levels := newLevelManager(opt) - defer func() { _ = levels.close() }() - // 构建内存表 - imm := &memTable{ - wal: file.OpenWalFile(&file.Options{}), - sl: utils.NewSkipList(), + opt = &Options{ + WorkDir: "../work_test", + SSTableMaxSz: 1024, + MemTableSize: 1024, + BlockSize: 1024, + BloomFalsePositive: 0, + BaseLevelSize: 10 << 20, + LevelSizeMultiplier: 10, + BaseTableSize: 2 << 20, + 
TableSizeMultiplier: 2, + NumLevelZeroTables: 15, + MaxLevelNum: 7, + NumCompactors: 3, + } +) + +// TestBase 正确性测试 +func TestBase(t *testing.T) { + clearDir() + lsm := buildLSM() + test := func() { + // 基准测试 + baseTest(t, lsm, 128) + } + // 运行N次测试多个sst的影响 + runTest(1, test) +} + +// TestRecovery 数据库恢复测试 +func TestRecovery(t *testing.T) { + clearDir() + recovery := func() { + // 每次运行都是相当于意外重启 + lsm := buildLSM() + // 测试正确性 + baseTest(t, lsm, 128) + } + // 允许两次就能实现恢复 + runTest(5, recovery) +} + +// TestClose 测试优雅关闭 +func TestClose(t *testing.T) { + clearDir() + lsm := buildLSM() + lsm.StartCompacter() + test := func() { + baseTest(t, lsm, 128) + utils.Err(lsm.Close()) + // 重启后可正常工作才算成功 + lsm = buildLSM() + baseTest(t, lsm, 128) + } + // 运行N次测试多个sst的影响 + runTest(1, test) +} + +// 命中不同存储介质的逻辑分支测试 +func TestHitStorage(t *testing.T) { + clearDir() + lsm := buildLSM() + e := utils.BuildEntry() + lsm.Set(e) + // 命中内存表 + hitMemtable := func() { + v, err := lsm.memTable.Get(e.Key) + utils.Err(err) + utils.CondPanic(!bytes.Equal(v.Value, e.Value), fmt.Errorf("[hitMemtable] !equal(v.Value, e.Value)")) + } + // 命中L0层 + hitL0 := func() { + // baseTest的测试就包含 在命中L0的sst查询 + baseTest(t, lsm, 128) + } + // 命中非L0层 + hitNotL0 := func() { + // 通过压缩将compact生成非L0数据, 会命中l6层 + lsm.levels.runOnce(0) + baseTest(t, lsm, 128) + } + // 命中bf + hitBloom := func() { + ee := utils.BuildEntry() + // 查询不存在的key 如果命中则说明一定不存在 + v, err := lsm.levels.levels[0].tables[0].Serach(ee.Key, &ee.Version) + utils.CondPanic(v != nil, fmt.Errorf("[hitBloom] v != nil")) + utils.CondPanic(err != utils.ErrKeyNotFound, fmt.Errorf("[hitBloom] err != utils.ErrKeyNotFound")) + } + + runTest(1, hitMemtable, hitL0, hitNotL0, hitBloom) +} + +// Testparameter 测试异常参数 +func TestPsarameter(t *testing.T) { + clearDir() + lsm := buildLSM() + testNil := func() { + utils.CondPanic(lsm.Set(nil) != utils.ErrEmptyKey, fmt.Errorf("[testNil] lsm.Set(nil) != err")) + _, err := lsm.Get(nil) + utils.CondPanic(err != utils.ErrEmptyKey, 
fmt.Errorf("[testNil] lsm.Set(nil) != err")) + } + // TODO p2 优先级的case先忽略 + runTest(1, testNil) +} + +// TestCompact 测试L0到Lmax压缩 +func TestCompact(t *testing.T) { + clearDir() + lsm := buildLSM() + ok := false + l0TOLMax := func() { + // 正常触发即可 + baseTest(t, lsm, 128) + // 直接触发压缩执行 + fid := lsm.levels.maxFID + 1 + lsm.levels.runOnce(1) + for _, t := range lsm.levels.levels[6].tables { + if t.fid == fid { + ok = true + } } - for _, entry := range entrys { - imm.set(entry) + utils.CondPanic(!ok, fmt.Errorf("[l0TOLMax] fid not found")) + } + l0ToL0 := func() { + // 先写一些数据进来 + baseTest(t, lsm, 128) + fid := lsm.levels.maxFID + 1 + cd := buildCompactDef(lsm, 0, 0, 0) + // 非常tricky的处理方法,为了能通过检查 + tricky(cd.thisLevel.tables) + ok := lsm.levels.fillTablesL0ToL0(cd) + utils.CondPanic(!ok, fmt.Errorf("[l0ToL0] lsm.levels.fillTablesL0ToL0(cd) ret == false")) + err := lsm.levels.runCompactDef(0, 0, *cd) + // 删除全局状态,便于下游测试逻辑 + lsm.levels.compactState.delete(*cd) + utils.Err(err) + ok = false + for _, t := range lsm.levels.levels[0].tables { + if t.fid == fid { + ok = true + } } - // 测试 flush - assert.Nil(t, levels.flush(imm)) - // 从levels中进行GET - v, err := levels.Get([]byte("Hello")) - assert.Nil(t, err) - assert.Equal(t, codec.Entry{Value: []byte("Corekv")}.Value, v) - t.Logf("levels.Get key=%s, value=%s, expiresAt=%d", v.Key, v.Value, v.Value) - // 关闭levels - assert.Nil(t, levels.close()) + utils.CondPanic(!ok, fmt.Errorf("[l0ToL0] fid not found")) + } + nextCompact := func() { + baseTest(t, lsm, 128) + fid := lsm.levels.maxFID + 1 + cd := buildCompactDef(lsm, 0, 0, 1) + // 非常tricky的处理方法,为了能通过检查 + tricky(cd.thisLevel.tables) + ok := lsm.levels.fillTables(cd) + utils.CondPanic(!ok, fmt.Errorf("[nextCompact] lsm.levels.fillTables(cd) ret == false")) + err := lsm.levels.runCompactDef(0, 0, *cd) + // 删除全局状态,便于下游测试逻辑 + lsm.levels.compactState.delete(*cd) + utils.Err(err) + ok = false + for _, t := range lsm.levels.levels[1].tables { + if t.fid == fid { + ok = true + } + } + 
utils.CondPanic(!ok, fmt.Errorf("[nextCompact] fid not found")) + } + + maxToMax := func() { + baseTest(t, lsm, 128) + fid := lsm.levels.maxFID + 1 + cd := buildCompactDef(lsm, 6, 6, 6) + // 非常tricky的处理方法,为了能通过检查 + tricky(cd.thisLevel.tables) + ok := lsm.levels.fillTables(cd) + utils.CondPanic(!ok, fmt.Errorf("[maxToMax] lsm.levels.fillTables(cd) ret == false")) + err := lsm.levels.runCompactDef(0, 6, *cd) + // 删除全局状态,便于下游测试逻辑 + lsm.levels.compactState.delete(*cd) + utils.Err(err) + ok = false + for _, t := range lsm.levels.levels[6].tables { + if t.fid == fid { + ok = true + } + } + utils.CondPanic(!ok, fmt.Errorf("[maxToMax] fid not found")) + } + parallerCompact := func() { + baseTest(t, lsm, 128) + cd := buildCompactDef(lsm, 0, 0, 1) + // 非常tricky的处理方法,为了能通过检查 + tricky(cd.thisLevel.tables) + ok := lsm.levels.fillTables(cd) + utils.CondPanic(!ok, fmt.Errorf("[parallerCompact] lsm.levels.fillTables(cd) ret == false")) + // 构建完全相同两个压缩计划的执行,以便于百分比构建 压缩冲突 + go lsm.levels.runCompactDef(0, 0, *cd) + lsm.levels.runCompactDef(0, 0, *cd) + // 检查compact status状态查看是否在执行并行压缩 + isParaller := false + for _, state := range lsm.levels.compactState.levels { + if len(state.ranges) != 0 { + isParaller = true + } + } + utils.CondPanic(!isParaller, fmt.Errorf("[parallerCompact] not is paralle")) } // 运行N次测试多个sst的影响 - for i := 0; i < 10; i++ { - levelLive() + runTest(1, l0TOLMax, l0ToL0, nextCompact, maxToMax, parallerCompact) +} + +// 正确性测试 +func baseTest(t *testing.T, lsm *LSM, n int) { + // 用来跟踪调试的 + e := &utils.Entry{ + Key: []byte("CRTS😁硬核课堂MrGSBtL12345678"), + Value: []byte("我草了"), + ExpiresAt: 123, + } + //caseList := make([]*utils.Entry, 0) + //caseList = append(caseList, e) + + // 随机构建数据进行测试 + lsm.Set(e) + for i := 1; i < n; i++ { + ee := utils.BuildEntry() + lsm.Set(ee) + // caseList = append(caseList, ee) } + // 从levels中进行GET + v, err := lsm.Get(e.Key) + utils.Panic(err) + utils.CondPanic(!bytes.Equal(e.Value, v.Value), fmt.Errorf("lsm.Get(e.Key) value not equal !!!")) + 
// TODO range功能待完善 + //retList := make([]*utils.Entry, 0) + // testRange := func(isAsc bool) { + // // Range 确保写入进去的每个lsm都可以被读取到 + // iter := lsm.NewIterator(&utils.Options{IsAsc: true}) + // for iter.Rewind(); iter.Valid(); iter.Next() { + // e := iter.Item().Entry() + // retList = append(retList, e) + // } + // utils.CondPanic(len(retList) != len(caseList), fmt.Errorf("len(retList) != len(caseList)")) + // sort.Slice(retList, func(i, j int) bool { + // return utils.CompareKeys(retList[i].Key, retList[j].Key) > 1 + // }) + // for i := 0; i < len(caseList); i++ { + // a, b := caseList[i], retList[i] + // if !equal(a.Key, b.Key) || !equal(a.Value, b.Value) || a.ExpiresAt != b.ExpiresAt { + // utils.Panic(fmt.Errorf("lsm.Get(e.Key) kv disagreement !!!")) + // } + // } + // } + // // 测试升序 + // testRange(true) + // // 测试降序 + // testRange(false) } -// 对level管理器的性能测试 +// 驱动模块 +func buildLSM() *LSM { + // init DB Basic Test + c := make(chan map[uint32]int64, 16) + opt.DiscardStatsCh = &c + lsm := NewLSM(opt) + return lsm +} + +// 运行测试用例 +func runTest(n int, testFunList ...func()) { + for _, f := range testFunList { + for i := 0; i < n; i++ { + f() + } + } +} + +// 构建compactDef对象 +func buildCompactDef(lsm *LSM, id, thisLevel, nextLevel int) *compactDef { + t := targets{ + targetSz: []int64{0, 10485760, 10485760, 10485760, 10485760, 10485760, 10485760}, + fileSz: []int64{1024, 2097152, 2097152, 2097152, 2097152, 2097152, 2097152}, + baseLevel: nextLevel, + } + def := &compactDef{ + compactorId: id, + thisLevel: lsm.levels.levels[thisLevel], + nextLevel: lsm.levels.levels[nextLevel], + t: t, + p: buildCompactionPriority(lsm, thisLevel, t), + } + return def +} + +// 构建CompactionPriority对象 +func buildCompactionPriority(lsm *LSM, thisLevel int, t targets) compactionPriority { + return compactionPriority{ + level: thisLevel, + score: 8.6, + adjusted: 860, + t: t, + } +} + +func tricky(tables []*table) { + // 非常tricky的处理方法,为了能通过检查,检查所有逻辑分支 + for _, table := range tables { + 
table.ss.Indexs().StaleDataSize = 10 << 20 + t, _ := time.Parse("2006-01-02 15:04:05", "1995-08-10 00:00:00") + table.ss.SetCreatedAt(&t) + } +} +func clearDir() { + _, err := os.Stat(opt.WorkDir) + if err == nil { + os.RemoveAll(opt.WorkDir) + } + os.Mkdir(opt.WorkDir, os.ModePerm) +} diff --git a/lsm/memtable.go b/lsm/memtable.go index ee3fb51..3771caf 100644 --- a/lsm/memtable.go +++ b/lsm/memtable.go @@ -1,21 +1,57 @@ +// Copyright 2021 hardcore-os Project Authors +// +// Licensed under the Apache License, Version 2.0 (the "License") +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + package lsm import ( + "bytes" + "fmt" + "io/ioutil" + "os" + "path/filepath" + "sort" + "strconv" + "strings" + "sync/atomic" + "github.com/hardcore-os/corekv/file" "github.com/hardcore-os/corekv/utils" - "github.com/hardcore-os/corekv/utils/codec" + "github.com/pkg/errors" ) +const walFileExt string = ".wal" + // MemTable type memTable struct { - wal *file.WalFile - sl *utils.SkipList + lsm *LSM + wal *file.WalFile + sl *utils.Skiplist + buf *bytes.Buffer + maxVersion uint64 } -//todo: mock, need to add real logic -func NewMemtable() (*memTable, error) { - - return nil, nil +// NewMemtable _ +func (lsm *LSM) NewMemtable() *memTable { + newFid := atomic.AddUint64(&(lsm.levels.maxFID), 1) + fileOpt := &file.Options{ + Dir: lsm.option.WorkDir, + Flag: os.O_CREATE | os.O_RDWR, + MaxSz: int(lsm.option.MemTableSize), //TODO wal 要设置多大比较合理? 
姑且跟sst一样大 + FID: newFid, + FileName: mtFilePath(lsm.option.WorkDir, newFid), + } + return &memTable{wal: file.OpenWalFile(fileOpt), sl: utils.NewSkiplist(int64(1 << 20)), lsm: lsm} } // Close @@ -23,36 +59,132 @@ func (m *memTable) close() error { if err := m.wal.Close(); err != nil { return err } - if err := m.sl.Close(); err != nil { - return err - } + return nil } -func (m *memTable) set(entry *codec.Entry) error { +func (m *memTable) set(entry *utils.Entry) error { // 写到wal 日志中,防止崩溃 if err := m.wal.Write(entry); err != nil { return err } // 写到memtable中 - if err := m.sl.Add(entry); err != nil { - return err - } + m.sl.Add(entry) return nil } -func (m *memTable) Get(key []byte) (*codec.Entry, error) { +func (m *memTable) Get(key []byte) (*utils.Entry, error) { // 索引检查当前的key是否在表中 O(1) 的时间复杂度 // 从内存表中获取数据 - return m.sl.Search(key), nil + vs := m.sl.Search(key) + + e := &utils.Entry{ + Key: key, + Value: vs.Value, + ExpiresAt: vs.ExpiresAt, + Meta: vs.Meta, + Version: vs.Version, + } + + return e, nil + } func (m *memTable) Size() int64 { - return m.sl.Size() + return m.sl.MemSize() } //recovery -func recovery(opt *Options) (*memTable, []*memTable) { - fileOpt := &file.Options{} - return &memTable{wal: file.OpenWalFile(fileOpt), sl: utils.NewSkipList()}, []*memTable{} +func (lsm *LSM) recovery() (*memTable, []*memTable) { + // 从 工作目录中获取所有文件 + files, err := ioutil.ReadDir(lsm.option.WorkDir) + if err != nil { + utils.Panic(err) + return nil, nil + } + var fids []uint64 + maxFid := lsm.levels.maxFID + // 识别 后缀为.wal的文件 + for _, file := range files { + if !strings.HasSuffix(file.Name(), walFileExt) { + continue + } + fsz := len(file.Name()) + fid, err := strconv.ParseUint(file.Name()[:fsz-len(walFileExt)], 10, 64) + // 考虑 wal文件的存在 更新maxFid + if maxFid < fid { + maxFid = fid + } + if err != nil { + utils.Panic(err) + return nil, nil + } + fids = append(fids, fid) + } + // 排序一下子 + sort.Slice(fids, func(i, j int) bool { + return fids[i] < fids[j] + }) + imms := 
[]*memTable{} + // 遍历fid 做处理 + for _, fid := range fids { + mt, err := lsm.openMemTable(fid) + utils.CondPanic(err != nil, err) + if mt.sl.MemSize() == 0 { + // mt.DecrRef() + continue + } + // TODO 如果最后一个跳表没写满会怎么样?这不就浪费空间了吗 + imms = append(imms, mt) + } + // 更新最终的maxfid,初始化一定是串行执行的,因此不需要原子操作 + lsm.levels.maxFID = maxFid + return lsm.NewMemtable(), imms +} + +func (lsm *LSM) openMemTable(fid uint64) (*memTable, error) { + fileOpt := &file.Options{ + Dir: lsm.option.WorkDir, + Flag: os.O_CREATE | os.O_RDWR, + MaxSz: int(lsm.option.MemTableSize), + FID: fid, + FileName: mtFilePath(lsm.option.WorkDir, fid), + } + s := utils.NewSkiplist(int64(1 << 20)) + mt := &memTable{ + sl: s, + buf: &bytes.Buffer{}, + lsm: lsm, + } + mt.wal = file.OpenWalFile(fileOpt) + err := mt.UpdateSkipList() + utils.CondPanic(err != nil, errors.WithMessage(err, "while updating skiplist")) + return mt, nil +} +func mtFilePath(dir string, fid uint64) string { + return filepath.Join(dir, fmt.Sprintf("%05d%s", fid, walFileExt)) +} + +func (m *memTable) UpdateSkipList() error { + if m.wal == nil || m.sl == nil { + return nil + } + endOff, err := m.wal.Iterate(true, 0, m.replayFunction(m.lsm.option)) + if err != nil { + return errors.WithMessage(err, fmt.Sprintf("while iterating wal: %s", m.wal.Name())) + } + // if endOff < m.wal.Size() { + // return errors.WithMessage(utils.ErrTruncate, fmt.Sprintf("end offset: %d < size: %d", endOff, m.wal.Size())) + // } + return m.wal.Truncate(int64(endOff)) +} + +func (m *memTable) replayFunction(opt *Options) func(*utils.Entry, *utils.ValuePtr) error { + return func(e *utils.Entry, _ *utils.ValuePtr) error { // Function for replaying. 
+ if ts := utils.ParseTs(e.Key); ts > m.maxVersion { + m.maxVersion = ts + } + m.sl.Add(e) + return nil + } } diff --git a/lsm/merge.go b/lsm/merge.go deleted file mode 100644 index 4b01a4e..0000000 --- a/lsm/merge.go +++ /dev/null @@ -1 +0,0 @@ -package lsm diff --git a/lsm/table.go b/lsm/table.go index c5b1d02..b2f26f3 100644 --- a/lsm/table.go +++ b/lsm/table.go @@ -1,15 +1,399 @@ +// Copyright 2021 hardcore-os Project Authors +// +// Licensed under the Apache License, Version 2.0 (the "License") +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + package lsm -import "github.com/hardcore-os/corekv/file" +import ( + "encoding/binary" + "fmt" + "io" + "math" + "os" + "sort" + "strings" + "sync/atomic" + "time" + + "github.com/hardcore-os/corekv/file" + "github.com/hardcore-os/corekv/pb" + "github.com/hardcore-os/corekv/utils" + "github.com/pkg/errors" +) type table struct { - ss *file.SSTable - idxs []byte + ss *file.SSTable + lm *levelManager + fid uint64 + ref int32 // For file garbage collection. Atomic. 
} -func openTable(opt *Options, tableName string) *table { - t := &table{ss: file.OpenSStable(&file.Options{Name: tableName, Dir: opt.WorkDir})} - // 加载ss文件 索引 - t.idxs = t.ss.Indexs() +func openTable(lm *levelManager, tableName string, builder *tableBuilder) *table { + sstSize := int(lm.opt.SSTableMaxSz) + if builder != nil { + sstSize = int(builder.done().size) + } + var ( + t *table + err error + ) + fid := utils.FID(tableName) + // 对builder存在的情况 把buf flush到磁盘 + if builder != nil { + if t, err = builder.flush(lm, tableName); err != nil { + utils.Err(err) + return nil + } + } else { + t = &table{lm: lm, fid: fid} + // 如果没有builder 则创打开一个已经存在的sst文件 + t.ss = file.OpenSStable(&file.Options{ + FileName: tableName, + Dir: lm.opt.WorkDir, + Flag: os.O_CREATE | os.O_RDWR, + MaxSz: int(sstSize)}) + } + // 先要引用一下,否则后面使用迭代器会导致引用状态错误 + t.IncrRef() + // 初始化sst文件,把index加载进来 + if err := t.ss.Init(); err != nil { + utils.Err(err) + return nil + } + + // 获取sst的最大key 需要使用迭代器 + itr := t.NewIterator(&utils.Options{}) // 默认是降序 + defer itr.Close() + // 定位到初始位置就是最大的key + itr.Rewind() + utils.CondPanic(!itr.Valid(), errors.Errorf("failed to read index, form maxKey")) + maxKey := itr.Item().Entry().Key + t.ss.SetMaxKey(maxKey) + return t } + +// Serach 从table中查找key +func (t *table) Serach(key []byte, maxVs *uint64) (entry *utils.Entry, err error) { + t.IncrRef() + defer t.DecrRef() + // 获取索引 + idx := t.ss.Indexs() + // 检查key是否存在 + bloomFilter := utils.Filter(idx.BloomFilter) + if t.ss.HasBloomFilter() && !bloomFilter.MayContainKey(key) { + return nil, utils.ErrKeyNotFound + } + iter := t.NewIterator(&utils.Options{}) + defer iter.Close() + + iter.Seek(key) + if !iter.Valid() { + return nil, utils.ErrKeyNotFound + } + + if utils.SameKey(key, iter.Item().Entry().Key) { + if version := utils.ParseTs(iter.Item().Entry().Key); *maxVs < version { + *maxVs = version + return iter.Item().Entry(), nil + } + } + return nil, utils.ErrKeyNotFound +} + +func (t *table) indexKey() uint64 { + return 
t.fid +} +func (t *table) getEntry(key, block []byte, idx int) (entry *utils.Entry, err error) { + if len(block) == 0 { + return nil, utils.ErrKeyNotFound + } + dataStr := string(block) + blocks := strings.Split(dataStr, ",") + if idx >= 0 && idx < len(blocks) { + return &utils.Entry{ + Key: key, + Value: []byte(blocks[idx]), + }, nil + } + return nil, utils.ErrKeyNotFound +} + +// 去加载sst对应的block +func (t *table) block(idx int) (*block, error) { + utils.CondPanic(idx < 0, fmt.Errorf("idx=%d", idx)) + if idx >= len(t.ss.Indexs().Offsets) { + return nil, errors.New("block out of index") + } + var b *block + key := t.blockCacheKey(idx) + blk, ok := t.lm.cache.blocks.Get(key) + if ok && blk != nil { + b, _ = blk.(*block) + return b, nil + } + + var ko pb.BlockOffset + utils.CondPanic(!t.offsets(&ko, idx), fmt.Errorf("block t.offset id=%d", idx)) + b = &block{ + offset: int(ko.GetOffset()), + } + + var err error + if b.data, err = t.read(b.offset, int(ko.GetLen())); err != nil { + return nil, errors.Wrapf(err, + "failed to read from sstable: %d at offset: %d, len: %d", + t.ss.FID(), b.offset, ko.GetLen()) + } + + readPos := len(b.data) - 4 // First read checksum length. + b.chkLen = int(utils.BytesToU32(b.data[readPos : readPos+4])) + + if b.chkLen > len(b.data) { + return nil, errors.New("invalid checksum length. 
Either the data is " + + "corrupted or the table options are incorrectly set") + } + + readPos -= b.chkLen + b.checksum = b.data[readPos : readPos+b.chkLen] + + readPos -= 4 + numEntries := int(utils.BytesToU32(b.data[readPos : readPos+4])) + entriesIndexStart := readPos - (numEntries * 4) + entriesIndexEnd := entriesIndexStart + numEntries*4 + + b.entryOffsets = utils.BytesToU32Slice(b.data[entriesIndexStart:entriesIndexEnd]) + + b.entriesIndexStart = entriesIndexStart + + b.data = b.data[:readPos+4] + + if err = b.verifyCheckSum(); err != nil { + return nil, err + } + + t.lm.cache.blocks.Set(key, b) + + return b, nil +} + +func (t *table) read(off, sz int) ([]byte, error) { + return t.ss.Bytes(off, sz) +} + +// blockCacheKey is used to store blocks in the block cache. +func (t *table) blockCacheKey(idx int) []byte { + utils.CondPanic(t.fid >= math.MaxUint32, fmt.Errorf("t.fid >= math.MaxUint32")) + utils.CondPanic(uint32(idx) >= math.MaxUint32, fmt.Errorf("uint32(idx) >= math.MaxUint32")) + + buf := make([]byte, 8) + // Assume t.ID does not overflow uint32. 
+ binary.BigEndian.PutUint32(buf[:4], uint32(t.fid)) + binary.BigEndian.PutUint32(buf[4:], uint32(idx)) + return buf +} + +type tableIterator struct { + it utils.Item + opt *utils.Options + t *table + blockPos int + bi *blockIterator + err error +} + +func (t *table) NewIterator(options *utils.Options) utils.Iterator { + t.IncrRef() + return &tableIterator{ + opt: options, + t: t, + bi: &blockIterator{}, + } +} +func (it *tableIterator) Next() { + it.err = nil + + if it.blockPos >= len(it.t.ss.Indexs().GetOffsets()) { + it.err = io.EOF + return + } + + if len(it.bi.data) == 0 { + block, err := it.t.block(it.blockPos) + if err != nil { + it.err = err + return + } + it.bi.tableID = it.t.fid + it.bi.blockID = it.blockPos + it.bi.setBlock(block) + it.bi.seekToFirst() + it.err = it.bi.Error() + return + } + + it.bi.Next() + if !it.bi.Valid() { + it.blockPos++ + it.bi.data = nil + it.Next() + return + } + it.it = it.bi.it +} +func (it *tableIterator) Valid() bool { + return it.err != io.EOF // 如果没有的时候 则是EOF +} +func (it *tableIterator) Rewind() { + if it.opt.IsAsc { + it.seekToFirst() + } else { + it.seekToLast() + } +} +func (it *tableIterator) Item() utils.Item { + return it.it +} +func (it *tableIterator) Close() error { + it.bi.Close() + return it.t.DecrRef() +} +func (it *tableIterator) seekToFirst() { + numBlocks := len(it.t.ss.Indexs().Offsets) + if numBlocks == 0 { + it.err = io.EOF + return + } + it.blockPos = 0 + block, err := it.t.block(it.blockPos) + if err != nil { + it.err = err + return + } + it.bi.tableID = it.t.fid + it.bi.blockID = it.blockPos + it.bi.setBlock(block) + it.bi.seekToFirst() + it.it = it.bi.Item() + it.err = it.bi.Error() +} + +func (it *tableIterator) seekToLast() { + numBlocks := len(it.t.ss.Indexs().Offsets) + if numBlocks == 0 { + it.err = io.EOF + return + } + it.blockPos = numBlocks - 1 + block, err := it.t.block(it.blockPos) + if err != nil { + it.err = err + return + } + it.bi.tableID = it.t.fid + it.bi.blockID = it.blockPos + 
it.bi.setBlock(block) + it.bi.seekToLast() + it.it = it.bi.Item() + it.err = it.bi.Error() +} + +// Seek +// 二分法搜索 offsets +// 如果idx == 0 说明key只能在第一个block中 block[0].MinKey <= key +// 否则 block[0].MinKey > key +// 如果在 idx-1 的block中未找到key 那才可能在 idx 中 +// 如果都没有,则当前key不再此table +func (it *tableIterator) Seek(key []byte) { + var ko pb.BlockOffset + idx := sort.Search(len(it.t.ss.Indexs().GetOffsets()), func(idx int) bool { + utils.CondPanic(!it.t.offsets(&ko, idx), fmt.Errorf("tableutils.Seek idx < 0 || idx > len(index.GetOffsets()")) + return utils.CompareKeys(ko.GetKey(), key) > 0 + }) + if idx == 0 { + it.seekHelper(0, key) + return + } + it.seekHelper(idx-1, key) + if it.err == io.EOF { + if idx == len(it.t.ss.Indexs().Offsets) { + return + } + it.seekHelper(idx, key) + } +} + +func (it *tableIterator) seekHelper(blockIdx int, key []byte) { + it.blockPos = blockIdx + block, err := it.t.block(blockIdx) + if err != nil { + it.err = err + return + } + it.bi.tableID = it.t.fid + it.bi.blockID = it.blockPos + it.bi.setBlock(block) + it.bi.seek(key) + it.err = it.bi.Error() + it.it = it.bi.Item() +} + +func (t *table) offsets(ko *pb.BlockOffset, i int) bool { + index := t.ss.Indexs() + if i < 0 || i > len(index.GetOffsets()) { + return false + } + *ko = *index.GetOffsets()[i] + return true +} + +// Size is its file size in bytes +func (t *table) Size() int64 { return int64(t.ss.Size()) } + +// GetCreatedAt +func (t *table) GetCreatedAt() *time.Time { + return t.ss.GetCreatedAt() +} +func (t *table) Delete() error { + return t.ss.Detele() +} + +// StaleDataSize is the amount of stale data (that can be dropped by a compaction )in this SST. 
+func (t *table) StaleDataSize() uint32 { return t.ss.Indexs().StaleDataSize }
+
+// DecrRef decrements the refcount and possibly deletes the table
+func (t *table) DecrRef() error {
+	newRef := atomic.AddInt32(&t.ref, -1)
+	if newRef == 0 {
+		// TODO remove from the cache (the loop below already evicts this
+		// table's blocks — this TODO looks stale; confirm and drop it)
+		for i := 0; i < len(t.ss.Indexs().GetOffsets()); i++ {
+			t.lm.cache.blocks.Del(t.blockCacheKey(i))
+		}
+		if err := t.Delete(); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+// IncrRef atomically takes one reference on the table.
+func (t *table) IncrRef() {
+	atomic.AddInt32(&t.ref, 1)
+}
+
+// decrRefs drops one reference on each table, stopping at the first error.
+func decrRefs(tables []*table) error {
+	for _, table := range tables {
+		if err := table.DecrRef(); err != nil {
+			return err
+		}
+	}
+	return nil
+}
diff --git a/options.go b/options.go
index 279595d..aed8994 100644
--- a/options.go
+++ b/options.go
@@ -1,17 +1,43 @@
+// Copyright 2021 hardcore-o Project Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License")
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License. 
+ package corekv import "github.com/hardcore-os/corekv/utils" // Options corekv 总的配置文件 type Options struct { - ValueThreshold int64 - WorkDir string - MemTableSize int64 + ValueThreshold int64 + WorkDir string + MemTableSize int64 + SSTableMaxSz int64 + MaxBatchCount int64 + MaxBatchSize int64 // max batch size in bytes + ValueLogFileSize int + VerifyValueChecksum bool + ValueLogMaxEntries uint32 + LogRotatesToFlush int32 + MaxTableSize int64 } // NewDefaultOptions 返回默认的options func NewDefaultOptions() *Options { - opt := &Options{} + opt := &Options{ + WorkDir: "./work_test", + MemTableSize: 1024, + SSTableMaxSz: 1 << 30, + } opt.ValueThreshold = utils.DefaultValueThreshold return opt } diff --git a/pb/pb.pb.go b/pb/pb.pb.go new file mode 100644 index 0000000..1e6e150 --- /dev/null +++ b/pb/pb.pb.go @@ -0,0 +1,1915 @@ +// Code generated by protoc-gen-gogo. DO NOT EDIT. +// source: pb.proto + +package pb + +import ( + fmt "fmt" + proto "github.com/golang/protobuf/proto" + io "io" + math "math" + math_bits "math/bits" +) + +// Reference imports to suppress errors if they are not otherwise used. +var _ = proto.Marshal +var _ = fmt.Errorf +var _ = math.Inf + +// This is a compile-time assertion to ensure that this generated file +// is compatible with the proto package it is being compiled against. +// A compilation error at this line likely means your copy of the +// proto package needs to be updated. 
+const _ = proto.ProtoPackageIsVersion3 // please upgrade the proto package + +type ManifestChange_Operation int32 + +const ( + ManifestChange_CREATE ManifestChange_Operation = 0 + ManifestChange_DELETE ManifestChange_Operation = 1 +) + +var ManifestChange_Operation_name = map[int32]string{ + 0: "CREATE", + 1: "DELETE", +} + +var ManifestChange_Operation_value = map[string]int32{ + "CREATE": 0, + "DELETE": 1, +} + +func (x ManifestChange_Operation) String() string { + return proto.EnumName(ManifestChange_Operation_name, int32(x)) +} + +func (ManifestChange_Operation) EnumDescriptor() ([]byte, []int) { + return fileDescriptor_f80abaa17e25ccc8, []int{3, 0} +} + +type KV struct { + Key []byte `protobuf:"bytes,1,opt,name=key,proto3" json:"key,omitempty"` + Value []byte `protobuf:"bytes,2,opt,name=value,proto3" json:"value,omitempty"` + UserMeta []byte `protobuf:"bytes,3,opt,name=user_meta,json=userMeta,proto3" json:"user_meta,omitempty"` + Version uint64 `protobuf:"varint,4,opt,name=version,proto3" json:"version,omitempty"` + ExpiresAt uint64 `protobuf:"varint,5,opt,name=expires_at,json=expiresAt,proto3" json:"expires_at,omitempty"` + Meta []byte `protobuf:"bytes,6,opt,name=meta,proto3" json:"meta,omitempty"` + // Stream id is used to identify which stream the KV came from. 
+ StreamId uint32 `protobuf:"varint,10,opt,name=stream_id,json=streamId,proto3" json:"stream_id,omitempty"` + XXX_NoUnkeyedLiteral struct{} `json:"-"` + XXX_unrecognized []byte `json:"-"` + XXX_sizecache int32 `json:"-"` +} + +func (m *KV) Reset() { *m = KV{} } +func (m *KV) String() string { return proto.CompactTextString(m) } +func (*KV) ProtoMessage() {} +func (*KV) Descriptor() ([]byte, []int) { + return fileDescriptor_f80abaa17e25ccc8, []int{0} +} +func (m *KV) XXX_Unmarshal(b []byte) error { + return m.Unmarshal(b) +} +func (m *KV) XXX_Marshal(b []byte, deterministic bool) ([]byte, error) { + if deterministic { + return xxx_messageInfo_KV.Marshal(b, m, deterministic) + } else { + b = b[:cap(b)] + n, err := m.MarshalToSizedBuffer(b) + if err != nil { + return nil, err + } + return b[:n], nil + } +} +func (m *KV) XXX_Merge(src proto.Message) { + xxx_messageInfo_KV.Merge(m, src) +} +func (m *KV) XXX_Size() int { + return m.Size() +} +func (m *KV) XXX_DiscardUnknown() { + xxx_messageInfo_KV.DiscardUnknown(m) +} + +var xxx_messageInfo_KV proto.InternalMessageInfo + +func (m *KV) GetKey() []byte { + if m != nil { + return m.Key + } + return nil +} + +func (m *KV) GetValue() []byte { + if m != nil { + return m.Value + } + return nil +} + +func (m *KV) GetUserMeta() []byte { + if m != nil { + return m.UserMeta + } + return nil +} + +func (m *KV) GetVersion() uint64 { + if m != nil { + return m.Version + } + return 0 +} + +func (m *KV) GetExpiresAt() uint64 { + if m != nil { + return m.ExpiresAt + } + return 0 +} + +func (m *KV) GetMeta() []byte { + if m != nil { + return m.Meta + } + return nil +} + +func (m *KV) GetStreamId() uint32 { + if m != nil { + return m.StreamId + } + return 0 +} + +type KVList struct { + Kv []*KV `protobuf:"bytes,1,rep,name=kv,proto3" json:"kv,omitempty"` + XXX_NoUnkeyedLiteral struct{} `json:"-"` + XXX_unrecognized []byte `json:"-"` + XXX_sizecache int32 `json:"-"` +} + +func (m *KVList) Reset() { *m = KVList{} } +func (m *KVList) String() 
string { return proto.CompactTextString(m) } +func (*KVList) ProtoMessage() {} +func (*KVList) Descriptor() ([]byte, []int) { + return fileDescriptor_f80abaa17e25ccc8, []int{1} +} +func (m *KVList) XXX_Unmarshal(b []byte) error { + return m.Unmarshal(b) +} +func (m *KVList) XXX_Marshal(b []byte, deterministic bool) ([]byte, error) { + if deterministic { + return xxx_messageInfo_KVList.Marshal(b, m, deterministic) + } else { + b = b[:cap(b)] + n, err := m.MarshalToSizedBuffer(b) + if err != nil { + return nil, err + } + return b[:n], nil + } +} +func (m *KVList) XXX_Merge(src proto.Message) { + xxx_messageInfo_KVList.Merge(m, src) +} +func (m *KVList) XXX_Size() int { + return m.Size() +} +func (m *KVList) XXX_DiscardUnknown() { + xxx_messageInfo_KVList.DiscardUnknown(m) +} + +var xxx_messageInfo_KVList proto.InternalMessageInfo + +func (m *KVList) GetKv() []*KV { + if m != nil { + return m.Kv + } + return nil +} + +type ManifestChangeSet struct { + // A set of changes that are applied atomically. 
+ Changes []*ManifestChange `protobuf:"bytes,1,rep,name=changes,proto3" json:"changes,omitempty"` + XXX_NoUnkeyedLiteral struct{} `json:"-"` + XXX_unrecognized []byte `json:"-"` + XXX_sizecache int32 `json:"-"` +} + +func (m *ManifestChangeSet) Reset() { *m = ManifestChangeSet{} } +func (m *ManifestChangeSet) String() string { return proto.CompactTextString(m) } +func (*ManifestChangeSet) ProtoMessage() {} +func (*ManifestChangeSet) Descriptor() ([]byte, []int) { + return fileDescriptor_f80abaa17e25ccc8, []int{2} +} +func (m *ManifestChangeSet) XXX_Unmarshal(b []byte) error { + return m.Unmarshal(b) +} +func (m *ManifestChangeSet) XXX_Marshal(b []byte, deterministic bool) ([]byte, error) { + if deterministic { + return xxx_messageInfo_ManifestChangeSet.Marshal(b, m, deterministic) + } else { + b = b[:cap(b)] + n, err := m.MarshalToSizedBuffer(b) + if err != nil { + return nil, err + } + return b[:n], nil + } +} +func (m *ManifestChangeSet) XXX_Merge(src proto.Message) { + xxx_messageInfo_ManifestChangeSet.Merge(m, src) +} +func (m *ManifestChangeSet) XXX_Size() int { + return m.Size() +} +func (m *ManifestChangeSet) XXX_DiscardUnknown() { + xxx_messageInfo_ManifestChangeSet.DiscardUnknown(m) +} + +var xxx_messageInfo_ManifestChangeSet proto.InternalMessageInfo + +func (m *ManifestChangeSet) GetChanges() []*ManifestChange { + if m != nil { + return m.Changes + } + return nil +} + +type ManifestChange struct { + Id uint64 `protobuf:"varint,1,opt,name=Id,proto3" json:"Id,omitempty"` + Op ManifestChange_Operation `protobuf:"varint,2,opt,name=Op,proto3,enum=pb.ManifestChange_Operation" json:"Op,omitempty"` + Level uint32 `protobuf:"varint,3,opt,name=Level,proto3" json:"Level,omitempty"` + Checksum []byte `protobuf:"bytes,4,opt,name=Checksum,proto3" json:"Checksum,omitempty"` + XXX_NoUnkeyedLiteral struct{} `json:"-"` + XXX_unrecognized []byte `json:"-"` + XXX_sizecache int32 `json:"-"` +} + +func (m *ManifestChange) Reset() { *m = ManifestChange{} } +func (m 
*ManifestChange) String() string { return proto.CompactTextString(m) } +func (*ManifestChange) ProtoMessage() {} +func (*ManifestChange) Descriptor() ([]byte, []int) { + return fileDescriptor_f80abaa17e25ccc8, []int{3} +} +func (m *ManifestChange) XXX_Unmarshal(b []byte) error { + return m.Unmarshal(b) +} +func (m *ManifestChange) XXX_Marshal(b []byte, deterministic bool) ([]byte, error) { + if deterministic { + return xxx_messageInfo_ManifestChange.Marshal(b, m, deterministic) + } else { + b = b[:cap(b)] + n, err := m.MarshalToSizedBuffer(b) + if err != nil { + return nil, err + } + return b[:n], nil + } +} +func (m *ManifestChange) XXX_Merge(src proto.Message) { + xxx_messageInfo_ManifestChange.Merge(m, src) +} +func (m *ManifestChange) XXX_Size() int { + return m.Size() +} +func (m *ManifestChange) XXX_DiscardUnknown() { + xxx_messageInfo_ManifestChange.DiscardUnknown(m) +} + +var xxx_messageInfo_ManifestChange proto.InternalMessageInfo + +func (m *ManifestChange) GetId() uint64 { + if m != nil { + return m.Id + } + return 0 +} + +func (m *ManifestChange) GetOp() ManifestChange_Operation { + if m != nil { + return m.Op + } + return ManifestChange_CREATE +} + +func (m *ManifestChange) GetLevel() uint32 { + if m != nil { + return m.Level + } + return 0 +} + +func (m *ManifestChange) GetChecksum() []byte { + if m != nil { + return m.Checksum + } + return nil +} + +type TableIndex struct { + Offsets []*BlockOffset `protobuf:"bytes,1,rep,name=offsets,proto3" json:"offsets,omitempty"` + BloomFilter []byte `protobuf:"bytes,2,opt,name=bloomFilter,proto3" json:"bloomFilter,omitempty"` + MaxVersion uint64 `protobuf:"varint,3,opt,name=maxVersion,proto3" json:"maxVersion,omitempty"` + KeyCount uint32 `protobuf:"varint,4,opt,name=keyCount,proto3" json:"keyCount,omitempty"` + StaleDataSize uint32 `protobuf:"varint,5,opt,name=staleDataSize,proto3" json:"staleDataSize,omitempty"` + XXX_NoUnkeyedLiteral struct{} `json:"-"` + XXX_unrecognized []byte `json:"-"` + XXX_sizecache 
int32 `json:"-"` +} + +func (m *TableIndex) Reset() { *m = TableIndex{} } +func (m *TableIndex) String() string { return proto.CompactTextString(m) } +func (*TableIndex) ProtoMessage() {} +func (*TableIndex) Descriptor() ([]byte, []int) { + return fileDescriptor_f80abaa17e25ccc8, []int{4} +} +func (m *TableIndex) XXX_Unmarshal(b []byte) error { + return m.Unmarshal(b) +} +func (m *TableIndex) XXX_Marshal(b []byte, deterministic bool) ([]byte, error) { + if deterministic { + return xxx_messageInfo_TableIndex.Marshal(b, m, deterministic) + } else { + b = b[:cap(b)] + n, err := m.MarshalToSizedBuffer(b) + if err != nil { + return nil, err + } + return b[:n], nil + } +} +func (m *TableIndex) XXX_Merge(src proto.Message) { + xxx_messageInfo_TableIndex.Merge(m, src) +} +func (m *TableIndex) XXX_Size() int { + return m.Size() +} +func (m *TableIndex) XXX_DiscardUnknown() { + xxx_messageInfo_TableIndex.DiscardUnknown(m) +} + +var xxx_messageInfo_TableIndex proto.InternalMessageInfo + +func (m *TableIndex) GetOffsets() []*BlockOffset { + if m != nil { + return m.Offsets + } + return nil +} + +func (m *TableIndex) GetBloomFilter() []byte { + if m != nil { + return m.BloomFilter + } + return nil +} + +func (m *TableIndex) GetMaxVersion() uint64 { + if m != nil { + return m.MaxVersion + } + return 0 +} + +func (m *TableIndex) GetKeyCount() uint32 { + if m != nil { + return m.KeyCount + } + return 0 +} + +func (m *TableIndex) GetStaleDataSize() uint32 { + if m != nil { + return m.StaleDataSize + } + return 0 +} + +type BlockOffset struct { + Key []byte `protobuf:"bytes,1,opt,name=key,proto3" json:"key,omitempty"` + Offset uint32 `protobuf:"varint,2,opt,name=offset,proto3" json:"offset,omitempty"` + Len uint32 `protobuf:"varint,3,opt,name=len,proto3" json:"len,omitempty"` + XXX_NoUnkeyedLiteral struct{} `json:"-"` + XXX_unrecognized []byte `json:"-"` + XXX_sizecache int32 `json:"-"` +} + +func (m *BlockOffset) Reset() { *m = BlockOffset{} } +func (m *BlockOffset) String() string 
{ return proto.CompactTextString(m) } +func (*BlockOffset) ProtoMessage() {} +func (*BlockOffset) Descriptor() ([]byte, []int) { + return fileDescriptor_f80abaa17e25ccc8, []int{5} +} +func (m *BlockOffset) XXX_Unmarshal(b []byte) error { + return m.Unmarshal(b) +} +func (m *BlockOffset) XXX_Marshal(b []byte, deterministic bool) ([]byte, error) { + if deterministic { + return xxx_messageInfo_BlockOffset.Marshal(b, m, deterministic) + } else { + b = b[:cap(b)] + n, err := m.MarshalToSizedBuffer(b) + if err != nil { + return nil, err + } + return b[:n], nil + } +} +func (m *BlockOffset) XXX_Merge(src proto.Message) { + xxx_messageInfo_BlockOffset.Merge(m, src) +} +func (m *BlockOffset) XXX_Size() int { + return m.Size() +} +func (m *BlockOffset) XXX_DiscardUnknown() { + xxx_messageInfo_BlockOffset.DiscardUnknown(m) +} + +var xxx_messageInfo_BlockOffset proto.InternalMessageInfo + +func (m *BlockOffset) GetKey() []byte { + if m != nil { + return m.Key + } + return nil +} + +func (m *BlockOffset) GetOffset() uint32 { + if m != nil { + return m.Offset + } + return 0 +} + +func (m *BlockOffset) GetLen() uint32 { + if m != nil { + return m.Len + } + return 0 +} + +func init() { + proto.RegisterEnum("pb.ManifestChange_Operation", ManifestChange_Operation_name, ManifestChange_Operation_value) + proto.RegisterType((*KV)(nil), "pb.KV") + proto.RegisterType((*KVList)(nil), "pb.KVList") + proto.RegisterType((*ManifestChangeSet)(nil), "pb.ManifestChangeSet") + proto.RegisterType((*ManifestChange)(nil), "pb.ManifestChange") + proto.RegisterType((*TableIndex)(nil), "pb.TableIndex") + proto.RegisterType((*BlockOffset)(nil), "pb.BlockOffset") +} + +func init() { proto.RegisterFile("pb.proto", fileDescriptor_f80abaa17e25ccc8) } + +var fileDescriptor_f80abaa17e25ccc8 = []byte{ + // 485 bytes of a gzipped FileDescriptorProto + 0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0xff, 0x6c, 0x52, 0x5d, 0x6e, 0xda, 0x4c, + 0x14, 0xcd, 0x18, 0x62, 0xe0, 0x12, 0xf3, 0xf1, 0x8d, 0xaa, 
0xc8, 0xea, 0x0f, 0xb2, 0xdc, 0x3e, + 0x50, 0x29, 0xe2, 0x21, 0x5d, 0x01, 0x21, 0x54, 0xb2, 0x20, 0x42, 0x9a, 0x20, 0x5e, 0xd1, 0x18, + 0x5f, 0x1a, 0xcb, 0xbf, 0xf2, 0x0c, 0x16, 0xe9, 0x4a, 0xba, 0x81, 0xae, 0xa0, 0x8f, 0xdd, 0x40, + 0x1f, 0xbb, 0x84, 0x8a, 0x6e, 0xa4, 0x9a, 0xc1, 0x20, 0x50, 0xfb, 0x76, 0xcf, 0xb9, 0x77, 0xce, + 0x9c, 0x39, 0x77, 0xa0, 0x99, 0xfb, 0x83, 0xbc, 0xc8, 0x64, 0x46, 0x8d, 0xdc, 0x77, 0xbf, 0x11, + 0x30, 0x26, 0x0b, 0xda, 0x85, 0x5a, 0x84, 0xcf, 0x36, 0x71, 0x48, 0xff, 0x8a, 0xa9, 0x92, 0xbe, + 0x80, 0xcb, 0x92, 0xc7, 0x1b, 0xb4, 0x0d, 0xcd, 0xed, 0x01, 0x7d, 0x05, 0xad, 0x8d, 0xc0, 0x62, + 0x99, 0xa0, 0xe4, 0x76, 0x4d, 0x77, 0x9a, 0x8a, 0x78, 0x40, 0xc9, 0xa9, 0x0d, 0x8d, 0x12, 0x0b, + 0x11, 0x66, 0xa9, 0x5d, 0x77, 0x48, 0xbf, 0xce, 0x0e, 0x90, 0xbe, 0x01, 0xc0, 0x6d, 0x1e, 0x16, + 0x28, 0x96, 0x5c, 0xda, 0x97, 0xba, 0xd9, 0xaa, 0x98, 0xa1, 0xa4, 0x14, 0xea, 0x5a, 0xd0, 0xd4, + 0x82, 0xba, 0x56, 0x37, 0x09, 0x59, 0x20, 0x4f, 0x96, 0x61, 0x60, 0x83, 0x43, 0xfa, 0x16, 0x6b, + 0xee, 0x09, 0x2f, 0x70, 0x1d, 0x30, 0x27, 0x8b, 0x69, 0x28, 0x24, 0xbd, 0x06, 0x23, 0x2a, 0x6d, + 0xe2, 0xd4, 0xfa, 0xed, 0x5b, 0x73, 0x90, 0xfb, 0x83, 0xc9, 0x82, 0x19, 0x51, 0xe9, 0x0e, 0xe1, + 0xff, 0x07, 0x9e, 0x86, 0x6b, 0x14, 0x72, 0xf4, 0xc4, 0xd3, 0x4f, 0xf8, 0x88, 0x92, 0xde, 0x40, + 0x63, 0xa5, 0x81, 0xa8, 0x4e, 0x50, 0x75, 0xe2, 0x7c, 0x8e, 0x1d, 0x46, 0xdc, 0xaf, 0x04, 0x3a, + 0xe7, 0x3d, 0xda, 0x01, 0xc3, 0x0b, 0x74, 0x4a, 0x75, 0x66, 0x78, 0x01, 0xbd, 0x01, 0x63, 0x96, + 0xeb, 0x84, 0x3a, 0xb7, 0xaf, 0xff, 0xd6, 0x1a, 0xcc, 0x72, 0x2c, 0xb8, 0x0c, 0xb3, 0x94, 0x19, + 0xb3, 0x5c, 0x45, 0x3a, 0xc5, 0x12, 0x63, 0x1d, 0x9c, 0xc5, 0xf6, 0x80, 0xbe, 0x84, 0xe6, 0xe8, + 0x09, 0x57, 0x91, 0xd8, 0x24, 0x3a, 0xb6, 0x2b, 0x76, 0xc4, 0xee, 0x5b, 0x68, 0x1d, 0x25, 0x28, + 0x80, 0x39, 0x62, 0xe3, 0xe1, 0x7c, 0xdc, 0xbd, 0x50, 0xf5, 0xfd, 0x78, 0x3a, 0x9e, 0x8f, 0xbb, + 0xc4, 0xfd, 0x4e, 0x00, 0xe6, 0xdc, 0x8f, 0xd1, 0x4b, 0x03, 0xdc, 0xd2, 0xf7, 0xd0, 0xc8, 0xd6, + 
0x6b, 0x81, 0xf2, 0xf0, 0xc8, 0xff, 0x94, 0xb1, 0xbb, 0x38, 0x5b, 0x45, 0x33, 0xcd, 0xb3, 0x43, + 0x9f, 0x3a, 0xd0, 0xf6, 0xe3, 0x2c, 0x4b, 0x3e, 0x86, 0xb1, 0xc4, 0xa2, 0xda, 0xf4, 0x29, 0x45, + 0x7b, 0x00, 0x09, 0xdf, 0x2e, 0xaa, 0xad, 0xd6, 0xf4, 0xc3, 0x4f, 0x18, 0x65, 0x3e, 0xc2, 0xe7, + 0x51, 0xb6, 0x49, 0xa5, 0x36, 0x6f, 0xb1, 0x23, 0xa6, 0xef, 0xc0, 0x12, 0x92, 0xc7, 0x78, 0xcf, + 0x25, 0x7f, 0x0c, 0x3f, 0xa3, 0xde, 0xbb, 0xc5, 0xce, 0x49, 0xd7, 0x83, 0xf6, 0x89, 0xb7, 0x7f, + 0x7c, 0xc4, 0x6b, 0x30, 0xf7, 0x7e, 0xb5, 0x3f, 0x8b, 0x55, 0x48, 0x4d, 0xc6, 0x98, 0x56, 0x59, + 0xaa, 0xf2, 0xae, 0xfb, 0x63, 0xd7, 0x23, 0x3f, 0x77, 0x3d, 0xf2, 0x6b, 0xd7, 0x23, 0x5f, 0x7e, + 0xf7, 0x2e, 0x7c, 0x53, 0x7f, 0xf4, 0x0f, 0x7f, 0x02, 0x00, 0x00, 0xff, 0xff, 0x9e, 0xf4, 0xb3, + 0x68, 0xf4, 0x02, 0x00, 0x00, +} + +func (m *KV) Marshal() (dAtA []byte, err error) { + size := m.Size() + dAtA = make([]byte, size) + n, err := m.MarshalToSizedBuffer(dAtA[:size]) + if err != nil { + return nil, err + } + return dAtA[:n], nil +} + +func (m *KV) MarshalTo(dAtA []byte) (int, error) { + size := m.Size() + return m.MarshalToSizedBuffer(dAtA[:size]) +} + +func (m *KV) MarshalToSizedBuffer(dAtA []byte) (int, error) { + i := len(dAtA) + _ = i + var l int + _ = l + if m.XXX_unrecognized != nil { + i -= len(m.XXX_unrecognized) + copy(dAtA[i:], m.XXX_unrecognized) + } + if m.StreamId != 0 { + i = encodeVarintPb(dAtA, i, uint64(m.StreamId)) + i-- + dAtA[i] = 0x50 + } + if len(m.Meta) > 0 { + i -= len(m.Meta) + copy(dAtA[i:], m.Meta) + i = encodeVarintPb(dAtA, i, uint64(len(m.Meta))) + i-- + dAtA[i] = 0x32 + } + if m.ExpiresAt != 0 { + i = encodeVarintPb(dAtA, i, uint64(m.ExpiresAt)) + i-- + dAtA[i] = 0x28 + } + if m.Version != 0 { + i = encodeVarintPb(dAtA, i, uint64(m.Version)) + i-- + dAtA[i] = 0x20 + } + if len(m.UserMeta) > 0 { + i -= len(m.UserMeta) + copy(dAtA[i:], m.UserMeta) + i = encodeVarintPb(dAtA, i, uint64(len(m.UserMeta))) + i-- + dAtA[i] = 0x1a + } + if len(m.Value) > 0 { + 
i -= len(m.Value) + copy(dAtA[i:], m.Value) + i = encodeVarintPb(dAtA, i, uint64(len(m.Value))) + i-- + dAtA[i] = 0x12 + } + if len(m.Key) > 0 { + i -= len(m.Key) + copy(dAtA[i:], m.Key) + i = encodeVarintPb(dAtA, i, uint64(len(m.Key))) + i-- + dAtA[i] = 0xa + } + return len(dAtA) - i, nil +} + +func (m *KVList) Marshal() (dAtA []byte, err error) { + size := m.Size() + dAtA = make([]byte, size) + n, err := m.MarshalToSizedBuffer(dAtA[:size]) + if err != nil { + return nil, err + } + return dAtA[:n], nil +} + +func (m *KVList) MarshalTo(dAtA []byte) (int, error) { + size := m.Size() + return m.MarshalToSizedBuffer(dAtA[:size]) +} + +func (m *KVList) MarshalToSizedBuffer(dAtA []byte) (int, error) { + i := len(dAtA) + _ = i + var l int + _ = l + if m.XXX_unrecognized != nil { + i -= len(m.XXX_unrecognized) + copy(dAtA[i:], m.XXX_unrecognized) + } + if len(m.Kv) > 0 { + for iNdEx := len(m.Kv) - 1; iNdEx >= 0; iNdEx-- { + { + size, err := m.Kv[iNdEx].MarshalToSizedBuffer(dAtA[:i]) + if err != nil { + return 0, err + } + i -= size + i = encodeVarintPb(dAtA, i, uint64(size)) + } + i-- + dAtA[i] = 0xa + } + } + return len(dAtA) - i, nil +} + +func (m *ManifestChangeSet) Marshal() (dAtA []byte, err error) { + size := m.Size() + dAtA = make([]byte, size) + n, err := m.MarshalToSizedBuffer(dAtA[:size]) + if err != nil { + return nil, err + } + return dAtA[:n], nil +} + +func (m *ManifestChangeSet) MarshalTo(dAtA []byte) (int, error) { + size := m.Size() + return m.MarshalToSizedBuffer(dAtA[:size]) +} + +func (m *ManifestChangeSet) MarshalToSizedBuffer(dAtA []byte) (int, error) { + i := len(dAtA) + _ = i + var l int + _ = l + if m.XXX_unrecognized != nil { + i -= len(m.XXX_unrecognized) + copy(dAtA[i:], m.XXX_unrecognized) + } + if len(m.Changes) > 0 { + for iNdEx := len(m.Changes) - 1; iNdEx >= 0; iNdEx-- { + { + size, err := m.Changes[iNdEx].MarshalToSizedBuffer(dAtA[:i]) + if err != nil { + return 0, err + } + i -= size + i = encodeVarintPb(dAtA, i, uint64(size)) + } + i-- 
+ dAtA[i] = 0xa + } + } + return len(dAtA) - i, nil +} + +func (m *ManifestChange) Marshal() (dAtA []byte, err error) { + size := m.Size() + dAtA = make([]byte, size) + n, err := m.MarshalToSizedBuffer(dAtA[:size]) + if err != nil { + return nil, err + } + return dAtA[:n], nil +} + +func (m *ManifestChange) MarshalTo(dAtA []byte) (int, error) { + size := m.Size() + return m.MarshalToSizedBuffer(dAtA[:size]) +} + +func (m *ManifestChange) MarshalToSizedBuffer(dAtA []byte) (int, error) { + i := len(dAtA) + _ = i + var l int + _ = l + if m.XXX_unrecognized != nil { + i -= len(m.XXX_unrecognized) + copy(dAtA[i:], m.XXX_unrecognized) + } + if len(m.Checksum) > 0 { + i -= len(m.Checksum) + copy(dAtA[i:], m.Checksum) + i = encodeVarintPb(dAtA, i, uint64(len(m.Checksum))) + i-- + dAtA[i] = 0x22 + } + if m.Level != 0 { + i = encodeVarintPb(dAtA, i, uint64(m.Level)) + i-- + dAtA[i] = 0x18 + } + if m.Op != 0 { + i = encodeVarintPb(dAtA, i, uint64(m.Op)) + i-- + dAtA[i] = 0x10 + } + if m.Id != 0 { + i = encodeVarintPb(dAtA, i, uint64(m.Id)) + i-- + dAtA[i] = 0x8 + } + return len(dAtA) - i, nil +} + +func (m *TableIndex) Marshal() (dAtA []byte, err error) { + size := m.Size() + dAtA = make([]byte, size) + n, err := m.MarshalToSizedBuffer(dAtA[:size]) + if err != nil { + return nil, err + } + return dAtA[:n], nil +} + +func (m *TableIndex) MarshalTo(dAtA []byte) (int, error) { + size := m.Size() + return m.MarshalToSizedBuffer(dAtA[:size]) +} + +func (m *TableIndex) MarshalToSizedBuffer(dAtA []byte) (int, error) { + i := len(dAtA) + _ = i + var l int + _ = l + if m.XXX_unrecognized != nil { + i -= len(m.XXX_unrecognized) + copy(dAtA[i:], m.XXX_unrecognized) + } + if m.StaleDataSize != 0 { + i = encodeVarintPb(dAtA, i, uint64(m.StaleDataSize)) + i-- + dAtA[i] = 0x28 + } + if m.KeyCount != 0 { + i = encodeVarintPb(dAtA, i, uint64(m.KeyCount)) + i-- + dAtA[i] = 0x20 + } + if m.MaxVersion != 0 { + i = encodeVarintPb(dAtA, i, uint64(m.MaxVersion)) + i-- + dAtA[i] = 0x18 + } + if 
len(m.BloomFilter) > 0 { + i -= len(m.BloomFilter) + copy(dAtA[i:], m.BloomFilter) + i = encodeVarintPb(dAtA, i, uint64(len(m.BloomFilter))) + i-- + dAtA[i] = 0x12 + } + if len(m.Offsets) > 0 { + for iNdEx := len(m.Offsets) - 1; iNdEx >= 0; iNdEx-- { + { + size, err := m.Offsets[iNdEx].MarshalToSizedBuffer(dAtA[:i]) + if err != nil { + return 0, err + } + i -= size + i = encodeVarintPb(dAtA, i, uint64(size)) + } + i-- + dAtA[i] = 0xa + } + } + return len(dAtA) - i, nil +} + +func (m *BlockOffset) Marshal() (dAtA []byte, err error) { + size := m.Size() + dAtA = make([]byte, size) + n, err := m.MarshalToSizedBuffer(dAtA[:size]) + if err != nil { + return nil, err + } + return dAtA[:n], nil +} + +func (m *BlockOffset) MarshalTo(dAtA []byte) (int, error) { + size := m.Size() + return m.MarshalToSizedBuffer(dAtA[:size]) +} + +func (m *BlockOffset) MarshalToSizedBuffer(dAtA []byte) (int, error) { + i := len(dAtA) + _ = i + var l int + _ = l + if m.XXX_unrecognized != nil { + i -= len(m.XXX_unrecognized) + copy(dAtA[i:], m.XXX_unrecognized) + } + if m.Len != 0 { + i = encodeVarintPb(dAtA, i, uint64(m.Len)) + i-- + dAtA[i] = 0x18 + } + if m.Offset != 0 { + i = encodeVarintPb(dAtA, i, uint64(m.Offset)) + i-- + dAtA[i] = 0x10 + } + if len(m.Key) > 0 { + i -= len(m.Key) + copy(dAtA[i:], m.Key) + i = encodeVarintPb(dAtA, i, uint64(len(m.Key))) + i-- + dAtA[i] = 0xa + } + return len(dAtA) - i, nil +} + +func encodeVarintPb(dAtA []byte, offset int, v uint64) int { + offset -= sovPb(v) + base := offset + for v >= 1<<7 { + dAtA[offset] = uint8(v&0x7f | 0x80) + v >>= 7 + offset++ + } + dAtA[offset] = uint8(v) + return base +} +func (m *KV) Size() (n int) { + if m == nil { + return 0 + } + var l int + _ = l + l = len(m.Key) + if l > 0 { + n += 1 + l + sovPb(uint64(l)) + } + l = len(m.Value) + if l > 0 { + n += 1 + l + sovPb(uint64(l)) + } + l = len(m.UserMeta) + if l > 0 { + n += 1 + l + sovPb(uint64(l)) + } + if m.Version != 0 { + n += 1 + sovPb(uint64(m.Version)) + } + if 
m.ExpiresAt != 0 { + n += 1 + sovPb(uint64(m.ExpiresAt)) + } + l = len(m.Meta) + if l > 0 { + n += 1 + l + sovPb(uint64(l)) + } + if m.StreamId != 0 { + n += 1 + sovPb(uint64(m.StreamId)) + } + if m.XXX_unrecognized != nil { + n += len(m.XXX_unrecognized) + } + return n +} + +func (m *KVList) Size() (n int) { + if m == nil { + return 0 + } + var l int + _ = l + if len(m.Kv) > 0 { + for _, e := range m.Kv { + l = e.Size() + n += 1 + l + sovPb(uint64(l)) + } + } + if m.XXX_unrecognized != nil { + n += len(m.XXX_unrecognized) + } + return n +} + +func (m *ManifestChangeSet) Size() (n int) { + if m == nil { + return 0 + } + var l int + _ = l + if len(m.Changes) > 0 { + for _, e := range m.Changes { + l = e.Size() + n += 1 + l + sovPb(uint64(l)) + } + } + if m.XXX_unrecognized != nil { + n += len(m.XXX_unrecognized) + } + return n +} + +func (m *ManifestChange) Size() (n int) { + if m == nil { + return 0 + } + var l int + _ = l + if m.Id != 0 { + n += 1 + sovPb(uint64(m.Id)) + } + if m.Op != 0 { + n += 1 + sovPb(uint64(m.Op)) + } + if m.Level != 0 { + n += 1 + sovPb(uint64(m.Level)) + } + l = len(m.Checksum) + if l > 0 { + n += 1 + l + sovPb(uint64(l)) + } + if m.XXX_unrecognized != nil { + n += len(m.XXX_unrecognized) + } + return n +} + +func (m *TableIndex) Size() (n int) { + if m == nil { + return 0 + } + var l int + _ = l + if len(m.Offsets) > 0 { + for _, e := range m.Offsets { + l = e.Size() + n += 1 + l + sovPb(uint64(l)) + } + } + l = len(m.BloomFilter) + if l > 0 { + n += 1 + l + sovPb(uint64(l)) + } + if m.MaxVersion != 0 { + n += 1 + sovPb(uint64(m.MaxVersion)) + } + if m.KeyCount != 0 { + n += 1 + sovPb(uint64(m.KeyCount)) + } + if m.StaleDataSize != 0 { + n += 1 + sovPb(uint64(m.StaleDataSize)) + } + if m.XXX_unrecognized != nil { + n += len(m.XXX_unrecognized) + } + return n +} + +func (m *BlockOffset) Size() (n int) { + if m == nil { + return 0 + } + var l int + _ = l + l = len(m.Key) + if l > 0 { + n += 1 + l + sovPb(uint64(l)) + } + if m.Offset != 0 { 
+ n += 1 + sovPb(uint64(m.Offset)) + } + if m.Len != 0 { + n += 1 + sovPb(uint64(m.Len)) + } + if m.XXX_unrecognized != nil { + n += len(m.XXX_unrecognized) + } + return n +} + +func sovPb(x uint64) (n int) { + return (math_bits.Len64(x|1) + 6) / 7 +} +func sozPb(x uint64) (n int) { + return sovPb(uint64((x << 1) ^ uint64((int64(x) >> 63)))) +} +func (m *KV) Unmarshal(dAtA []byte) error { + l := len(dAtA) + iNdEx := 0 + for iNdEx < l { + preIndex := iNdEx + var wire uint64 + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowPb + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + wire |= uint64(b&0x7F) << shift + if b < 0x80 { + break + } + } + fieldNum := int32(wire >> 3) + wireType := int(wire & 0x7) + if wireType == 4 { + return fmt.Errorf("proto: KV: wiretype end group for non-group") + } + if fieldNum <= 0 { + return fmt.Errorf("proto: KV: illegal tag %d (wire type %d)", fieldNum, wire) + } + switch fieldNum { + case 1: + if wireType != 2 { + return fmt.Errorf("proto: wrong wireType = %d for field Key", wireType) + } + var byteLen int + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowPb + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + byteLen |= int(b&0x7F) << shift + if b < 0x80 { + break + } + } + if byteLen < 0 { + return ErrInvalidLengthPb + } + postIndex := iNdEx + byteLen + if postIndex < 0 { + return ErrInvalidLengthPb + } + if postIndex > l { + return io.ErrUnexpectedEOF + } + m.Key = append(m.Key[:0], dAtA[iNdEx:postIndex]...) 
+ if m.Key == nil { + m.Key = []byte{} + } + iNdEx = postIndex + case 2: + if wireType != 2 { + return fmt.Errorf("proto: wrong wireType = %d for field Value", wireType) + } + var byteLen int + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowPb + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + byteLen |= int(b&0x7F) << shift + if b < 0x80 { + break + } + } + if byteLen < 0 { + return ErrInvalidLengthPb + } + postIndex := iNdEx + byteLen + if postIndex < 0 { + return ErrInvalidLengthPb + } + if postIndex > l { + return io.ErrUnexpectedEOF + } + m.Value = append(m.Value[:0], dAtA[iNdEx:postIndex]...) + if m.Value == nil { + m.Value = []byte{} + } + iNdEx = postIndex + case 3: + if wireType != 2 { + return fmt.Errorf("proto: wrong wireType = %d for field UserMeta", wireType) + } + var byteLen int + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowPb + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + byteLen |= int(b&0x7F) << shift + if b < 0x80 { + break + } + } + if byteLen < 0 { + return ErrInvalidLengthPb + } + postIndex := iNdEx + byteLen + if postIndex < 0 { + return ErrInvalidLengthPb + } + if postIndex > l { + return io.ErrUnexpectedEOF + } + m.UserMeta = append(m.UserMeta[:0], dAtA[iNdEx:postIndex]...) 
+ if m.UserMeta == nil { + m.UserMeta = []byte{} + } + iNdEx = postIndex + case 4: + if wireType != 0 { + return fmt.Errorf("proto: wrong wireType = %d for field Version", wireType) + } + m.Version = 0 + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowPb + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + m.Version |= uint64(b&0x7F) << shift + if b < 0x80 { + break + } + } + case 5: + if wireType != 0 { + return fmt.Errorf("proto: wrong wireType = %d for field ExpiresAt", wireType) + } + m.ExpiresAt = 0 + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowPb + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + m.ExpiresAt |= uint64(b&0x7F) << shift + if b < 0x80 { + break + } + } + case 6: + if wireType != 2 { + return fmt.Errorf("proto: wrong wireType = %d for field Meta", wireType) + } + var byteLen int + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowPb + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + byteLen |= int(b&0x7F) << shift + if b < 0x80 { + break + } + } + if byteLen < 0 { + return ErrInvalidLengthPb + } + postIndex := iNdEx + byteLen + if postIndex < 0 { + return ErrInvalidLengthPb + } + if postIndex > l { + return io.ErrUnexpectedEOF + } + m.Meta = append(m.Meta[:0], dAtA[iNdEx:postIndex]...) 
+ if m.Meta == nil { + m.Meta = []byte{} + } + iNdEx = postIndex + case 10: + if wireType != 0 { + return fmt.Errorf("proto: wrong wireType = %d for field StreamId", wireType) + } + m.StreamId = 0 + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowPb + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + m.StreamId |= uint32(b&0x7F) << shift + if b < 0x80 { + break + } + } + default: + iNdEx = preIndex + skippy, err := skipPb(dAtA[iNdEx:]) + if err != nil { + return err + } + if (skippy < 0) || (iNdEx+skippy) < 0 { + return ErrInvalidLengthPb + } + if (iNdEx + skippy) > l { + return io.ErrUnexpectedEOF + } + m.XXX_unrecognized = append(m.XXX_unrecognized, dAtA[iNdEx:iNdEx+skippy]...) + iNdEx += skippy + } + } + + if iNdEx > l { + return io.ErrUnexpectedEOF + } + return nil +} +func (m *KVList) Unmarshal(dAtA []byte) error { + l := len(dAtA) + iNdEx := 0 + for iNdEx < l { + preIndex := iNdEx + var wire uint64 + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowPb + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + wire |= uint64(b&0x7F) << shift + if b < 0x80 { + break + } + } + fieldNum := int32(wire >> 3) + wireType := int(wire & 0x7) + if wireType == 4 { + return fmt.Errorf("proto: KVList: wiretype end group for non-group") + } + if fieldNum <= 0 { + return fmt.Errorf("proto: KVList: illegal tag %d (wire type %d)", fieldNum, wire) + } + switch fieldNum { + case 1: + if wireType != 2 { + return fmt.Errorf("proto: wrong wireType = %d for field Kv", wireType) + } + var msglen int + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowPb + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + msglen |= int(b&0x7F) << shift + if b < 0x80 { + break + } + } + if msglen < 0 { + return ErrInvalidLengthPb + } + postIndex := iNdEx + msglen + if postIndex < 0 { + return ErrInvalidLengthPb + } + if 
postIndex > l { + return io.ErrUnexpectedEOF + } + m.Kv = append(m.Kv, &KV{}) + if err := m.Kv[len(m.Kv)-1].Unmarshal(dAtA[iNdEx:postIndex]); err != nil { + return err + } + iNdEx = postIndex + default: + iNdEx = preIndex + skippy, err := skipPb(dAtA[iNdEx:]) + if err != nil { + return err + } + if (skippy < 0) || (iNdEx+skippy) < 0 { + return ErrInvalidLengthPb + } + if (iNdEx + skippy) > l { + return io.ErrUnexpectedEOF + } + m.XXX_unrecognized = append(m.XXX_unrecognized, dAtA[iNdEx:iNdEx+skippy]...) + iNdEx += skippy + } + } + + if iNdEx > l { + return io.ErrUnexpectedEOF + } + return nil +} +func (m *ManifestChangeSet) Unmarshal(dAtA []byte) error { + l := len(dAtA) + iNdEx := 0 + for iNdEx < l { + preIndex := iNdEx + var wire uint64 + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowPb + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + wire |= uint64(b&0x7F) << shift + if b < 0x80 { + break + } + } + fieldNum := int32(wire >> 3) + wireType := int(wire & 0x7) + if wireType == 4 { + return fmt.Errorf("proto: ManifestChangeSet: wiretype end group for non-group") + } + if fieldNum <= 0 { + return fmt.Errorf("proto: ManifestChangeSet: illegal tag %d (wire type %d)", fieldNum, wire) + } + switch fieldNum { + case 1: + if wireType != 2 { + return fmt.Errorf("proto: wrong wireType = %d for field Changes", wireType) + } + var msglen int + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowPb + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + msglen |= int(b&0x7F) << shift + if b < 0x80 { + break + } + } + if msglen < 0 { + return ErrInvalidLengthPb + } + postIndex := iNdEx + msglen + if postIndex < 0 { + return ErrInvalidLengthPb + } + if postIndex > l { + return io.ErrUnexpectedEOF + } + m.Changes = append(m.Changes, &ManifestChange{}) + if err := m.Changes[len(m.Changes)-1].Unmarshal(dAtA[iNdEx:postIndex]); err != nil { + return err + } + 
iNdEx = postIndex + default: + iNdEx = preIndex + skippy, err := skipPb(dAtA[iNdEx:]) + if err != nil { + return err + } + if (skippy < 0) || (iNdEx+skippy) < 0 { + return ErrInvalidLengthPb + } + if (iNdEx + skippy) > l { + return io.ErrUnexpectedEOF + } + m.XXX_unrecognized = append(m.XXX_unrecognized, dAtA[iNdEx:iNdEx+skippy]...) + iNdEx += skippy + } + } + + if iNdEx > l { + return io.ErrUnexpectedEOF + } + return nil +} +func (m *ManifestChange) Unmarshal(dAtA []byte) error { + l := len(dAtA) + iNdEx := 0 + for iNdEx < l { + preIndex := iNdEx + var wire uint64 + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowPb + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + wire |= uint64(b&0x7F) << shift + if b < 0x80 { + break + } + } + fieldNum := int32(wire >> 3) + wireType := int(wire & 0x7) + if wireType == 4 { + return fmt.Errorf("proto: ManifestChange: wiretype end group for non-group") + } + if fieldNum <= 0 { + return fmt.Errorf("proto: ManifestChange: illegal tag %d (wire type %d)", fieldNum, wire) + } + switch fieldNum { + case 1: + if wireType != 0 { + return fmt.Errorf("proto: wrong wireType = %d for field Id", wireType) + } + m.Id = 0 + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowPb + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + m.Id |= uint64(b&0x7F) << shift + if b < 0x80 { + break + } + } + case 2: + if wireType != 0 { + return fmt.Errorf("proto: wrong wireType = %d for field Op", wireType) + } + m.Op = 0 + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowPb + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + m.Op |= ManifestChange_Operation(b&0x7F) << shift + if b < 0x80 { + break + } + } + case 3: + if wireType != 0 { + return fmt.Errorf("proto: wrong wireType = %d for field Level", wireType) + } + m.Level = 0 + for shift := uint(0); ; shift += 7 { + 
if shift >= 64 { + return ErrIntOverflowPb + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + m.Level |= uint32(b&0x7F) << shift + if b < 0x80 { + break + } + } + case 4: + if wireType != 2 { + return fmt.Errorf("proto: wrong wireType = %d for field Checksum", wireType) + } + var byteLen int + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowPb + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + byteLen |= int(b&0x7F) << shift + if b < 0x80 { + break + } + } + if byteLen < 0 { + return ErrInvalidLengthPb + } + postIndex := iNdEx + byteLen + if postIndex < 0 { + return ErrInvalidLengthPb + } + if postIndex > l { + return io.ErrUnexpectedEOF + } + m.Checksum = append(m.Checksum[:0], dAtA[iNdEx:postIndex]...) + if m.Checksum == nil { + m.Checksum = []byte{} + } + iNdEx = postIndex + default: + iNdEx = preIndex + skippy, err := skipPb(dAtA[iNdEx:]) + if err != nil { + return err + } + if (skippy < 0) || (iNdEx+skippy) < 0 { + return ErrInvalidLengthPb + } + if (iNdEx + skippy) > l { + return io.ErrUnexpectedEOF + } + m.XXX_unrecognized = append(m.XXX_unrecognized, dAtA[iNdEx:iNdEx+skippy]...) 
+ iNdEx += skippy + } + } + + if iNdEx > l { + return io.ErrUnexpectedEOF + } + return nil +} +func (m *TableIndex) Unmarshal(dAtA []byte) error { + l := len(dAtA) + iNdEx := 0 + for iNdEx < l { + preIndex := iNdEx + var wire uint64 + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowPb + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + wire |= uint64(b&0x7F) << shift + if b < 0x80 { + break + } + } + fieldNum := int32(wire >> 3) + wireType := int(wire & 0x7) + if wireType == 4 { + return fmt.Errorf("proto: TableIndex: wiretype end group for non-group") + } + if fieldNum <= 0 { + return fmt.Errorf("proto: TableIndex: illegal tag %d (wire type %d)", fieldNum, wire) + } + switch fieldNum { + case 1: + if wireType != 2 { + return fmt.Errorf("proto: wrong wireType = %d for field Offsets", wireType) + } + var msglen int + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowPb + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + msglen |= int(b&0x7F) << shift + if b < 0x80 { + break + } + } + if msglen < 0 { + return ErrInvalidLengthPb + } + postIndex := iNdEx + msglen + if postIndex < 0 { + return ErrInvalidLengthPb + } + if postIndex > l { + return io.ErrUnexpectedEOF + } + m.Offsets = append(m.Offsets, &BlockOffset{}) + if err := m.Offsets[len(m.Offsets)-1].Unmarshal(dAtA[iNdEx:postIndex]); err != nil { + return err + } + iNdEx = postIndex + case 2: + if wireType != 2 { + return fmt.Errorf("proto: wrong wireType = %d for field BloomFilter", wireType) + } + var byteLen int + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowPb + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + byteLen |= int(b&0x7F) << shift + if b < 0x80 { + break + } + } + if byteLen < 0 { + return ErrInvalidLengthPb + } + postIndex := iNdEx + byteLen + if postIndex < 0 { + return ErrInvalidLengthPb + } + if 
postIndex > l { + return io.ErrUnexpectedEOF + } + m.BloomFilter = append(m.BloomFilter[:0], dAtA[iNdEx:postIndex]...) + if m.BloomFilter == nil { + m.BloomFilter = []byte{} + } + iNdEx = postIndex + case 3: + if wireType != 0 { + return fmt.Errorf("proto: wrong wireType = %d for field MaxVersion", wireType) + } + m.MaxVersion = 0 + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowPb + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + m.MaxVersion |= uint64(b&0x7F) << shift + if b < 0x80 { + break + } + } + case 4: + if wireType != 0 { + return fmt.Errorf("proto: wrong wireType = %d for field KeyCount", wireType) + } + m.KeyCount = 0 + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowPb + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + m.KeyCount |= uint32(b&0x7F) << shift + if b < 0x80 { + break + } + } + case 5: + if wireType != 0 { + return fmt.Errorf("proto: wrong wireType = %d for field StaleDataSize", wireType) + } + m.StaleDataSize = 0 + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowPb + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + m.StaleDataSize |= uint32(b&0x7F) << shift + if b < 0x80 { + break + } + } + default: + iNdEx = preIndex + skippy, err := skipPb(dAtA[iNdEx:]) + if err != nil { + return err + } + if (skippy < 0) || (iNdEx+skippy) < 0 { + return ErrInvalidLengthPb + } + if (iNdEx + skippy) > l { + return io.ErrUnexpectedEOF + } + m.XXX_unrecognized = append(m.XXX_unrecognized, dAtA[iNdEx:iNdEx+skippy]...) 
+ iNdEx += skippy + } + } + + if iNdEx > l { + return io.ErrUnexpectedEOF + } + return nil +} +func (m *BlockOffset) Unmarshal(dAtA []byte) error { + l := len(dAtA) + iNdEx := 0 + for iNdEx < l { + preIndex := iNdEx + var wire uint64 + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowPb + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + wire |= uint64(b&0x7F) << shift + if b < 0x80 { + break + } + } + fieldNum := int32(wire >> 3) + wireType := int(wire & 0x7) + if wireType == 4 { + return fmt.Errorf("proto: BlockOffset: wiretype end group for non-group") + } + if fieldNum <= 0 { + return fmt.Errorf("proto: BlockOffset: illegal tag %d (wire type %d)", fieldNum, wire) + } + switch fieldNum { + case 1: + if wireType != 2 { + return fmt.Errorf("proto: wrong wireType = %d for field Key", wireType) + } + var byteLen int + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowPb + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + byteLen |= int(b&0x7F) << shift + if b < 0x80 { + break + } + } + if byteLen < 0 { + return ErrInvalidLengthPb + } + postIndex := iNdEx + byteLen + if postIndex < 0 { + return ErrInvalidLengthPb + } + if postIndex > l { + return io.ErrUnexpectedEOF + } + m.Key = append(m.Key[:0], dAtA[iNdEx:postIndex]...) 
+ if m.Key == nil { + m.Key = []byte{} + } + iNdEx = postIndex + case 2: + if wireType != 0 { + return fmt.Errorf("proto: wrong wireType = %d for field Offset", wireType) + } + m.Offset = 0 + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowPb + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + m.Offset |= uint32(b&0x7F) << shift + if b < 0x80 { + break + } + } + case 3: + if wireType != 0 { + return fmt.Errorf("proto: wrong wireType = %d for field Len", wireType) + } + m.Len = 0 + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowPb + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + m.Len |= uint32(b&0x7F) << shift + if b < 0x80 { + break + } + } + default: + iNdEx = preIndex + skippy, err := skipPb(dAtA[iNdEx:]) + if err != nil { + return err + } + if (skippy < 0) || (iNdEx+skippy) < 0 { + return ErrInvalidLengthPb + } + if (iNdEx + skippy) > l { + return io.ErrUnexpectedEOF + } + m.XXX_unrecognized = append(m.XXX_unrecognized, dAtA[iNdEx:iNdEx+skippy]...) 
+ iNdEx += skippy + } + } + + if iNdEx > l { + return io.ErrUnexpectedEOF + } + return nil +} +func skipPb(dAtA []byte) (n int, err error) { + l := len(dAtA) + iNdEx := 0 + depth := 0 + for iNdEx < l { + var wire uint64 + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return 0, ErrIntOverflowPb + } + if iNdEx >= l { + return 0, io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + wire |= (uint64(b) & 0x7F) << shift + if b < 0x80 { + break + } + } + wireType := int(wire & 0x7) + switch wireType { + case 0: + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return 0, ErrIntOverflowPb + } + if iNdEx >= l { + return 0, io.ErrUnexpectedEOF + } + iNdEx++ + if dAtA[iNdEx-1] < 0x80 { + break + } + } + case 1: + iNdEx += 8 + case 2: + var length int + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return 0, ErrIntOverflowPb + } + if iNdEx >= l { + return 0, io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + length |= (int(b) & 0x7F) << shift + if b < 0x80 { + break + } + } + if length < 0 { + return 0, ErrInvalidLengthPb + } + iNdEx += length + case 3: + depth++ + case 4: + if depth == 0 { + return 0, ErrUnexpectedEndOfGroupPb + } + depth-- + case 5: + iNdEx += 4 + default: + return 0, fmt.Errorf("proto: illegal wireType %d", wireType) + } + if iNdEx < 0 { + return 0, ErrInvalidLengthPb + } + if depth == 0 { + return iNdEx, nil + } + } + return 0, io.ErrUnexpectedEOF +} + +var ( + ErrInvalidLengthPb = fmt.Errorf("proto: negative length found during unmarshaling") + ErrIntOverflowPb = fmt.Errorf("proto: integer overflow") + ErrUnexpectedEndOfGroupPb = fmt.Errorf("proto: unexpected end of group") +) diff --git a/pb/pb.proto b/pb/pb.proto new file mode 100644 index 0000000..63c9408 --- /dev/null +++ b/pb/pb.proto @@ -0,0 +1,64 @@ +/* + * Copyright hardcore-os Project Authors + * + * Licensed under the Apache License, Version 2.0 (the "License") + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +// Use protos/gen.sh to generate .pb.go files. +syntax = "proto3"; + +package pb; + +message KV { + bytes key = 1; + bytes value = 2; + bytes user_meta = 3; + uint64 version = 4; + uint64 expires_at = 5; + bytes meta = 6; + + // Stream id is used to identify which stream the KV came from. + uint32 stream_id = 10; +} + +message KVList { + repeated KV kv = 1; +} + +message ManifestChangeSet { + // A set of changes that are applied atomically. + repeated ManifestChange changes = 1; +} + +message ManifestChange { + uint64 Id = 1; + enum Operation { + CREATE = 0; + DELETE = 1; + } + Operation Op = 2; + uint32 Level = 3; // Only used for CREATE + bytes Checksum = 4; // Only used for CREATE +} +message TableIndex{ + repeated BlockOffset offsets = 1; + bytes bloomFilter = 2; + uint64 maxVersion = 3; + uint32 keyCount = 4; + uint32 staleDataSize = 5; +} + +message BlockOffset{ + bytes key = 1; + uint32 offset = 2; + uint32 len = 3; +} \ No newline at end of file diff --git a/stats.go b/stats.go index 5c81362..2d935a2 100644 --- a/stats.go +++ b/stats.go @@ -1,3 +1,17 @@ +// Copyright 2021 logicrec Project Authors +// +// Licensed under the Apache License, Version 2.0 (the "License") +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + package corekv import "github.com/hardcore-os/corekv/utils" @@ -17,7 +31,8 @@ func (s *Stats) StartStats() { defer s.closer.Done() for { select { - case <-s.closer.Wait(): + case <-s.closer.CloseSignal: + return } // stats logic... } @@ -26,7 +41,7 @@ func (s *Stats) StartStats() { // NewStats func newStats(opt *Options) *Stats { s := &Stats{} - s.closer = utils.NewCloser(1) - s.EntryNum = 1 // 这里直接写1 + s.closer = utils.NewCloser() + s.EntryNum = 1 // 这里直接写 return s } diff --git a/utils/arena.go b/utils/arena.go new file mode 100644 index 0000000..48d357d --- /dev/null +++ b/utils/arena.go @@ -0,0 +1,158 @@ +/* + * Copyright 2017 Dgraph Labs, Inc. and Contributors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package utils + +import ( + "github.com/pkg/errors" + "log" + "sync/atomic" + "unsafe" +) + +const ( + offsetSize = int(unsafe.Sizeof(uint32(0))) + + // Always align nodes on 64-bit boundaries, even on 32-bit architectures, + // so that the node.value field is 64-bit aligned. 
This is necessary because + // node.getValueOffset uses atomic.LoadUint64, which expects its input + // pointer to be 64-bit aligned. + nodeAlign = int(unsafe.Sizeof(uint64(0))) - 1 + + MaxNodeSize = int(unsafe.Sizeof(node{})) +) + +// Arena should be lock-free. +type Arena struct { + n uint32 + shouldGrow bool + buf []byte +} + +// newArena returns a new arena. +func newArena(n int64) *Arena { + // Don't store data at position 0 in order to reserve offset=0 as a kind + // of nil pointer. + out := &Arena{ + n: 1, + buf: make([]byte, n), + } + return out +} + +func (s *Arena) allocate(sz uint32) uint32 { + offset := atomic.AddUint32(&s.n, sz) + if !s.shouldGrow { + AssertTrue(int(offset) <= len(s.buf)) + return offset - sz + } + + // We are keeping extra bytes in the end so that the checkptr doesn't fail. We apply some + // intelligence to reduce the size of the node by only keeping towers upto valid height and not + // maxHeight. This reduces the node's size, but checkptr doesn't know about its reduced size. + // checkptr tries to verify that the node of size MaxNodeSize resides on a single heap + // allocation which causes this error: checkptr:converted pointer straddles multiple allocations + if int(offset) > len(s.buf)-MaxNodeSize { + growBy := uint32(len(s.buf)) + if growBy > 1<<30 { + growBy = 1 << 30 + } + if growBy < sz { + growBy = sz + } + newBuf := make([]byte, len(s.buf)+int(growBy)) + AssertTrue(len(s.buf) == copy(newBuf, s.buf)) + s.buf = newBuf + } + return offset - sz +} + +func (s *Arena) size() int64 { + return int64(atomic.LoadUint32(&s.n)) +} + +// putNode allocates a node in the arena. The node is aligned on a pointer-sized +// boundary. The arena offset of the node is returned. +func (s *Arena) putNode(height int) uint32 { + // Compute the amount of the tower that will never be used, since the height + // is less than maxHeight. 
+ unusedSize := (maxHeight - height) * offsetSize + + // Pad the allocation with enough bytes to ensure pointer alignment. + l := uint32(MaxNodeSize - unusedSize + nodeAlign) + n := s.allocate(l) + + // Return the aligned offset. + m := (n + uint32(nodeAlign)) & ^uint32(nodeAlign) + return m +} + +// Put will *copy* val into arena. To make better use of this, reuse your input +// val buffer. Returns an offset into buf. User is responsible for remembering +// size of val. We could also store this size inside arena but the encoding and +// decoding will incur some overhead. +func (s *Arena) putVal(v ValueStruct) uint32 { + l := uint32(v.EncodedSize()) + offset := s.allocate(l) + v.EncodeValue(s.buf[offset:]) + return offset +} + +func (s *Arena) putKey(key []byte) uint32 { + keySz := uint32(len(key)) + offset := s.allocate(keySz) + buf := s.buf[offset : offset+keySz] + AssertTrue(len(key) == copy(buf, key)) + return offset +} + +// getNode returns a pointer to the node located at offset. If the offset is +// zero, then the nil node pointer is returned. +func (s *Arena) getNode(offset uint32) *node { + if offset == 0 { + return nil + } + return (*node)(unsafe.Pointer(&s.buf[offset])) +} + +// getKey returns byte slice at offset. +func (s *Arena) getKey(offset uint32, size uint16) []byte { + return s.buf[offset : offset+uint32(size)] +} + +// getVal returns byte slice at offset. The given size should be just the value +// size and should NOT include the meta bytes. +func (s *Arena) getVal(offset uint32, size uint32) (ret ValueStruct) { + ret.DecodeValue(s.buf[offset : offset+size]) + return +} + +// getNodeOffset returns the offset of node in the arena. If the node pointer is +// nil, then the zero offset is returned. +func (s *Arena) getNodeOffset(nd *node) uint32 { + if nd == nil { + return 0 + } + + return uint32(uintptr(unsafe.Pointer(nd)) - uintptr(unsafe.Pointer(&s.buf[0]))) +} + +// AssertTrue asserts that b is true. Otherwise, it would log fatal. 
+func AssertTrue(b bool) { + if !b { + log.Fatalf("%+v", errors.Errorf("Assert failed")) + } +} diff --git a/utils/bloom.go b/utils/bloom.go new file mode 100644 index 0000000..8ffb9c8 --- /dev/null +++ b/utils/bloom.go @@ -0,0 +1,131 @@ +// Copyright 2021 hardcore-os Project Authors +// +// Licensed under the Apache License, Version 2.0 (the "License") +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package utils + +import "math" + +// Filter is an encoded set of []byte keys. +type Filter []byte + +// MayContainKey _ +func (f Filter) MayContainKey(k []byte) bool { + return f.MayContain(Hash(k)) +} + +// MayContain returns whether the filter may contain given key. False positives +// are possible, where it returns true for keys not in the original set. +func (f Filter) MayContain(h uint32) bool { + if len(f) < 2 { + return false + } + k := f[len(f)-1] + if k > 30 { + // This is reserved for potentially new encodings for short Bloom filters. + // Consider it a match. + return true + } + nBits := uint32(8 * (len(f) - 1)) + delta := h>>17 | h<<15 + for j := uint8(0); j < k; j++ { + bitPos := h % nBits + if f[bitPos/8]&(1<<(bitPos%8)) == 0 { + return false + } + h += delta + } + return true +} + +// NewFilter returns a new Bloom filter that encodes a set of []byte keys with +// the given number of bits per key, approximately. +// +// A good bitsPerKey value is 10, which yields a filter with ~ 1% false +// positive rate. 
+func NewFilter(keys []uint32, bitsPerKey int) Filter { + return Filter(appendFilter(keys, bitsPerKey)) +} + +// BloomBitsPerKey returns the bits per key required by bloomfilter based on +// the false positive rate. +func BloomBitsPerKey(numEntries int, fp float64) int { + size := -1 * float64(numEntries) * math.Log(fp) / math.Pow(float64(0.69314718056), 2) + locs := math.Ceil(size / float64(numEntries)) + return int(locs) +} + +func appendFilter(keys []uint32, bitsPerKey int) []byte { + if bitsPerKey < 0 { + bitsPerKey = 0 + } + // 0.69 is approximately ln(2). + k := uint32(float64(bitsPerKey) * 0.69) + if k < 1 { + k = 1 + } + if k > 30 { + k = 30 + } + + nBits := len(keys) * int(bitsPerKey) + // For small len(keys), we can see a very high false positive rate. Fix it + // by enforcing a minimum bloom filter length. + if nBits < 64 { + nBits = 64 + } + nBytes := (nBits + 7) / 8 + nBits = nBytes * 8 + filter := make([]byte, nBytes+1) + + for _, h := range keys { + delta := h>>17 | h<<15 + for j := uint32(0); j < k; j++ { + bitPos := h % uint32(nBits) + filter[bitPos/8] |= 1 << (bitPos % 8) + h += delta + } + } + + //record the K value of this Bloom Filter + filter[nBytes] = uint8(k) + + return filter +} + +// Hash implements a hashing algorithm similar to the Murmur hash. 
+func Hash(b []byte) uint32 { + const ( + seed = 0xbc9f1d34 + m = 0xc6a4a793 + ) + h := uint32(seed) ^ uint32(len(b))*m + for ; len(b) >= 4; b = b[4:] { + h += uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24 + h *= m + h ^= h >> 16 + } + switch len(b) { + case 3: + h += uint32(b[2]) << 16 + fallthrough + case 2: + h += uint32(b[1]) << 8 + fallthrough + case 1: + h += uint32(b[0]) + h *= m + h ^= h >> 24 + } + return h +} diff --git a/utils/bloom_test.go b/utils/bloom_test.go new file mode 100644 index 0000000..339affe --- /dev/null +++ b/utils/bloom_test.go @@ -0,0 +1,156 @@ +// Copyright 2021 hardcore-os Project Authors +// +// Licensed under the Apache License, Version 2.0 (the "License") +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +package utils + +import ( + "testing" +) + +func (f Filter) String() string { + s := make([]byte, 8*len(f)) + for i, x := range f { + for j := 0; j < 8; j++ { + if x&(1<> 0) + b[1] = uint8(uint32(i) >> 8) + b[2] = uint8(uint32(i) >> 16) + b[3] = uint8(uint32(i) >> 24) + return b + } + + nMediocreFilters, nGoodFilters := 0, 0 +loop: + for length := 1; length <= 10000; length = nextLength(length) { + keys := make([][]byte, 0, length) + for i := 0; i < length; i++ { + keys = append(keys, le32(i)) + } + var hashes []uint32 + for _, key := range keys { + hashes = append(hashes, Hash(key)) + } + f := NewFilter(hashes, 10) + + if len(f) > (length*10/8)+40 { + t.Errorf("length=%d: len(f)=%d is too large", length, len(f)) + continue + } + + // All added keys must match. 
+ for _, key := range keys { + if !f.MayContainKey(key) { + t.Errorf("length=%d: did not contain key %q", length, key) + continue loop + } + } + + // Check false positive rate. + nFalsePositive := 0 + for i := 0; i < 10000; i++ { + if f.MayContainKey(le32(1e9 + i)) { + nFalsePositive++ + } + } + if nFalsePositive > 0.02*10000 { + t.Errorf("length=%d: %d false positives in 10000", length, nFalsePositive) + continue + } + if nFalsePositive > 0.0125*10000 { + nMediocreFilters++ + } else { + nGoodFilters++ + } + } + + if nMediocreFilters > nGoodFilters/5 { + t.Errorf("%d mediocre filters but only %d good filters", nMediocreFilters, nGoodFilters) + } +} + +func TestHash(t *testing.T) { + // The magic want numbers come from running the C++ leveldb code in hash.cc. + testCases := []struct { + s string + want uint32 + }{ + {"", 0xbc9f1d34}, + {"g", 0xd04a8bda}, + {"go", 0x3e0b0745}, + {"gop", 0x0c326610}, + {"goph", 0x8c9d6390}, + {"gophe", 0x9bfd4b0a}, + {"gopher", 0xa78edc7c}, + {"I had a dream it would end this way.", 0xe14a9db9}, + } + for _, tc := range testCases { + if got := Hash([]byte(tc.s)); got != tc.want { + t.Errorf("s=%q: got 0x%08x, want 0x%08x", tc.s, got, tc.want) + } + } +} diff --git a/utils/cache/bloom.go b/utils/cache/bloom.go new file mode 100644 index 0000000..2bacc2d --- /dev/null +++ b/utils/cache/bloom.go @@ -0,0 +1,187 @@ +// Copyright 2021 hardcore-os Project Authors +// +// Licensed under the Apache License, Version 2.0 (the "License") +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package cache + +import "math" + +// Filter is an encoded set of []byte keys. +type Filter []byte + +type BloomFilter struct { + bitmap Filter + k uint8 +} + +// MayContainKey _ +func (f *BloomFilter) MayContainKey(k []byte) bool { + return f.MayContain(Hash(k)) +} + +// MayContain returns whether the filter may contain given key. False positives +// are possible, where it returns true for keys not in the original set. +func (f *BloomFilter) MayContain(h uint32) bool { + if f.Len() < 2 { + return false + } + k := f.k + if k > 30 { + // This is reserved for potentially new encodings for short Bloom filters. + // Consider it a match. + return true + } + nBits := uint32(8 * (f.Len() - 1)) + delta := h>>17 | h<<15 + for j := uint8(0); j < k; j++ { + bitPos := h % nBits + if f.bitmap[bitPos/8]&(1<<(bitPos%8)) == 0 { + return false + } + h += delta + } + return true +} + +func (f *BloomFilter) Len() int32 { + return int32(len(f.bitmap)) +} + +func (f *BloomFilter) InsertKey(k []byte) bool { + return f.Insert(Hash(k)) +} + +func (f *BloomFilter) Insert(h uint32) bool { + k := f.k + if k > 30 { + // This is reserved for potentially new encodings for short Bloom filters. + // Consider it a match. 
+ return true + } + nBits := uint32(8 * (f.Len() - 1)) + delta := h>>17 | h<<15 + for j := uint8(0); j < k; j++ { + bitPos := h % uint32(nBits) + f.bitmap[bitPos/8] |= 1 << (bitPos % 8) + h += delta + } + return true +} + +func (f *BloomFilter) AllowKey(k []byte) bool { + if f == nil { + return true + } + already := f.MayContainKey(k) + if !already { + f.InsertKey(k) + } + return already +} + +func (f *BloomFilter) Allow(h uint32) bool { + if f == nil { + return true + } + already := f.MayContain(h) + if !already { + f.Insert(h) + } + return already +} + +func (f *BloomFilter) reset() { + if f == nil { + return + } + for i := range f.bitmap { + f.bitmap[i] = 0 + } +} + +// NewFilter returns a new Bloom filter that encodes a set of []byte keys with +// the given number of bits per key, approximately. +// +// A good bitsPerKey value is 10, which yields a filter with ~ 1% false +// positive rate. +func newFilter(numEntries int, falsePositive float64) *BloomFilter { + bitsPerKey := bloomBitsPerKey(numEntries, falsePositive) + return initFilter(numEntries, bitsPerKey) +} + +// BloomBitsPerKey returns the bits per key required by bloomfilter based on +// the false positive rate. +func bloomBitsPerKey(numEntries int, fp float64) int { + size := -1 * float64(numEntries) * math.Log(fp) / math.Pow(float64(0.69314718056), 2) + locs := math.Ceil(size / float64(numEntries)) + return int(locs) +} + +func initFilter(numEntries int, bitsPerKey int) *BloomFilter { + bf := &BloomFilter{} + if bitsPerKey < 0 { + bitsPerKey = 0 + } + // 0.69 is approximately ln(2). + k := uint32(float64(bitsPerKey) * 0.69) + if k < 1 { + k = 1 + } + if k > 30 { + k = 30 + } + bf.k = uint8(k) + + nBits := numEntries * int(bitsPerKey) + // For small len(keys), we can see a very high false positive rate. Fix it + // by enforcing a minimum bloom filter length. 
+ if nBits < 64 { + nBits = 64 + } + nBytes := (nBits + 7) / 8 + nBits = nBytes * 8 + filter := make([]byte, nBytes+1) + + //record the K value of this Bloom Filter + filter[nBytes] = uint8(k) + + bf.bitmap = filter + return bf +} + +// Hash implements a hashing algorithm similar to the Murmur hash. +func Hash(b []byte) uint32 { + const ( + seed = 0xbc9f1d34 + m = 0xc6a4a793 + ) + h := uint32(seed) ^ uint32(len(b))*m + for ; len(b) >= 4; b = b[4:] { + h += uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24 + h *= m + h ^= h >> 16 + } + switch len(b) { + case 3: + h += uint32(b[2]) << 16 + fallthrough + case 2: + h += uint32(b[1]) << 8 + fallthrough + case 1: + h += uint32(b[0]) + h *= m + h ^= h >> 24 + } + return h +} diff --git a/utils/cache/cache.go b/utils/cache/cache.go new file mode 100644 index 0000000..1ffd56a --- /dev/null +++ b/utils/cache/cache.go @@ -0,0 +1,214 @@ +package cache + +import ( + "container/list" + xxhash "github.com/cespare/xxhash/v2" + "sync" + "unsafe" +) + +type Cache struct { + m sync.RWMutex + lru *windowLRU + slru *segmentedLRU + door *BloomFilter + c *cmSketch + t int32 + threshold int32 + data map[uint64]*list.Element +} + +type Options struct { + lruPct uint8 +} + +func NewCache(size int) *Cache { + const lruPct = 1 + lruSz := (lruPct * size) / 100 + + if lruSz < 1 { + lruSz = 1 + } + + slruSz := int(float64(size) * ((100 - lruPct) / 100.0)) + + if slruSz < 1 { + slruSz = 1 + } + + slruO := int(0.2 * float64(slruSz)) + + if slruO < 1 { + slruO = 1 + } + + data := make(map[uint64]*list.Element, size) + + return &Cache{ + lru: newWindowLRU(lruSz, data), + slru: newSLRU(data, slruO, slruSz-slruO), + door: newFilter(size, 0.01), + c: newCmSketch(int64(size)), + data: data, + } + +} + +func (c *Cache) Set(key interface{}, value interface{}) bool { + c.m.Lock() + defer c.m.Unlock() + return c.set(key, value) +} + +func (c *Cache) set(key, value interface{}) bool { + keyHash, conflictHash := c.keyToHash(key) + + i := 
storeItem{ + stage: 0, + key: keyHash, + conflict: conflictHash, + value: value, + } + + eitem, evicted := c.lru.add(i) + + if !evicted { + return true + } + + victim := c.slru.victim() + + if victim == nil { + c.slru.add(eitem) + return true + } + + if !c.door.Allow(uint32(keyHash)) { + return true + } + + vcount := c.c.Estimate(victim.key) + ocount := c.c.Estimate(eitem.key) + + if ocount < vcount { + return true + } + + c.slru.add(eitem) + return true +} + +func (c *Cache) Get(key interface{}) (interface{}, bool) { + c.m.RLock() + defer c.m.RUnlock() + return c.get(key) +} + +func (c *Cache) get(key interface{}) (interface{}, bool) { + c.t++ + if c.t == c.threshold { + c.c.Reset() + c.door.reset() + c.t = 0 + } + + keyHash, conflictHash := c.keyToHash(key) + + val, ok := c.data[keyHash] + if !ok { + c.c.Increment(keyHash) + return nil, false + } + + item := val.Value.(*storeItem) + + if item.conflict != conflictHash { + c.c.Increment(keyHash) + return nil, false + } + + c.c.Increment(item.key) + + v := item.value + + if item.stage == 0 { + c.lru.get(val) + } else { + c.slru.get(val) + } + + return v, true + +} + +func (c *Cache) Del(key interface{}) (interface{}, bool) { + c.m.Lock() + defer c.m.Unlock() + return c.del(key) +} + +func (c *Cache) del(key interface{}) (interface{}, bool) { + keyHash, conflictHash := c.keyToHash(key) + + val, ok := c.data[keyHash] + if !ok { + return 0, false + } + + item := val.Value.(*storeItem) + + if conflictHash != 0 && (conflictHash != item.conflict) { + return 0, false + } + + delete(c.data, keyHash) + return item.conflict, true +} + +func (c *Cache) keyToHash(key interface{}) (uint64, uint64) { + if key == nil { + return 0, 0 + } + switch k := key.(type) { + case uint64: + return k, 0 + case string: + return MemHashString(k), xxhash.Sum64String(k) + case []byte: + return MemHash(k), xxhash.Sum64(k) + case byte: + return uint64(k), 0 + case int: + return uint64(k), 0 + case int32: + return uint64(k), 0 + case uint32: + 
return uint64(k), 0 + case int64: + return uint64(k), 0 + default: + panic("Key type not supported") + } +} + +type stringStruct struct { + str unsafe.Pointer + len int +} + +//go:noescape +//go:linkname memhash runtime.memhash +func memhash(p unsafe.Pointer, h, s uintptr) uintptr + +// MemHashString is the hash function used by go map, it utilizes available hardware instructions +// (behaves as aeshash if aes instruction is available). +// NOTE: The hash seed changes for every process. So, this cannot be used as a persistent hash. +func MemHashString(str string) uint64 { + ss := (*stringStruct)(unsafe.Pointer(&str)) + return uint64(memhash(ss.str, 0, uintptr(ss.len))) +} + +func MemHash(data []byte) uint64 { + ss := (*stringStruct)(unsafe.Pointer(&data)) + return uint64(memhash(ss.str, 0, uintptr(ss.len))) +} diff --git a/utils/cache/cache.s b/utils/cache/cache.s new file mode 100644 index 0000000..e69de29 diff --git a/utils/cache/cache_test.go b/utils/cache/cache_test.go new file mode 100644 index 0000000..68e54fe --- /dev/null +++ b/utils/cache/cache_test.go @@ -0,0 +1,28 @@ +package cache + +import ( + "fmt" + "github.com/stretchr/testify/assert" + "testing" +) + +func TestCacheBasicCRUD(t *testing.T) { + cache := NewCache(5) + for i := 0; i < 10; i++ { + key := fmt.Sprintf("key%d", i) + val := fmt.Sprintf("val%d", i) + cache.Set(key, val) + } + + for i := 0; i < 1000; i++ { + key := fmt.Sprintf("key%d", i) + val := fmt.Sprintf("val%d", i) + res, ok := cache.Get(key) + if ok { + assert.Equal(t, val, res) + continue + } + assert.Equal(t, res, nil) + + } +} diff --git a/utils/cache/cmSketch.go b/utils/cache/cmSketch.go new file mode 100644 index 0000000..e812139 --- /dev/null +++ b/utils/cache/cmSketch.go @@ -0,0 +1,118 @@ +package cache + +import ( + "fmt" + "math/rand" + "time" +) + +const ( + cmDepth = 4 +) + +type cmSketch struct { + rows [cmDepth]cmRow + seed [cmDepth]uint64 + mask uint64 +} + +func newCmSketch(numCounters int64) *cmSketch { + if numCounters 
== 0 { + panic("cmSketch: invalid numCounters") + } + + numCounters = next2Power(numCounters) + sketch := &cmSketch{mask: uint64(numCounters - 1)} + source := rand.New(rand.NewSource(time.Now().UnixNano())) + + for i := 0; i < cmDepth; i++ { + sketch.seed[i] = source.Uint64() + sketch.rows[i] = newCmRow(numCounters) + } + + return sketch +} + +func (s *cmSketch) Increment(hashed uint64) { + for i := range s.rows { + s.rows[i].increment((hashed ^ s.seed[i]) & s.mask) + } +} + +func (s *cmSketch) Estimate(hashed uint64) int64 { + min := byte(255) + for i := range s.rows { + val := s.rows[i].get((hashed ^ s.seed[i]) & s.mask) + if val < min { + min = val + } + } + + return int64(min) +} + +// Reset halves all counter values. +func (s *cmSketch) Reset() { + for _, r := range s.rows { + r.reset() + } +} + +// Clear zeroes all counters. +func (s *cmSketch) Clear() { + for _, r := range s.rows { + r.clear() + } +} + +func next2Power(x int64) int64 { + x-- + x |= x >> 1 + x |= x >> 2 + x |= x >> 4 + x |= x >> 8 + x |= x >> 16 + x |= x >> 32 + x++ + return x +} + +type cmRow []byte + +func newCmRow(numCounters int64) cmRow { + return make(cmRow, numCounters/2) +} + +func (r cmRow) get(n uint64) byte { + return r[n/2] >> ((n & 1) * 4) & 0x0f +} + +func (r cmRow) increment(n uint64) { + i := n / 2 + s := (n & 1) * 4 + v := (r[i] >> s) & 0x0f + if v < 15 { + r[i] += 1 << s + } +} + +func (r cmRow) reset() { + for i := range r { + r[i] = (r[i] >> 1) & 0x77 + } +} + +func (r cmRow) clear() { + for i := range r { + r[i] = 0 + } +} + +func (r cmRow) string() string { + s := "" + for i := uint64(0); i < uint64(len(r)*2); i++ { + s += fmt.Sprintf("%02d ", (r[(i/2)]>>((i&1)*4))&0x0f) + } + s = s[:len(s)-1] + return s +} diff --git a/utils/cache/lru.go b/utils/cache/lru.go new file mode 100644 index 0000000..21d0e94 --- /dev/null +++ b/utils/cache/lru.go @@ -0,0 +1,46 @@ +package cache + +import "container/list" + +type windowLRU struct { + data map[uint64]*list.Element + cap int + 
list *list.List +} + +type storeItem struct { + stage int + key uint64 + conflict uint64 + value interface{} +} + +func newWindowLRU(size int, data map[uint64]*list.Element) *windowLRU { + return &windowLRU{ + data: data, + cap: size, + list: list.New(), + } +} + +func (lru *windowLRU) add(newitem storeItem) (eitem storeItem, evicted bool) { + if lru.list.Len() < lru.cap { + lru.data[newitem.key] = lru.list.PushFront(&newitem) + return storeItem{}, false + } + + evictItem := lru.list.Back() + item := evictItem.Value.(*storeItem) + + delete(lru.data, item.key) + + eitem, *item = *item, newitem + + lru.data[item.key] = evictItem + lru.list.MoveToFront(evictItem) + return eitem, true +} + +func (lru *windowLRU) get(v *list.Element) { + lru.list.MoveToFront(v) +} diff --git a/utils/cache/s2lru.go b/utils/cache/s2lru.go new file mode 100644 index 0000000..bd5e798 --- /dev/null +++ b/utils/cache/s2lru.go @@ -0,0 +1,86 @@ +package cache + +import "container/list" + +type segmentedLRU struct { + data map[uint64]*list.Element + stageOneCap, stageTwoCap int + stageOne, stageTwo *list.List +} + +const ( + STAGE_ONE = iota + STAGE_TWO +) + +func newSLRU(data map[uint64]*list.Element, stageOneCap, stageTwoCap int) *segmentedLRU { + return &segmentedLRU{ + data: data, + stageOneCap: stageOneCap, + stageTwoCap: stageTwoCap, + stageOne: list.New(), + stageTwo: list.New(), + } +} + +func (slru *segmentedLRU) add(newitem storeItem) { + newitem.stage = 1 + + if slru.stageOne.Len() < slru.stageOneCap || slru.Len() < slru.stageOneCap+slru.stageTwoCap { + slru.data[newitem.key] = slru.stageOne.PushFront(&newitem) + return + } + + e := slru.stageOne.Back() + item := e.Value.(*storeItem) + + delete(slru.data, item.key) + + *item = newitem + + slru.data[item.key] = e + slru.stageOne.MoveToFront(e) +} + +func (slru *segmentedLRU) get(v *list.Element) { + item := v.Value.(*storeItem) + + if item.stage == STAGE_TWO { + slru.stageTwo.MoveToFront(v) + return + } + + if slru.stageTwo.Len() < 
slru.stageTwoCap { + slru.stageOne.Remove(v) + item.stage = STAGE_TWO + slru.data[item.key] = slru.stageTwo.PushFront(item) + return + } + + back := slru.stageTwo.Back() + bitem := back.Value.(*storeItem) + + *bitem, *item = *item, *bitem + + bitem.stage = STAGE_TWO + item.stage = STAGE_ONE + + slru.data[item.key] = v + slru.data[bitem.key] = back + + slru.stageOne.MoveToFront(v) + slru.stageTwo.MoveToFront(back) +} + +func (slru *segmentedLRU) Len() int { + return slru.stageTwo.Len() + slru.stageOne.Len() +} + +func (slru *segmentedLRU) victim() *storeItem { + if slru.Len() < slru.stageOneCap+slru.stageTwoCap { + return nil + } + + v := slru.stageOne.Back() + return v.Value.(*storeItem) +} diff --git a/utils/closer.go b/utils/closer.go index fe18fb9..0b9b708 100644 --- a/utils/closer.go +++ b/utils/closer.go @@ -1,24 +1,37 @@ +// Copyright 2021 hardcore-os Project Authors +// +// Licensed under the Apache License, Version 2.0 (the "License") +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ package utils import "sync" -// 用于资源回收的信号控制 +// Closer _用于资源回收的信号控制 type Closer struct { waiting sync.WaitGroup - closeSignal chan struct{} + CloseSignal chan struct{} } -// NewCloser -func NewCloser(i int) *Closer { +// NewCloser _ +func NewCloser() *Closer { closer := &Closer{waiting: sync.WaitGroup{}} - closer.waiting.Add(i) - closer.closeSignal = make(chan struct{}) + closer.CloseSignal = make(chan struct{}) return closer } // Close 上游通知下游协程进行资源回收,并等待协程通知回收完毕 func (c *Closer) Close() { - close(c.closeSignal) + close(c.CloseSignal) c.waiting.Wait() } @@ -27,7 +40,7 @@ func (c *Closer) Done() { c.waiting.Done() } -// Wait 返回关闭信号 -func (c *Closer) Wait() chan struct{} { - return c.closeSignal +// Add 添加wait 计数 +func (c *Closer) Add(n int) { + c.waiting.Add(n) } diff --git a/utils/codec/codec.go b/utils/codec/codec.go deleted file mode 100644 index 9df1fc8..0000000 --- a/utils/codec/codec.go +++ /dev/null @@ -1,10 +0,0 @@ -package codec - -// WalCodec 写入wal文件的编码 -func WalCodec(entry *Entry) []byte { - return []byte{} -} - -func ValuePtrCodec(ptr *ValuePtr) []byte { - return []byte{} -} diff --git a/utils/codec/entry.go b/utils/codec/entry.go deleted file mode 100644 index 24d28f1..0000000 --- a/utils/codec/entry.go +++ /dev/null @@ -1,27 +0,0 @@ -package codec - -import ( - "time" -) - -type Entry struct { - Key []byte - Value []byte - ExpiresAt uint64 -} - -func NewEntry(key, value []byte) *Entry { - return &Entry{ - Key: key, - Value: value, - } -} - -func (e *Entry) WithTTL(dur time.Duration) *Entry { - e.ExpiresAt = uint64(time.Now().Add(dur).Unix()) - return e -} - -func (e *Entry) Size() int64 { - return int64(len(e.Key) + len(e.Value)) -} diff --git a/utils/codec/value.go b/utils/codec/value.go deleted file mode 100644 index 8dadd57..0000000 --- a/utils/codec/value.go +++ /dev/null @@ -1,19 +0,0 @@ -package codec - -type ValuePtr struct { -} - -// NewValuePtr -func NewValuePtr(entry *Entry) *ValuePtr { - return &ValuePtr{} -} - -// IsValuePtr -func 
IsValuePtr(entry *Entry) bool { - return false -} - -// ValuePtrDecode -func ValuePtrDecode(data []byte) *ValuePtr { - return nil -} diff --git a/utils/const.go b/utils/const.go index 67e50b1..dcbbb59 100644 --- a/utils/const.go +++ b/utils/const.go @@ -1,6 +1,61 @@ +// Copyright 2021 hardcore-os Project Authors +// +// Licensed under the Apache License, Version 2.0 (the "License") +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + package utils +import ( + "hash/crc32" + "math" + "os" +) + const ( - MaxLevelNum = 7 + // MaxLevelNum _ + MaxLevelNum = 7 + // DefaultValueThreshold _ DefaultValueThreshold = 1024 ) + +// file +const ( + ManifestFilename = "MANIFEST" + ManifestRewriteFilename = "REWRITEMANIFEST" + ManifestDeletionsRewriteThreshold = 10000 + ManifestDeletionsRatio = 10 + DefaultFileFlag = os.O_RDWR | os.O_CREATE | os.O_APPEND + DefaultFileMode = 0666 + MaxValueLogSize = 10 << 20 + // This is O_DSYNC (datasync) on platforms that support it -- see file_unix.go + datasyncFileFlag = 0x0 + // 基于可变长编码,其最可能的编码 + MaxHeaderSize = 21 + VlogHeaderSize = 0 + MaxVlogFileSize uint32 = math.MaxUint32 + Mi int64 = 1 << 20 + KVWriteChCapacity = 1000 +) + +// meta +const ( + BitDelete byte = 1 << 0 // Set if the key has been deleted. + BitValuePointer byte = 1 << 1 // Set if the value is NOT stored directly next to key. 
+) + +// codec +var ( + MagicText = [4]byte{'H', 'A', 'R', 'D'} + MagicVersion = uint32(1) + // CastagnoliCrcTable is a CRC32 polynomial table + CastagnoliCrcTable = crc32.MakeTable(crc32.Castagnoli) +) diff --git a/utils/entry.go b/utils/entry.go new file mode 100644 index 0000000..a3d5929 --- /dev/null +++ b/utils/entry.go @@ -0,0 +1,185 @@ +// Copyright 2021 hardcore-os Project Authors +// +// Licensed under the Apache License, Version 2.0 (the "License") +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package utils + +import ( + "encoding/binary" + "time" +) + +type ValueStruct struct { + Meta byte + Value []byte + ExpiresAt uint64 + + Version uint64 // This field is not serialized. Only for internal usage. 
+} + +// value只持久化具体的value值和过期时间 +func (vs *ValueStruct) EncodedSize() uint32 { + sz := len(vs.Value) + 1 // meta + enc := sizeVarint(vs.ExpiresAt) + return uint32(sz + enc) +} + +// DecodeValue +func (vs *ValueStruct) DecodeValue(buf []byte) { + vs.Meta = buf[0] + var sz int + vs.ExpiresAt, sz = binary.Uvarint(buf[1:]) + vs.Value = buf[1+sz:] +} + +//对value进行编码,并将编码后的字节写入byte +//这里将过期时间和value的值一起编码 +func (vs *ValueStruct) EncodeValue(b []byte) uint32 { + b[0] = vs.Meta + sz := binary.PutUvarint(b[1:], vs.ExpiresAt) + n := copy(b[1+sz:], vs.Value) + return uint32(1 + sz + n) +} + +func sizeVarint(x uint64) (n int) { + for { + n++ + x >>= 7 + if x == 0 { + break + } + } + return n +} + +//Entry _ 最外层写入的结构体 +type Entry struct { + Key []byte + Value []byte + ExpiresAt uint64 + + Meta byte + Version uint64 + Offset uint32 + Hlen int // Length of the header. + ValThreshold int64 +} + +// NewEntry_ +func NewEntry(key, value []byte) *Entry { + return &Entry{ + Key: key, + Value: value, + } +} + +// Entry_ +func (e *Entry) Entry() *Entry { + return e +} + +func (e *Entry) IsDeletedOrExpired() bool { + if e.Value == nil { + return true + } + + if e.ExpiresAt == 0 { + return false + } + + return e.ExpiresAt <= uint64(time.Now().Unix()) +} + +// WithTTL _ +func (e *Entry) WithTTL(dur time.Duration) *Entry { + e.ExpiresAt = uint64(time.Now().Add(dur).Unix()) + return e +} + +// EncodedSize is the size of the ValueStruct when encoded +func (e *Entry) EncodedSize() uint32 { + sz := len(e.Value) + enc := sizeVarint(uint64(e.Meta)) + enc += sizeVarint(e.ExpiresAt) + return uint32(sz + enc) +} + +// EstimateSize +func (e *Entry) EstimateSize(threshold int) int { + // TODO: 是否考虑 user meta? + if len(e.Value) < threshold { + return len(e.Key) + len(e.Value) + 1 // Meta + } + return len(e.Key) + 12 + 1 // 12 for ValuePointer, 2 for meta. +} + +// header 对象 +// header is used in value log as a header before Entry. 
+type Header struct { + KLen uint32 + VLen uint32 + ExpiresAt uint64 + Meta byte +} + +// +------+----------+------------+--------------+-----------+ +// | Meta | UserMeta | Key Length | Value Length | ExpiresAt | +// +------+----------+------------+--------------+-----------+ +func (h Header) Encode(out []byte) int { + out[0] = h.Meta + index := 1 + index += binary.PutUvarint(out[index:], uint64(h.KLen)) + index += binary.PutUvarint(out[index:], uint64(h.VLen)) + index += binary.PutUvarint(out[index:], h.ExpiresAt) + return index +} + +// Decode decodes the given header from the provided byte slice. +// Returns the number of bytes read. +func (h *Header) Decode(buf []byte) int { + h.Meta = buf[0] + index := 1 + klen, count := binary.Uvarint(buf[index:]) + h.KLen = uint32(klen) + index += count + vlen, count := binary.Uvarint(buf[index:]) + h.VLen = uint32(vlen) + index += count + h.ExpiresAt, count = binary.Uvarint(buf[index:]) + return index + count +} + +// DecodeFrom reads the header from the hashReader. +// Returns the number of bytes read. +func (h *Header) DecodeFrom(reader *HashReader) (int, error) { + var err error + h.Meta, err = reader.ReadByte() + if err != nil { + return 0, err + } + klen, err := binary.ReadUvarint(reader) + if err != nil { + return 0, err + } + h.KLen = uint32(klen) + vlen, err := binary.ReadUvarint(reader) + if err != nil { + return 0, err + } + h.VLen = uint32(vlen) + h.ExpiresAt, err = binary.ReadUvarint(reader) + if err != nil { + return 0, err + } + return reader.BytesRead, nil +} diff --git a/utils/entry_test.go b/utils/entry_test.go new file mode 100644 index 0000000..5362a4e --- /dev/null +++ b/utils/entry_test.go @@ -0,0 +1,34 @@ +// Copyright 2021 hardcore-os Project Authors +// +// Licensed under the Apache License, Version 2.0 (the "License") +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package utils + +import ( + "testing" + + "github.com/stretchr/testify/assert" +) + +func TestValueStruct(t *testing.T) { + v := ValueStruct{ + Value: []byte("硬核课堂"), + Meta: 2, + ExpiresAt: 213123123123, + } + data := make([]byte, v.EncodedSize()) + v.EncodeValue(data) + var vv ValueStruct + vv.DecodeValue(data) + assert.Equal(t, vv, v) +} diff --git a/utils/error.go b/utils/error.go index 8c1ef19..279a50d 100644 --- a/utils/error.go +++ b/utils/error.go @@ -1,8 +1,131 @@ +// Copyright 2021 logicrec Project Authors +// +// Licensed under the Apache License, Version 2.0 (the "License") +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Copyright 2021 hardcore-os Project Authors +// +// Licensed under the Apache License, Version 2.0 (the "License") +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + package utils -// Panic 如果err 不为nil 则panic +import ( + "errors" + "fmt" + "os" + "path" + "path/filepath" + "runtime" + "strconv" + "strings" +) + +var ( + gopath = path.Join(os.Getenv("GOPATH"), "src") + "/" +) + +// NotFoundKey 找不到key +var ( + // ErrKeyNotFound is returned when key isn't found on a txn.Get. + ErrKeyNotFound = errors.New("Key not found") + // ErrEmptyKey is returned if an empty key is passed on an update function. + ErrEmptyKey = errors.New("Key cannot be empty") + // ErrReWriteFailure reWrite failure + ErrReWriteFailure = errors.New("reWrite failure") + // ErrBadMagic bad magic + ErrBadMagic = errors.New("bad magic") + // ErrBadChecksum bad check sum + ErrBadChecksum = errors.New("bad check sum") + // ErrChecksumMismatch is returned at checksum mismatch. + ErrChecksumMismatch = errors.New("checksum mismatch") + + ErrTruncate = errors.New("Do truncate") + ErrStop = errors.New("Stop") + + // compact + ErrFillTables = errors.New("Unable to fill tables") + + ErrBlockedWrites = errors.New("Writes are blocked, possibly due to DropAll or Close") + ErrTxnTooBig = errors.New("Txn is too big to fit into one request") + ErrDeleteVlogFile = errors.New("Delete vlog file") + ErrNoRoom = errors.New("No room for write") + + // ErrInvalidRequest is returned if the user request is invalid. + ErrInvalidRequest = errors.New("Invalid request") + // ErrNoRewrite is returned if a call for value log GC doesn't result in a log file rewrite. 
+ ErrNoRewrite = errors.New("Value log GC attempt didn't result in any cleanup") + + // ErrRejected is returned if a value log GC is called either while another GC is running, or + // after DB::Close has been called. + ErrRejected = errors.New("Value log GC request rejected") +) + +// Panic 如果err 不为nil 则panicc func Panic(err error) { if err != nil { panic(err) } } + +// Panic2 _ +func Panic2(_ interface{}, err error) { + Panic(err) +} + +// Err err +func Err(err error) error { + if err != nil { + fmt.Printf("%s %s\n", location(2, true), err) + } + return err +} + +// WarpErr err +func WarpErr(format string, err error) error { + if err != nil { + fmt.Printf("%s %s %s", format, location(2, true), err) + } + return err +} +func location(deep int, fullPath bool) string { + _, file, line, ok := runtime.Caller(deep) + if !ok { + file = "???" + line = 0 + } + + if fullPath { + if strings.HasPrefix(file, gopath) { + file = file[len(gopath):] + } + } else { + file = filepath.Base(file) + } + return file + ":" + strconv.Itoa(line) +} + +// CondPanic e +func CondPanic(condition bool, err error) { + if condition { + Panic(err) + } +} diff --git a/utils/file.go b/utils/file.go index 6b8a622..7a5fd8a 100644 --- a/utils/file.go +++ b/utils/file.go @@ -1,8 +1,127 @@ +// Copyright 2021 hardcore-os Project Authors +// +// Licensed under the Apache License, Version 2.0 (the "License") +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ package utils -import "strings" +import ( + "bytes" + "fmt" + "hash/crc32" + "io/ioutil" + "os" + "path" + "path/filepath" + "strconv" + "strings" + + "github.com/pkg/errors" +) // FID 根据file name 获取其fid -func FID(name string) string { - return strings.Split(name, ".")[0] +func FID(name string) uint64 { + name = path.Base(name) + if !strings.HasSuffix(name, ".sst") { + return 0 + } + // suffix := name[len(fileSuffix):] + name = strings.TrimSuffix(name, ".sst") + id, err := strconv.Atoi(name) + if err != nil { + Err(err) + return 0 + } + return uint64(id) +} + +func VlogFilePath(dirPath string, fid uint32) string { + return fmt.Sprintf("%s%s%05d.vlog", dirPath, string(os.PathSeparator), fid) +} + +// CreateSyncedFile creates a new file (using O_EXCL), errors if it already existed. +func CreateSyncedFile(filename string, sync bool) (*os.File, error) { + flags := os.O_RDWR | os.O_CREATE | os.O_EXCL + if sync { + flags |= datasyncFileFlag + } + return os.OpenFile(filename, flags, 0600) +} + +// FileNameSSTable sst 文件名 +func FileNameSSTable(dir string, id uint64) string { + return filepath.Join(dir, fmt.Sprintf("%05d.sst", id)) } + +// openDir opens a directory for syncing. +func openDir(path string) (*os.File, error) { return os.Open(path) } + +// SyncDir When you create or delete a file, you have to ensure the directory entry for the file is synced +// in order to guarantee the file is visible (if the system crashes). (See the man page for fsync, +// or see https://github.com/coreos/etcd/issues/6368 for an example.) 
+func SyncDir(dir string) error { + f, err := openDir(dir) + if err != nil { + return errors.Wrapf(err, "While opening directory: %s.", dir) + } + err = f.Sync() + closeErr := f.Close() + if err != nil { + return errors.Wrapf(err, "While syncing directory: %s.", dir) + } + return errors.Wrapf(closeErr, "While closing directory: %s.", dir) +} + +// LoadIDMap Get the id of all sst files in the current folder +func LoadIDMap(dir string) map[uint64]struct{} { + fileInfos, err := ioutil.ReadDir(dir) + Err(err) + idMap := make(map[uint64]struct{}) + for _, info := range fileInfos { + if info.IsDir() { + continue + } + fileID := FID(info.Name()) + if fileID != 0 { + idMap[fileID] = struct{}{} + } + } + return idMap +} + +// CompareKeys checks the key without timestamp and checks the timestamp if keyNoTs +// is same. +// a would be sorted higher than aa if we use bytes.compare +// All keys should have timestamp. +func CompareKeys(key1, key2 []byte) int { + CondPanic((len(key1) <= 8 || len(key2) <= 8), fmt.Errorf("%s,%s < 8", string(key1), string(key2))) + if cmp := bytes.Compare(key1[:len(key1)-8], key2[:len(key2)-8]); cmp != 0 { + return cmp + } + return bytes.Compare(key1[len(key1)-8:], key2[len(key2)-8:]) +} + +// VerifyChecksum crc32 +func VerifyChecksum(data []byte, expected []byte) error { + actual := uint64(crc32.Checksum(data, CastagnoliCrcTable)) + expectedU64 := BytesToU64(expected) + if actual != expectedU64 { + return errors.Wrapf(ErrChecksumMismatch, "actual: %d, expected: %d", actual, expectedU64) + } + + return nil +} + +// CalculateChecksum _ +func CalculateChecksum(data []byte) uint64 { + return uint64(crc32.Checksum(data, CastagnoliCrcTable)) +} \ No newline at end of file diff --git a/utils/iterator.go b/utils/iterator.go new file mode 100644 index 0000000..377ee09 --- /dev/null +++ b/utils/iterator.go @@ -0,0 +1,37 @@ +// Copyright 2021 hardcore-os Project Authors +// +// Licensed under the Apache License, Version 2.0 (the "License") +// you may not use 
this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package utils + +// Iterator 迭代器 +type Iterator interface { + Next() + Valid() bool + Rewind() + Item() Item + Close() error + Seek(key []byte) +} + +// Item _ +type Item interface { + Entry() *Entry +} + +// Options _ +// TODO 可能被重构 +type Options struct { + Prefix []byte + IsAsc bool +} diff --git a/utils/key.go b/utils/key.go new file mode 100644 index 0000000..c92141a --- /dev/null +++ b/utils/key.go @@ -0,0 +1,90 @@ +// Copyright 2021 hardcore-os Project Authors +// +// Licensed under the Apache License, Version 2.0 (the "License") +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package utils + +import ( + "bytes" + "encoding/binary" + "math" + "time" + "unsafe" +) + +type stringStruct struct { + str unsafe.Pointer + len int +} + +//go:noescape +//go:linkname memhash runtime.memhash +func memhash(p unsafe.Pointer, h, s uintptr) uintptr + +// ParseKey parses the actual key from the key bytes. +func ParseKey(key []byte) []byte { + if len(key) < 8 { + return key + } + + return key[:len(key)-8] +} + +// ParseTs parses the timestamp from the key bytes. 
+func ParseTs(key []byte) uint64 { + if len(key) <= 8 { + return 0 + } + return math.MaxUint64 - binary.BigEndian.Uint64(key[len(key)-8:]) +} + +// SameKey checks for key equality ignoring the version timestamp suffix. +func SameKey(src, dst []byte) bool { + if len(src) != len(dst) { + return false + } + return bytes.Equal(ParseKey(src), ParseKey(dst)) +} + +// KeyWithTs generates a new key by appending ts to key. +func KeyWithTs(key []byte, ts uint64) []byte { + out := make([]byte, len(key)+8) + copy(out, key) + binary.BigEndian.PutUint64(out[len(key):], math.MaxUint64-ts) + return out +} + +// MemHash is the hash function used by go map, it utilizes available hardware instructions(behaves +// as aeshash if aes instruction is available). +// NOTE: The hash seed changes for every process. So, this cannot be used as a persistent hash. +func MemHash(data []byte) uint64 { + ss := (*stringStruct)(unsafe.Pointer(&data)) + return uint64(memhash(ss.str, 0, uintptr(ss.len))) +} + +// MemHashString is the hash function used by go map, it utilizes available hardware instructions +// (behaves as aeshash if aes instruction is available). +// NOTE: The hash seed changes for every process. So, this cannot be used as a persistent hash. +func MemHashString(str string) uint64 { + ss := (*stringStruct)(unsafe.Pointer(&str)) + return uint64(memhash(ss.str, 0, uintptr(ss.len))) +} + +// SafeCopy does append(a[:0], src...). +func SafeCopy(a, src []byte) []byte { + return append(a[:0], src...) +} + +func NewCurVersion() uint64 { + return uint64(time.Now().UnixNano() / 1e9) +} diff --git a/utils/map.go b/utils/map.go index 5f9b841..92ea95e 100644 --- a/utils/map.go +++ b/utils/map.go @@ -1,27 +1,83 @@ +// Copyright 2021 hardcore-os Project Authors +// +// Licensed under the Apache License, Version 2.0 (the "License") +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + package utils -import "sync" +import ( + "reflect" + "sync" + + "github.com/pkg/errors" +) +// CoreMap _ type CoreMap struct { m sync.Map } -// NewMap +// NewMap _ func NewMap() *CoreMap { return &CoreMap{m: sync.Map{}} } -// Get +// Get _ func (c *CoreMap) Get(key interface{}) (interface{}, bool) { - return c.m.Load(key) + hashKey := c.keyToHash(key) + return c.m.Load(hashKey) } -// Set +// Set _ func (c *CoreMap) Set(key, value interface{}) { - c.m.Store(key, value) + hashKey := c.keyToHash(key) + c.m.Store(hashKey, value) +} + +// Del _ +func (c *CoreMap) Del(key interface{}) { + hashKey := c.keyToHash(key) + c.m.Delete(hashKey) } -// Range +// Range _ func (c *CoreMap) Range(f func(key, value interface{}) bool) { c.m.Range(f) } + +func (c *CoreMap) keyToHash(key interface{}) uint64 { + if key == nil { + return 0 + } + switch k := key.(type) { + case []byte: + return MemHash(k) + case uint32: + return uint64(k) + case string: + return MemHashString(k) + case uint64: + return k + case byte: + return uint64(k) + case int: + return uint64(k) + case int32: + return uint64(k) + + case int64: + return uint64(k) + default: + CondPanic(true, errors.Errorf("Key:[%+v] type not supported", reflect.TypeOf(k))) + } + return 0 +} diff --git a/utils/mmap/darwin.go b/utils/mmap/darwin.go new file mode 100644 index 0000000..836a81e --- /dev/null +++ b/utils/mmap/darwin.go @@ -0,0 +1,61 @@ +// +build darwin + +/* + * Copyright 2019 Dgraph Labs, Inc. 
and Contributors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package mmap + +import ( + "os" + "syscall" + "unsafe" + + "golang.org/x/sys/unix" +) + +// Mmap uses the mmap system call to memory-map a file. If writable is true, +// memory protection of the pages is set so that they may be written to as well. +func mmap(fd *os.File, writable bool, size int64) ([]byte, error) { + mtype := unix.PROT_READ + if writable { + mtype |= unix.PROT_WRITE + } + return unix.Mmap(int(fd.Fd()), 0, int(size), mtype, unix.MAP_SHARED) +} + +// Munmap unmaps a previously mapped slice. +func munmap(b []byte) error { + return unix.Munmap(b) +} + +// This is required because the unix package does not support the madvise system call on OS X. +func madvise(b []byte, readahead bool) error { + advice := unix.MADV_NORMAL + if !readahead { + advice = unix.MADV_RANDOM + } + + _, _, e1 := syscall.Syscall(syscall.SYS_MADVISE, uintptr(unsafe.Pointer(&b[0])), + uintptr(len(b)), uintptr(advice)) + if e1 != 0 { + return e1 + } + return nil +} + +func msync(b []byte) error { + return unix.Msync(b, unix.MS_SYNC) +} diff --git a/utils/mmap/linux.go b/utils/mmap/linux.go new file mode 100644 index 0000000..73b3ac8 --- /dev/null +++ b/utils/mmap/linux.go @@ -0,0 +1,97 @@ +// +build linux + +// Copyright 2021 logicrec Project Authors +// +// Licensed under the Apache License, Version 2.0 (the "License") +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package mmap + +import ( + "os" + "reflect" + "unsafe" + + "golang.org/x/sys/unix" +) + +// mmap uses the mmap system call to memory-map a file. If writable is true, +// memory protection of the pages is set so that they may be written to as well. +func mmap(fd *os.File, writable bool, size int64) ([]byte, error) { + mtype := unix.PROT_READ + if writable { + mtype |= unix.PROT_WRITE + } + return unix.Mmap(int(fd.Fd()), 0, int(size), mtype, unix.MAP_SHARED) +} + +// mremap is a Linux-specific system call to remap pages in memory. This can be used in place of munmap + mmap. +func mremap(data []byte, size int) ([]byte, error) { + // taken from + const MREMAP_MAYMOVE = 0x1 + + header := (*reflect.SliceHeader)(unsafe.Pointer(&data)) + mmapAddr, _, errno := unix.Syscall6( + unix.SYS_MREMAP, + header.Data, + uintptr(header.Len), + uintptr(size), + uintptr(MREMAP_MAYMOVE), + 0, + 0, + ) + if errno != 0 { + return nil, errno + } + + header.Data = mmapAddr + header.Cap = size + header.Len = size + return data, nil +} + +// munmap unmaps a previously mapped slice. +// +// unix.Munmap maintains an internal list of mmapped addresses, and only calls munmap +// if the address is present in that list. If we use mremap, this list is not updated. +// To bypass this, we call munmap ourselves. 
+func munmap(data []byte) error { + if len(data) == 0 || len(data) != cap(data) { + return unix.EINVAL + } + _, _, errno := unix.Syscall( + unix.SYS_MUNMAP, + uintptr(unsafe.Pointer(&data[0])), + uintptr(len(data)), + 0, + ) + if errno != 0 { + return errno + } + return nil +} + +// madvise uses the madvise system call to give advise about the use of memory +// when using a slice that is memory-mapped to a file. Set the readahead flag to +// false if page references are expected in random order. +func madvise(b []byte, readahead bool) error { + flags := unix.MADV_NORMAL + if !readahead { + flags = unix.MADV_RANDOM + } + return unix.Madvise(b, flags) +} + +// msync writes any modified data to persistent storage. +func msync(b []byte) error { + return unix.Msync(b, unix.MS_SYNC) +} diff --git a/utils/mmap/mmap_darwin.go b/utils/mmap/mmap_darwin.go new file mode 100644 index 0000000..887db8d --- /dev/null +++ b/utils/mmap/mmap_darwin.go @@ -0,0 +1,45 @@ +// +build darwin + +// Copyright 2021 hardcore-os Project Authors +// +// Licensed under the Apache License, Version 2.0 (the "License") +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// mmap api + +// Mmap uses the mmap system call to memory-map a file. If writable is true, +// memory protection of the pages is set so that they may be written to as well. +package mmap + +import ( + "os" +) + +func Mmap(fd *os.File, writable bool, size int64) ([]byte, error) { + return mmap(fd, writable, size) +} + +// Munmap unmaps a previously mapped slice. 
+func Munmap(b []byte) error { + return munmap(b) +} + +// Madvise uses the madvise system call to give advise about the use of memory +// when using a slice that is memory-mapped to a file. Set the readahead flag to +// false if page references are expected in random order. +func Madvise(b []byte, readahead bool) error { + return madvise(b, readahead) +} + +// Msync would call sync on the mmapped data. +func Msync(b []byte) error { + return msync(b) +} diff --git a/utils/mmap/mmap_linux.go b/utils/mmap/mmap_linux.go new file mode 100644 index 0000000..6c0299a --- /dev/null +++ b/utils/mmap/mmap_linux.go @@ -0,0 +1,50 @@ +// +build linux + +// Copyright 2021 hardcore-os Project Authors +// +// Licensed under the Apache License, Version 2.0 (the "License") +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// mmap api + +// Mmap uses the mmap system call to memory-map a file. If writable is true, +// memory protection of the pages is set so that they may be written to as well. +package mmap + +import ( + "os" +) + +func Mmap(fd *os.File, writable bool, size int64) ([]byte, error) { + return mmap(fd, writable, size) +} + +// Munmap unmaps a previously mapped slice. +func Munmap(b []byte) error { + return munmap(b) +} + +// Madvise uses the madvise system call to give advise about the use of memory +// when using a slice that is memory-mapped to a file. Set the readahead flag to +// false if page references are expected in random order. 
+func Madvise(b []byte, readahead bool) error { + return madvise(b, readahead) +} + +// Msync would call sync on the mmapped data. +func Msync(b []byte) error { + return msync(b) +} + +// Mremap unmmap and mmap +func Mremap(data []byte, size int) ([]byte, error) { + return mremap(data, size) +} diff --git a/utils/rand.go b/utils/rand.go index 3f4ce13..c7229de 100644 --- a/utils/rand.go +++ b/utils/rand.go @@ -1,6 +1,7 @@ package utils import ( + "fmt" "math/rand" "sync" "time" @@ -31,3 +32,31 @@ func Float64() float64 { mu.Unlock() return res } + +// 生成随机字符串作为key和value +func randStr(length int) string { + // 包括特殊字符,进行测试 + str := "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ~=+%^*/()[]{}/!@#$?|©®😁😭🉑️🐂㎡硬核课堂" + bytes := []byte(str) + result := []byte{} + rand.Seed(time.Now().UnixNano() + int64(rand.Intn(100))) + for i := 0; i < length; i++ { + result = append(result, bytes[rand.Intn(len(bytes))]) + } + return string(result) +} + +// 构建entry对象 +func BuildEntry() *Entry { + rand.Seed(time.Now().Unix()) + key := []byte(fmt.Sprintf("%s%s", randStr(16), "12345678")) + value := []byte(randStr(128)) + // key := []byte(fmt.Sprintf("%s%s", "硬核课堂", "12345678")) + // value := []byte("硬核😁课堂") + expiresAt := uint64(time.Now().Add(12*time.Hour).UnixNano() / 1e6) + return &Entry{ + Key: key, + Value: value, + ExpiresAt: expiresAt, + } +} diff --git a/utils/skiplist.go b/utils/skiplist.go index 5a1153f..933cf66 100644 --- a/utils/skiplist.go +++ b/utils/skiplist.go @@ -1,274 +1,513 @@ +/* + * Copyright 2017 Dgraph Labs, Inc. and Contributors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* +Adapted from RocksDB inline skiplist. + +Key differences: +- No optimization for sequential inserts (no "prev"). +- No custom comparator. +- Support overwrites. This requires care when we see the same key when inserting. + For RocksDB or LevelDB, overwrites are implemented as a newer sequence number in the key, so + there is no need for values. We don't intend to support versioning. In-place updates of values + would be more efficient. +- We discard all non-concurrent code. +- We do not support Splices. This simplifies the code a lot. +- No AllocateNode or other pointer arithmetic. +- We combine the findLessThan, findGreaterOrEqual, etc into one function. +*/ + package utils import ( - "bytes" - "github.com/hardcore-os/corekv/iterator" - "math/rand" - "sync" - "time" - - "github.com/hardcore-os/corekv/utils/codec" + "github.com/pkg/errors" + "log" + "math" + "sync/atomic" + _ "unsafe" ) const ( - defaultMaxLevel = 48 + maxHeight = 20 + heightIncrease = math.MaxUint32 / 3 ) -type SkipList struct { - header *Element - - rand *rand.Rand - - maxLevel int - length int - lock sync.RWMutex - size int64 +type node struct { + // Multiple parts of the value are encoded as a single uint64 so that it + // can be atomically loaded and stored: + // value offset: uint32 (bits 0-31) + // value size : uint16 (bits 32-63) + value uint64 + + // A byte slice is 24 bytes. We are trying to save space here. + keyOffset uint32 // Immutable. No need to lock to access key. + keySize uint16 // Immutable. No need to lock to access key. + + // Height of the tower. 
+ height uint16 + + // Most nodes do not need to use the full height of the tower, since the + // probability of each successive level decreases exponentially. Because + // these elements are never accessed, they do not need to be allocated. + // Therefore, when a node is allocated in the arena, its memory footprint + // is deliberately truncated to not include unneeded tower elements. + // + // All accesses to elements should use CAS operations, with no need to lock. + tower [maxHeight]uint32 } -func NewSkipList() *SkipList { - source := rand.NewSource(time.Now().UnixNano()) - - return &SkipList{ - header: &Element{ - levels: make([]*Element, defaultMaxLevel), - entry: nil, - score: 0, - }, - rand: rand.New(source), - maxLevel: defaultMaxLevel, - length: 0, - } +type Skiplist struct { + height int32 // Current height. 1 <= height <= kMaxHeight. CAS. + headOffset uint32 + ref int32 + arena *Arena + OnClose func() } -type Element struct { - levels []*Element - entry *codec.Entry - score float64 +// IncrRef increases the refcount +func (s *Skiplist) IncrRef() { + atomic.AddInt32(&s.ref, 1) } -func newElement(score float64, entry *codec.Entry, level int) *Element { - return &Element{ - levels: make([]*Element, level), - entry: entry, - score: score, +// DecrRef decrements the refcount, deallocating the Skiplist when done using it +func (s *Skiplist) DecrRef() { + newRef := atomic.AddInt32(&s.ref, -1) + if newRef > 0 { + return + } + if s.OnClose != nil { + s.OnClose() } -} -func (elem *Element) Entry() *codec.Entry { - return elem.entry + // Indicate we are closed. Good for testing. Also, lets GC reclaim memory. Race condition + // here would suggest we are accessing skiplist when we are supposed to have no reference! 
+ s.arena = nil } -func (list *SkipList) Add(data *codec.Entry) error { - list.lock.Lock() - defer list.lock.Unlock() - score := list.calcScore(data.Key) - var elem *Element - - max := len(list.header.levels) - prevElem := list.header - - var prevElemHeaders [defaultMaxLevel]*Element - - for i := max - 1; i >= 0; { - //keep visit path here - prevElemHeaders[i] = prevElem +func newNode(arena *Arena, key []byte, v ValueStruct, height int) *node { + // The base level is already allocated in the node struct. + nodeOffset := arena.putNode(height) + keyOffset := arena.putKey(key) + val := encodeValue(arena.putVal(v), v.EncodedSize()) + + node := arena.getNode(nodeOffset) + node.keyOffset = keyOffset + node.keySize = uint16(len(key)) + node.height = uint16(height) + node.value = val + return node +} - for next := prevElem.levels[i]; next != nil; next = prevElem.levels[i] { - if comp := list.compare(score, data.Key, next); comp <= 0 { - if comp == 0 { - elem = next - elem.entry = data - list.size += elem.Entry().Size() - data.Size() - return nil - } +func encodeValue(valOffset uint32, valSize uint32) uint64 { + return uint64(valSize)<<32 | uint64(valOffset) +} - //find the insert position - break - } +func decodeValue(value uint64) (valOffset uint32, valSize uint32) { + valOffset = uint32(value) + valSize = uint32(value >> 32) + return +} - //just like linked-list next - prevElem = next - prevElemHeaders[i] = prevElem - } +// NewSkiplist makes a new empty skiplist, with a given arena size +func NewSkiplist(arenaSize int64) *Skiplist { + arena := newArena(arenaSize) + head := newNode(arena, nil, ValueStruct{}, maxHeight) + ho := arena.getNodeOffset(head) + return &Skiplist{ + height: 1, + headOffset: ho, + arena: arena, + ref: 1, + } +} - topLevel := prevElem.levels[i] +func (s *node) getValueOffset() (uint32, uint32) { + value := atomic.LoadUint64(&s.value) + return decodeValue(value) +} - //to skip same prevHeader's next and fill next elem into temp element - for i--; i 
>= 0 && prevElem.levels[i] == topLevel; i-- { - prevElemHeaders[i] = prevElem - } - } +func (s *node) key(arena *Arena) []byte { + return arena.getKey(s.keyOffset, s.keySize) +} - level := list.randLevel() +func (s *node) setValue(arena *Arena, vo uint64) { + atomic.StoreUint64(&s.value, vo) +} - elem = newElement(score, data, level) +func (s *node) getNextOffset(h int) uint32 { + return atomic.LoadUint32(&s.tower[h]) +} - //to add elem to the skiplist - for i := 0; i < level; i++ { - elem.levels[i] = prevElemHeaders[i].levels[i] - prevElemHeaders[i].levels[i] = elem - } - list.size += data.Size() - list.length++ - return nil +func (s *node) casNextOffset(h int, old, val uint32) bool { + return atomic.CompareAndSwapUint32(&s.tower[h], old, val) } -func (list *SkipList) Search(key []byte) (e *codec.Entry) { - list.lock.RLock() - defer list.lock.RUnlock() - if list.length == 0 { - return nil +// Returns true if key is strictly > n.key. +// If n is nil, this is an "end" marker and we return false. +//func (s *Skiplist) keyIsAfterNode(key []byte, n *node) bool { +// AssertTrue(n != s.head) +// return n != nil && CompareKeys(key, n.key) > 0 +//} + +func (s *Skiplist) randomHeight() int { + h := 1 + for h < maxHeight && FastRand() <= heightIncrease { + h++ } + return h +} - score := list.calcScore(key) +func (s *Skiplist) getNext(nd *node, height int) *node { + return s.arena.getNode(nd.getNextOffset(height)) +} - prevElem := list.header - i := len(list.header.levels) - 1 +func (s *Skiplist) getHead() *node { + return s.arena.getNode(s.headOffset) +} - for i >= 0 { - for next := prevElem.levels[i]; next != nil; next = prevElem.levels[i] { - if comp := list.compare(score, key, next); comp <= 0 { - if comp == 0 { - return next.Entry() - } - break +// findNear finds the node near to key. +// If less=true, it finds rightmost node such that node.key < key (if allowEqual=false) or +// node.key <= key (if allowEqual=true). 
+// If less=false, it finds leftmost node such that node.key > key (if allowEqual=false) or +// node.key >= key (if allowEqual=true). +// Returns the node found. The bool returned is true if the node has key equal to given key. +func (s *Skiplist) findNear(key []byte, less bool, allowEqual bool) (*node, bool) { + x := s.getHead() + level := int(s.getHeight() - 1) + for { + // Assume x.key < key. + next := s.getNext(x, level) + if next == nil { + // x.key < key < END OF LIST + if level > 0 { + // Can descend further to iterate closer to the end. + level-- + continue } - - prevElem = next + // Level=0. Cannot descend further. Let's return something that makes sense. + if !less { + return nil, false + } + // Try to return x. Make sure it is not a head node. + if x == s.getHead() { + return nil, false + } + return x, false } - topLevel := prevElem.levels[i] - - for i--; i >= 0 && prevElem.levels[i] == topLevel; i-- { + nextKey := next.key(s.arena) + cmp := CompareKeys(key, nextKey) + if cmp > 0 { + // x.key < next.key < key. We can continue to move right. + x = next + continue + } + if cmp == 0 { + // x.key < key == next.key. + if allowEqual { + return next, true + } + if !less { + // We want >, so go to base level to grab the next bigger note. + return s.getNext(next, 0), false + } + // We want <. If not base level, we should go closer in the next level. + if level > 0 { + level-- + continue + } + // On base level. Return x. + if x == s.getHead() { + return nil, false + } + return x, false + } + // cmp < 0. In other words, x.key < key < next. + if level > 0 { + level-- + continue + } + // At base level. Need to return something. + if !less { + return next, false + } + // Try to return x. Make sure it is not a head node. + if x == s.getHead() { + return nil, false + } + return x, false + } +} +// findSpliceForLevel returns (outBefore, outAfter) with outBefore.key <= key <= outAfter.key. +// The input "before" tells us where to start looking. 
+// If we found a node with the same key, then we return outBefore = outAfter. +// Otherwise, outBefore.key < key < outAfter.key. +func (s *Skiplist) findSpliceForLevel(key []byte, before uint32, level int) (uint32, uint32) { + for { + // Assume before.key < key. + beforeNode := s.arena.getNode(before) + next := beforeNode.getNextOffset(level) + nextNode := s.arena.getNode(next) + if nextNode == nil { + return before, next + } + nextKey := nextNode.key(s.arena) + cmp := CompareKeys(key, nextKey) + if cmp == 0 { + // Equality case. + return next, next + } + if cmp < 0 { + // before.key < key < next.key. We are done for this level. + return before, next } + before = next // Keep moving right on this level. } - return } -/*func (list *SkipList) Remove(key []byte) error { - score := list.calcScore(key) +func (s *Skiplist) getHeight() int32 { + return atomic.LoadInt32(&s.height) +} + +// Put inserts the key-value pair. +func (s *Skiplist) Add(e *Entry) { + // Since we allow overwrite, we may not need to create a new node. We might not even need to + // increase the height. Let's defer these actions. + key, v := e.Key, ValueStruct{ + Meta: e.Meta, + Value: e.Value, + ExpiresAt: e.ExpiresAt, + Version: e.Version, + } - max := len(list.header.levels) - prevElem := list.header + listHeight := s.getHeight() + var prev [maxHeight + 1]uint32 + var next [maxHeight + 1]uint32 + + prev[listHeight] = s.headOffset + for i := int(listHeight) - 1; i >= 0; i-- { + // Use higher level to speed up for current level. + prev[i], next[i] = s.findSpliceForLevel(key, prev[i+1], i) + if prev[i] == next[i] { + vo := s.arena.putVal(v) + encValue := encodeValue(vo, v.EncodedSize()) + prevNode := s.arena.getNode(prev[i]) + prevNode.setValue(s.arena, encValue) + return + } + } - var prevElemHeaders [defaultMaxLevel]*Element - var elem *Element + // We do need to create a new node. 
+ height := s.randomHeight() + x := newNode(s.arena, key, v, height) - for i := max - 1; i >= 0; { - //keep visit path here - prevElemHeaders[i] = prevElem + // Try to increase s.height via CAS. + listHeight = s.getHeight() + for height > int(listHeight) { + if atomic.CompareAndSwapInt32(&s.height, listHeight, int32(height)) { + // Successfully increased skiplist.height. + break + } + listHeight = s.getHeight() + } - for next := prevElem.levels[i]; next != nil; next = prevElem.levels[i] { - if comp := list.compare(score, key, next); comp <= 0 { - if comp == 0 { - elem = next - } + // We always insert from the base level and up. After you add a node in base level, we cannot + // create a node in the level above because it would have discovered the node in the base level. + for i := 0; i < height; i++ { + for { + if s.arena.getNode(prev[i]) == nil { + AssertTrue(i > 1) // This cannot happen in base level. + // We haven't computed prev, next for this level because height exceeds old listHeight. + // For these levels, we expect the lists to be sparse, so we can just search from head. + prev[i], next[i] = s.findSpliceForLevel(key, s.headOffset, i) + // Someone adds the exact same key before we are able to do so. This can only happen on + // the base level. But we know we are not on the base level. + AssertTrue(prev[i] != next[i]) + } + x.tower[i] = next[i] + pnode := s.arena.getNode(prev[i]) + if pnode.casNextOffset(i, next[i], s.arena.getNodeOffset(x)) { + // Managed to insert x between prev[i] and next[i]. Go to the next level. break } - - //just like linked-list next - prevElem = next - prevElemHeaders[i] = prevElem + // CAS failed. We need to recompute prev and next. + // It is unlikely to be helpful to try to use a different level as we redo the search, + // because it is unlikely that lots of nodes are inserted between prev[i] and next[i]. 
+ prev[i], next[i] = s.findSpliceForLevel(key, prev[i], i) + if prev[i] == next[i] { + AssertTruef(i == 0, "Equality can happen only on base level: %d", i) + vo := s.arena.putVal(v) + encValue := encodeValue(vo, v.EncodedSize()) + prevNode := s.arena.getNode(prev[i]) + prevNode.setValue(s.arena, encValue) + return + } } + } +} - topLevel := prevElem.levels[i] +// Empty returns if the Skiplist is empty. +func (s *Skiplist) Empty() bool { + return s.findLast() == nil +} - //to skip same prevHeader's next and fill next elem into temp element - for i--; i >= 0 && prevElem.levels[i] == topLevel; i-- { - prevElemHeaders[i] = prevElem +// findLast returns the last element. If head (empty list), we return nil. All the find functions +// will NEVER return the head nodes. +func (s *Skiplist) findLast() *node { + n := s.getHead() + level := int(s.getHeight()) - 1 + for { + next := s.getNext(n, level) + if next != nil { + n = next + continue } + if level == 0 { + if n == s.getHead() { + return nil + } + return n + } + level-- } +} - if elem == nil { - return nil +// Get gets the value associated with the key. It returns a valid value if it finds equal or earlier +// version of the same key. +func (s *Skiplist) Search(key []byte) ValueStruct { + n, _ := s.findNear(key, false, true) // findGreaterOrEqual. + if n == nil { + return ValueStruct{} } - prevTopLevel := len(elem.levels) - for i := 0; i < prevTopLevel; i++ { - prevElemHeaders[i].levels[i] = elem.levels[i] + nextKey := s.arena.getKey(n.keyOffset, n.keySize) + if !SameKey(key, nextKey) { + return ValueStruct{} } - list.length-- - return nil -}*/ + valOffset, valSize := n.getValueOffset() + vs := s.arena.getVal(valOffset, valSize) + vs.ExpiresAt = ParseTs(nextKey) + return vs +} -func (list *SkipList) Close() error { - return nil +// NewIterator returns a skiplist iterator. You have to Close() the iterator. 
+func (s *Skiplist) NewSkipListIterator() Iterator { + s.IncrRef() + return &SkipListIterator{list: s} } -func (list *SkipList) calcScore(key []byte) (score float64) { - var hash uint64 - l := len(key) +// MemSize returns the size of the Skiplist in terms of how much memory is used within its internal +// arena. +func (s *Skiplist) MemSize() int64 { return s.arena.size() } - if l > 8 { - l = 8 - } +// Iterator is an iterator over skiplist object. For new objects, you just +// need to initialize Iterator.list. +type SkipListIterator struct { + list *Skiplist + n *node +} + +func (s *SkipListIterator) Rewind() { + s.SeekToFirst() +} - for i := 0; i < l; i++ { - shift := uint(64 - 8 - i*8) - hash |= uint64(key[i]) << shift +func (s *SkipListIterator) Item() Item { + return &Entry{ + Key: s.Key(), + Value: s.Value().Value, + ExpiresAt: s.Value().ExpiresAt, + Meta: s.Value().Meta, + Version: s.Value().Version, } +} - score = float64(hash) - return +// Close frees the resources held by the iterator +func (s *SkipListIterator) Close() error { + s.list.DecrRef() + return nil } -func (list *SkipList) compare(score float64, key []byte, next *Element) int { - if score == next.score { - return bytes.Compare(key, next.entry.Key) - } +// Valid returns true iff the iterator is positioned at a valid node. +func (s *SkipListIterator) Valid() bool { return s.n != nil } - if score < next.score { - return -1 - } else { - return 1 - } +// Key returns the key at the current position. +func (s *SkipListIterator) Key() []byte { + //implement me here } -func (list *SkipList) randLevel() int { - if list.maxLevel <= 1 { - return 1 - } - i := 1 - for ; i < list.maxLevel; i++ { - if RandN(1000)%2 == 0 { - return i - } - } - return i +// Value returns value. +func (s *SkipListIterator) Value() ValueStruct { + //implement me here } -func (list *SkipList) Size() int64 { - return list.size +// ValueUint64 returns the uint64 value of the current node. 
+func (s *SkipListIterator) ValueUint64() uint64 { + return s.n.value } -type SkipListIter struct { - header *Element - elem *Element - lock sync.RWMutex +// Next advances to the next position. +func (s *SkipListIterator) Next() { + AssertTrue(s.Valid()) + s.n = s.list.getNext(s.n, 0) } -func (list *SkipList) NewSkipListIterator() iterator.Iterator { - return &SkipListIter{elem: list.header.levels[0], header: list.header} +// Prev advances to the previous position. +func (s *SkipListIterator) Prev() { + AssertTrue(s.Valid()) + s.n, _ = s.list.findNear(s.Key(), true, false) // find <. No equality allowed. } -func (iter *SkipListIter) Next() { - iter.lock.RLock() - defer iter.lock.RUnlock() - if iter.elem != nil { - iter.elem = iter.elem.levels[0] - } +// 找到 >= target 的第一个节点 +func (s *SkipListIterator) Seek(target []byte) { + //implement me here } -func (iter *SkipListIter) Valid() bool { - return iter.elem != nil + +// 找到 <= target 的第一个节点 +func (s *SkipListIterator) SeekForPrev(target []byte) { + //implement me here } -func (iter *SkipListIter) Rewind() { - iter.elem = iter.header + +//定位到链表的第一个节点 +func (s *SkipListIterator) SeekToFirst() { + //implement me here } -func (iter *SkipListIter) Item() iterator.Item { - return iter.elem + +// SeekToLast seeks position at the last entry in list. +// Final state of iterator is Valid() iff list is not empty. +func (s *SkipListIterator) SeekToLast() { + s.n = s.list.findLast() } -func (iter *SkipListIter) Close() error { - return nil + +// UniIterator is a unidirectional memtable iterator. It is a thin wrapper around +// Iterator. We like to keep Iterator as before, because it is more powerful and +// we might support bidirectional iterators in the future. +type UniIterator struct { + iter *Iterator + reversed bool +} + +// FastRand is a fast thread local random function. +//go:linkname FastRand runtime.fastrand +func FastRand() uint32 + +// AssertTruef is AssertTrue with extra info. 
+func AssertTruef(b bool, format string, args ...interface{}) { + if !b { + log.Fatalf("%+v", errors.Errorf(format, args...)) + } } diff --git a/utils/skiplist_test.go b/utils/skiplist_test.go index 89014c8..481fd1a 100644 --- a/utils/skiplist_test.go +++ b/utils/skiplist_test.go @@ -1,12 +1,26 @@ +// Copyright 2021 hardcore-os Project Authors +// +// Licensed under the Apache License, Version 2.0 (the "License") +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + package utils import ( "fmt" - "github.com/hardcore-os/corekv/utils/codec" - "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/require" "sync" "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" ) func RandString(len int) string { @@ -18,79 +32,55 @@ func RandString(len int) string { return string(bytes) } -func TestSkipList_compare(t *testing.T) { - list := SkipList{ - header: nil, - rand: nil, - maxLevel: 0, - length: 0, - } - - byte1 := []byte("1") - byte2 := []byte("2") - entry1 := codec.NewEntry(byte1, byte1) - - byte1score := list.calcScore(byte1) - byte2score := list.calcScore(byte2) - - elem := &Element{ - levels: nil, - entry: entry1, - score: byte2score, - } - - assert.Equal(t, list.compare(byte1score, byte1, elem), -1) -} - func TestSkipListBasicCRUD(t *testing.T) { - list := NewSkipList() + list := NewSkiplist(1000) //Put & Get - entry1 := codec.NewEntry([]byte("Key1"), []byte("Val1")) - assert.Nil(t, list.Add(entry1)) - assert.Equal(t, entry1.Value, list.Search(entry1.Key).Value) + entry1 := 
NewEntry([]byte(RandString(10)), []byte("Val1")) + list.Add(entry1) + vs := list.Search(entry1.Key) + assert.Equal(t, entry1.Value, vs.Value) - entry2 := codec.NewEntry([]byte("Key2"), []byte("Val2")) - assert.Nil(t, list.Add(entry2)) - assert.Equal(t, entry2.Value, list.Search(entry2.Key).Value) + entry2 := NewEntry([]byte(RandString(10)), []byte("Val2")) + list.Add(entry2) + vs = list.Search(entry2.Key) + assert.Equal(t, entry2.Value, vs.Value) //Get a not exist entry - assert.Nil(t, list.Search([]byte("noexist"))) + assert.Nil(t, list.Search([]byte(RandString(10))).Value) //Update a entry - entry2_new := codec.NewEntry([]byte("Key1"), []byte("Val1+1")) - assert.Nil(t, list.Add(entry2_new)) + entry2_new := NewEntry([]byte(RandString(10)), []byte("Val1+1")) + list.Add(entry2_new) assert.Equal(t, entry2_new.Value, list.Search(entry2_new.Key).Value) } func Benchmark_SkipListBasicCRUD(b *testing.B) { - list := NewSkipList() + list := NewSkiplist(100000000) key, val := "", "" - maxTime := 1000000 + maxTime := 1000 for i := 0; i < maxTime; i++ { //number := rand.Intn(10000) - key, val = fmt.Sprintf("Key%d", i), fmt.Sprintf("Val%d", i) - entry := codec.NewEntry([]byte(key), []byte(val)) - res := list.Add(entry) - assert.Equal(b, res, nil) + key, val = RandString(10), fmt.Sprintf("Val%d", i) + entry := NewEntry([]byte(key), []byte(val)) + list.Add(entry) searchVal := list.Search([]byte(key)) assert.Equal(b, searchVal.Value, []byte(val)) - } } func TestConcurrentBasic(t *testing.T) { const n = 1000 - l := NewSkipList() + l := NewSkiplist(100000000) var wg sync.WaitGroup key := func(i int) []byte { - return []byte(fmt.Sprintf("%05d", i)) + return []byte(fmt.Sprintf("Keykeykey%05d", i)) } for i := 0; i < n; i++ { wg.Add(1) go func(i int) { defer wg.Done() - assert.Nil(t, l.Add(codec.NewEntry(key(i), key(i)))) + l.Add(NewEntry(key(i), key(i))) }(i) } wg.Wait() @@ -101,10 +91,9 @@ func TestConcurrentBasic(t *testing.T) { go func(i int) { defer wg.Done() v := l.Search(key(i)) 
- if v != nil { - require.EqualValues(t, key(i), v.Value) - return - } + require.EqualValues(t, key(i), v.Value) + return + require.Nil(t, v) }(i) } @@ -113,16 +102,16 @@ func TestConcurrentBasic(t *testing.T) { func Benchmark_ConcurrentBasic(b *testing.B) { const n = 1000 - l := NewSkipList() + l := NewSkiplist(100000000) var wg sync.WaitGroup key := func(i int) []byte { - return []byte(fmt.Sprintf("%05d", i)) + return []byte(fmt.Sprintf("keykeykey%05d", i)) } for i := 0; i < n; i++ { wg.Add(1) go func(i int) { defer wg.Done() - assert.Nil(b, l.Add(codec.NewEntry(key(i), key(i)))) + l.Add(NewEntry(key(i), key(i))) }(i) } wg.Wait() @@ -133,12 +122,32 @@ func Benchmark_ConcurrentBasic(b *testing.B) { go func(i int) { defer wg.Done() v := l.Search(key(i)) - if v != nil { - require.EqualValues(b, key(i), v.Value) - return - } + require.EqualValues(b, key(i), v.Value) require.Nil(b, v) }(i) } wg.Wait() } + +func TestSkipListIterator(t *testing.T) { + list := NewSkiplist(100000) + + //Put & Get + entry1 := NewEntry([]byte(RandString(10)), []byte(RandString(10))) + list.Add(entry1) + assert.Equal(t, entry1.Value, list.Search(entry1.Key).Value) + + entry2 := NewEntry([]byte(RandString(10)), []byte(RandString(10))) + list.Add(entry2) + assert.Equal(t, entry2.Value, list.Search(entry2.Key).Value) + + //Update a entry + entry2_new := NewEntry([]byte(RandString(10)), []byte(RandString(10))) + list.Add(entry2_new) + assert.Equal(t, entry2_new.Value, list.Search(entry2_new.Key).Value) + + iter := list.NewSkipListIterator() + for iter.Rewind(); iter.Valid(); iter.Next() { + fmt.Printf("iter key %s, value %s", iter.Item().Entry().Key, iter.Item().Entry().Value) + } +} diff --git a/utils/slice.go b/utils/slice.go new file mode 100644 index 0000000..c453ae6 --- /dev/null +++ b/utils/slice.go @@ -0,0 +1,7 @@ +package utils + +// Slice holds a reusable buf, will reallocate if you request a larger size than ever before. 
+// One problem is with n distinct sizes in random order it'll reallocate log(n) times. +type Slice struct { + buf []byte +} diff --git a/utils/throttle.go b/utils/throttle.go new file mode 100644 index 0000000..c311408 --- /dev/null +++ b/utils/throttle.go @@ -0,0 +1,85 @@ +// Copyright 2021 hardcore-os Project Authors +// +// Licensed under the Apache License, Version 2.0 (the "License") +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +package utils + +import "sync" + +// Throttle allows a limited number of workers to run at a time. It also +// provides a mechanism to check for errors encountered by workers and wait for +// them to finish. +type Throttle struct { + once sync.Once + wg sync.WaitGroup + ch chan struct{} + errCh chan error + finishErr error +} + +// NewThrottle creates a new throttle with a max number of workers. +func NewThrottle(max int) *Throttle { + return &Throttle{ + ch: make(chan struct{}, max), + errCh: make(chan error, max), + } +} + +// Do should be called by workers before they start working. It blocks if there +// are already maximum number of workers working. If it detects an error from +// previously Done workers, it would return it. +func (t *Throttle) Do() error { + for { + select { + case t.ch <- struct{}{}: + t.wg.Add(1) + return nil + case err := <-t.errCh: + if err != nil { + return err + } + } + } +} + +// Done should be called by workers when they finish working. They can also +// pass the error status of work done. 
+func (t *Throttle) Done(err error) { + if err != nil { + t.errCh <- err + } + select { + case <-t.ch: + default: + panic("Throttle Do Done mismatch") + } + t.wg.Done() +} + +// Finish waits until all workers have finished working. It would return any error passed by Done. +// If Finish is called multiple time, it will wait for workers to finish only once(first time). +// From next calls, it will return same error as found on first call. +func (t *Throttle) Finish() error { + t.once.Do(func() { + t.wg.Wait() + close(t.ch) + close(t.errCh) + for err := range t.errCh { + if err != nil { + t.finishErr = err + return + } + } + }) + + return t.finishErr +} diff --git a/utils/tools.go b/utils/tools.go index 8efd417..8d68d6f 100644 --- a/utils/tools.go +++ b/utils/tools.go @@ -1,5 +1,26 @@ +// Copyright 2021 bardcckre-os Project Authors +// +// Licensed under the Apache License, Version 2.0 (the "License") +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + package utils func ValueSize(value []byte) int64 { return 0 } + +// Copy copies a byte slice and returns the copied slice. +func Copy(a []byte) []byte { + b := make([]byte, len(a)) + copy(b, a) + return b +} diff --git a/utils/value.go b/utils/value.go new file mode 100644 index 0000000..38ab0f9 --- /dev/null +++ b/utils/value.go @@ -0,0 +1,159 @@ +// Copyright 2021 hardcore-os Project Authors +// +// Licensed under the Apache License, Version 2.0 (the "License") +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package utils + +import ( + "encoding/binary" + "reflect" + "time" + "unsafe" +) + +const ( + // size of vlog header. + // +----------------+------------------+ + // | keyID(8 bytes) | baseIV(12 bytes)| + // +----------------+------------------+ + ValueLogHeaderSize = 20 + vptrSize = unsafe.Sizeof(ValuePtr{}) +) + +type ValuePtr struct { + Len uint32 + Offset uint32 + Fid uint32 +} + +func (p ValuePtr) Less(o *ValuePtr) bool { + if o == nil { + return false + } + if p.Fid != o.Fid { + return p.Fid < o.Fid + } + if p.Offset != o.Offset { + return p.Offset < o.Offset + } + return p.Len < o.Len +} + +func (p ValuePtr) IsZero() bool { + return p.Fid == 0 && p.Offset == 0 && p.Len == 0 +} + +// Encode encodes Pointer into byte buffer. +func (p ValuePtr) Encode() []byte { + b := make([]byte, vptrSize) + // Copy over the content from p to b. + *(*ValuePtr)(unsafe.Pointer(&b[0])) = p + return b +} + +// Decode decodes the value pointer into the provided byte buffer. +func (p *ValuePtr) Decode(b []byte) { + // Copy over data from b into p. Using *p=unsafe.pointer(...) 
leads to + copy(((*[vptrSize]byte)(unsafe.Pointer(p))[:]), b[:vptrSize]) +} +func IsValuePtr(e *Entry) bool { + return e.Meta&BitValuePointer > 0 +} + +// BytesToU32 converts the given byte slice to uint32 +func BytesToU32(b []byte) uint32 { + return binary.BigEndian.Uint32(b) +} + +// BytesToU64 _ +func BytesToU64(b []byte) uint64 { + return binary.BigEndian.Uint64(b) +} + +// U32SliceToBytes converts the given Uint32 slice to byte slice +func U32SliceToBytes(u32s []uint32) []byte { + if len(u32s) == 0 { + return nil + } + var b []byte + hdr := (*reflect.SliceHeader)(unsafe.Pointer(&b)) + hdr.Len = len(u32s) * 4 + hdr.Cap = hdr.Len + hdr.Data = uintptr(unsafe.Pointer(&u32s[0])) + return b +} + +// U32ToBytes converts the given Uint32 to bytes +func U32ToBytes(v uint32) []byte { + var uBuf [4]byte + binary.BigEndian.PutUint32(uBuf[:], v) + return uBuf[:] +} + +// U64ToBytes converts the given Uint64 to bytes +func U64ToBytes(v uint64) []byte { + var uBuf [8]byte + binary.BigEndian.PutUint64(uBuf[:], v) + return uBuf[:] +} + +// BytesToU32Slice converts the given byte slice to uint32 slice +func BytesToU32Slice(b []byte) []uint32 { + if len(b) == 0 { + return nil + } + var u32s []uint32 + hdr := (*reflect.SliceHeader)(unsafe.Pointer(&u32s)) + hdr.Len = len(b) / 4 + hdr.Cap = hdr.Len + hdr.Data = uintptr(unsafe.Pointer(&b[0])) + return u32s +} + +// ValuePtrCodec _ +func ValuePtrCodec(vp *ValuePtr) []byte { + return []byte{} +} + +// RunCallback _ +func RunCallback(cb func()) { + if cb != nil { + cb() + } +} + +func IsDeletedOrExpired(meta byte, expiresAt uint64) bool { + if meta&BitDelete > 0 { + return true + } + if expiresAt == 0 { + return false + } + return expiresAt <= uint64(time.Now().Unix()) +} + +func DiscardEntry(e, vs *Entry) bool { + // TODO 版本这个信息应该被弱化掉 在后面上MVCC或者多版本查询的时候再考虑 + // if vs.Version != ParseTs(e.Key) { + // // Version not found. Discard. 
+ // return true + // } + if IsDeletedOrExpired(vs.Meta, vs.ExpiresAt) { + return true + } + if (vs.Meta & BitValuePointer) == 0 { + // Key also stores the value in LSM. Discard. + return true + } + return false +} diff --git a/utils/wal.go b/utils/wal.go new file mode 100644 index 0000000..a31daff --- /dev/null +++ b/utils/wal.go @@ -0,0 +1,155 @@ +// Copyright 2021 logicrec Project Authors +// +// Licensed under the Apache License, Version 2.0 (the "License") +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package utils + +import ( + "bytes" + "encoding/binary" + "hash" + "hash/crc32" + "io" +) + +// LogEntry +type LogEntry func(e *Entry, vp *ValuePtr) error + +type WalHeader struct { + KeyLen uint32 + ValueLen uint32 + Meta byte + ExpiresAt uint64 +} + +const maxHeaderSize int = 21 + +func (h WalHeader) Encode(out []byte) int { + index := 0 + index = binary.PutUvarint(out[index:], uint64(h.KeyLen)) + index += binary.PutUvarint(out[index:], uint64(h.ValueLen)) + index += binary.PutUvarint(out[index:], uint64(h.Meta)) + index += binary.PutUvarint(out[index:], h.ExpiresAt) + return index +} + +func (h *WalHeader) Decode(reader *HashReader) (int, error) { + var err error + + klen, err := binary.ReadUvarint(reader) + if err != nil { + return 0, err + } + h.KeyLen = uint32(klen) + + vlen, err := binary.ReadUvarint(reader) + if err != nil { + return 0, err + } + h.ValueLen = uint32(vlen) + + meta, err := binary.ReadUvarint(reader) + if err != nil { + return 0, err + } + h.Meta = byte(meta) + h.ExpiresAt, err = 
binary.ReadUvarint(reader) + if err != nil { + return 0, err + } + return reader.BytesRead, nil +} + +// WalCodec 写入wal文件的编码 +// | header | key | value | crc32 | +func WalCodec(buf *bytes.Buffer, e *Entry) int { + buf.Reset() + h := WalHeader{ + KeyLen: uint32(len(e.Key)), + ValueLen: uint32(len(e.Value)), + ExpiresAt: e.ExpiresAt, + } + + hash := crc32.New(CastagnoliCrcTable) + writer := io.MultiWriter(buf, hash) + + // encode header. + var headerEnc [maxHeaderSize]byte + sz := h.Encode(headerEnc[:]) + Panic2(writer.Write(headerEnc[:sz])) + Panic2(writer.Write(e.Key)) + Panic2(writer.Write(e.Value)) + // write crc32 hash. + var crcBuf [crc32.Size]byte + binary.BigEndian.PutUint32(crcBuf[:], hash.Sum32()) + Panic2(buf.Write(crcBuf[:])) + // return encoded length. + return len(headerEnc[:sz]) + len(e.Key) + len(e.Value) + len(crcBuf) +} + +// EstimateWalCodecSize 预估当前kv 写入wal文件占用的空间大小 +func EstimateWalCodecSize(e *Entry) int { + return len(e.Key) + len(e.Value) + 8 /* ExpiresAt uint64 */ + + crc32.Size + maxHeaderSize +} + +type HashReader struct { + R io.Reader + H hash.Hash32 + BytesRead int // Number of bytes read. +} + +func NewHashReader(r io.Reader) *HashReader { + hash := crc32.New(CastagnoliCrcTable) + return &HashReader{ + R: r, + H: hash, + } +} + +// Read reads len(p) bytes from the reader. Returns the number of bytes read, error on failure. +func (t *HashReader) Read(p []byte) (int, error) { + n, err := t.R.Read(p) + if err != nil { + return n, err + } + t.BytesRead += n + return t.H.Write(p[:n]) +} + +// ReadByte reads exactly one byte from the reader. Returns error on failure. +func (t *HashReader) ReadByte() (byte, error) { + b := make([]byte, 1) + _, err := t.Read(b) + return b[0], err +} + +// Sum32 returns the sum32 of the underlying hash. 
+func (t *HashReader) Sum32() uint32 { + return t.H.Sum32() +} + +// IsZero _ +func (e *Entry) IsZero() bool { + return len(e.Key) == 0 +} + +// LogHeaderLen _ +func (e *Entry) LogHeaderLen() int { + return e.Hlen +} + +// LogOffset _ +func (e *Entry) LogOffset() uint32 { + return e.Offset +} diff --git a/vlog.go b/vlog.go new file mode 100644 index 0000000..4a0a2a7 --- /dev/null +++ b/vlog.go @@ -0,0 +1,1270 @@ +// Copyright 2021 hardcore-os Project Authors +// +// Licensed under the Apache License, Version 2.0 (the "License") +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package corekv + +import ( + "bufio" + "bytes" + "encoding/json" + "fmt" + "hash/crc32" + "io" + "io/ioutil" + "math" + "math/rand" + "os" + "sort" + "strconv" + "strings" + "sync" + "sync/atomic" + "time" + + "github.com/hardcore-os/corekv/file" + "github.com/hardcore-os/corekv/utils" + "github.com/pkg/errors" +) + +const discardStatsFlushThreshold = 100 + +var lfDiscardStatsKey = []byte("!corekv!discard") // For storing lfDiscardStats + +// valueLog +type valueLog struct { + dirPath string + + // guards our view of which files exist, which to be deleted, how many active iterators + filesLock sync.RWMutex + filesMap map[uint32]*file.LogFile + maxFid uint32 + filesToBeDeleted []uint32 + // A refcount of iterators -- when this hits zero, we can delete the filesToBeDeleted. + numActiveIterators int32 + + db *DB + writableLogOffset uint32 // read by read, written by write. Must access via atomics. 
+ numEntriesWritten uint32 + opt Options + + garbageCh chan struct{} + lfDiscardStats *lfDiscardStats +} + +func (vlog *valueLog) newValuePtr(e *utils.Entry) (*utils.ValuePtr, error) { + // TODO 尝试使用对象复用,后面entry对象也应该使用 + req := requestPool.Get().(*request) + req.reset() + req.Entries = []*utils.Entry{e} + req.Wg.Add(1) + req.IncrRef() // for db write + defer req.DecrRef() + err := vlog.write([]*request{req}) + return req.Ptrs[0], err +} +func (vlog *valueLog) open(db *DB, ptr *utils.ValuePtr, replayFn utils.LogEntry) error { + vlog.lfDiscardStats.closer.Add(1) + go vlog.flushDiscardStats() + if err := vlog.populateFilesMap(); err != nil { + return err + } + // If no files are found, then create a new file. + if len(vlog.filesMap) == 0 { + _, err := vlog.createVlogFile(0) + return utils.WarpErr("Error while creating log file in valueLog.open", err) + } + fids := vlog.sortedFids() + for _, fid := range fids { + lf, ok := vlog.filesMap[fid] + utils.CondPanic(!ok, fmt.Errorf("vlog.filesMap[fid] fid not found")) + var err error + if err = lf.Open( + &file.Options{ + FID: uint64(fid), + FileName: vlog.fpath(fid), + Dir: vlog.dirPath, + Path: vlog.dirPath, + MaxSz: 2 * vlog.db.opt.ValueLogFileSize, + }); err != nil { + return errors.Wrapf(err, "Open existing file: %q", lf.FileName()) + } + var offset uint32 + // 从head处开始重放vlog日志,而不是从第一条日志 + // head 相当于一个快照 + if fid == ptr.Fid { + offset = ptr.Offset + ptr.Len + } + fmt.Printf("Replaying file id: %d at offset: %d\n", fid, offset) + now := time.Now() + // 重放日志 + if err := vlog.replayLog(lf, offset, replayFn); err != nil { + // Log file is corrupted. Delete it. + if err == utils.ErrDeleteVlogFile { + delete(vlog.filesMap, fid) + // Close the fd of the file before deleting the file otherwise windows complaints. 
+ if err := lf.Close(); err != nil { + return errors.Wrapf(err, "failed to close vlog file %s", lf.FileName()) + } + path := vlog.fpath(lf.FID) + if err := os.Remove(path); err != nil { + return errors.Wrapf(err, "failed to delete empty value log file: %q", path) + } + continue + } + return err + } + fmt.Printf("Replay took: %s\n", time.Since(now)) + + if fid < vlog.maxFid { + // This file has been replayed. It can now be mmapped. + // For maxFid, the mmap would be done by the specially written code below. + if err := lf.Init(); err != nil { + return err + } + } + } + // Seek to the end to start writing. + last, ok := vlog.filesMap[vlog.maxFid] + utils.CondPanic(!ok, errors.New("vlog.filesMap[vlog.maxFid] not found")) + lastOffset, err := last.Seek(0, io.SeekEnd) + if err != nil { + return errors.Wrapf(err, fmt.Sprintf("file.Seek to end path:[%s]", last.FileName())) + } + vlog.writableLogOffset = uint32(lastOffset) + + // head的设计起到check point的作用 + vlog.db.vhead = &utils.ValuePtr{Fid: vlog.maxFid, Offset: uint32(lastOffset)} + if err := vlog.populateDiscardStats(); err != nil { + fmt.Errorf("Failed to populate discard stats: %s\n", err) + } + return nil +} + +// Read reads the value log at a given location. +// TODO: Make this read private. +func (vlog *valueLog) read(vp *utils.ValuePtr) ([]byte, func(), error) { + buf, lf, err := vlog.readValueBytes(vp) + // log file is locked so, decide whether to lock immediately or let the caller to + // unlock it, after caller uses it. + cb := vlog.getUnlockCallback(lf) + if err != nil { + return nil, cb, err + } + + if vlog.opt.VerifyValueChecksum { + hash := crc32.New(utils.CastagnoliCrcTable) + if _, err := hash.Write(buf[:len(buf)-crc32.Size]); err != nil { + utils.RunCallback(cb) + return nil, nil, errors.Wrapf(err, "failed to write hash for vp %+v", vp) + } + // Fetch checksum from the end of the buffer. 
+ checksum := buf[len(buf)-crc32.Size:] + if hash.Sum32() != utils.BytesToU32(checksum) { + utils.RunCallback(cb) + return nil, nil, errors.Wrapf(utils.ErrChecksumMismatch, "value corrupted for vp: %+v", vp) + } + } + var h utils.Header + headerLen := h.Decode(buf) + kv := buf[headerLen:] + if uint32(len(kv)) < h.KLen+h.VLen { + fmt.Errorf("Invalid read: vp: %+v\n", vp) + return nil, nil, errors.Errorf("Invalid read: Len: %d read at:[%d:%d]", + len(kv), h.KLen, h.KLen+h.VLen) + } + return kv[h.KLen : h.KLen+h.VLen], cb, nil +} + +// write 并不是并发安全的 +func (vlog *valueLog) write(reqs []*request) error { + // 需要检查是否能够正确写入 + if err := vlog.validateWrites(reqs); err != nil { + return err + } + + vlog.filesLock.RLock() + maxFid := vlog.maxFid + curlf := vlog.filesMap[maxFid] + vlog.filesLock.RUnlock() + + var buf bytes.Buffer + flushWrites := func() error { + if buf.Len() == 0 { + return nil + } + data := buf.Bytes() + offset := vlog.woffset() + if err := curlf.Write(offset, data); err != nil { + return errors.Wrapf(err, "Unable to write to value log file: %q", curlf.FileName()) + } + buf.Reset() + atomic.AddUint32(&vlog.writableLogOffset, uint32(len(data))) + curlf.AddSize(vlog.writableLogOffset) + return nil + } + toDisk := func() error { + if err := flushWrites(); err != nil { + return err + } + // 切分vlog文件 + if vlog.woffset() > uint32(vlog.opt.ValueLogFileSize) || + vlog.numEntriesWritten > vlog.opt.ValueLogMaxEntries { + if err := curlf.DoneWriting(vlog.woffset()); err != nil { + return err + } + + newid := atomic.AddUint32(&vlog.maxFid, 1) + utils.CondPanic(newid <= 0, fmt.Errorf("newid has overflown uint32: %v", newid)) + newlf, err := vlog.createVlogFile(newid) + if err != nil { + return err + } + curlf = newlf + atomic.AddInt32(&vlog.db.logRotates, 1) + } + return nil + } + for i := range reqs { + b := reqs[i] + b.Ptrs = b.Ptrs[:0] + var written int + for j := range b.Entries { + e := b.Entries[j] + if vlog.db.shouldWriteValueToLSM(e) { + b.Ptrs = append(b.Ptrs, 
&utils.ValuePtr{}) + continue + } + var p utils.ValuePtr + + p.Fid = curlf.FID + // Use the offset including buffer length so far. + p.Offset = vlog.woffset() + uint32(buf.Len()) + plen, err := curlf.EncodeEntry(e, &buf, p.Offset) // Now encode the entry into buffer. + if err != nil { + return err + } + p.Len = uint32(plen) + b.Ptrs = append(b.Ptrs, &p) + written++ + + if buf.Len() > vlog.db.opt.ValueLogFileSize { + if err := flushWrites(); err != nil { + return err + } + } + } + vlog.numEntriesWritten += uint32(written) + // We write to disk here so that all entries that are part of the same transaction are + // written to the same vlog file. + writeNow := + vlog.woffset()+uint32(buf.Len()) > uint32(vlog.opt.ValueLogFileSize) || + vlog.numEntriesWritten > uint32(vlog.opt.ValueLogMaxEntries) + if writeNow { + if err := toDisk(); err != nil { + return err + } + } + } + return toDisk() +} + +func (vlog *valueLog) close() error { + if vlog == nil || vlog.db == nil { + return nil + } + // close flushDiscardStats. + <-vlog.lfDiscardStats.closer.CloseSignal + var err error + for id, f := range vlog.filesMap { + f.Lock.Lock() // We won’t release the lock. + maxFid := vlog.maxFid + // TODO(ibrahim) - Do we need the following truncations on non-windows + // platforms? We expand the file only on windows and the vlog.woffset() + // should point to end of file on all other platforms. + if id == maxFid { + // truncate writable log file to correct offset. + if truncErr := f.Truncate(int64(vlog.woffset())); truncErr != nil && err == nil { + err = truncErr + } + } + if closeErr := f.Close(); closeErr != nil && err == nil { + err = closeErr + } + f.Lock.Unlock() + } + return err +} + +func (vlog *valueLog) runGC(discardRatio float64, head *utils.ValuePtr) error { + select { + case vlog.garbageCh <- struct{}{}: + // Pick a log file for GC. 
+ defer func() { + // 通过一个channel来控制一次仅运行一个GC任务 + <-vlog.garbageCh + }() + + var err error + files := vlog.pickLog(head) + if len(files) == 0 { + return utils.ErrNoRewrite + } + tried := make(map[uint32]bool) + for _, lf := range files { + //消重一下,防止随机策略和统计策略返回同一个fid + if _, done := tried[lf.FID]; done { + continue + } + tried[lf.FID] = true + if err = vlog.doRunGC(lf, discardRatio); err == nil { + return nil + } + } + return err + default: + return utils.ErrRejected + } +} + +func (vlog *valueLog) doRunGC(lf *file.LogFile, discardRatio float64) (err error) { + // 退出的时候把统计的discard清空 + defer func() { + if err == nil { + vlog.lfDiscardStats.Lock() + delete(vlog.lfDiscardStats.m, lf.FID) + vlog.lfDiscardStats.Unlock() + } + }() + s := &sampler{ + lf: lf, + countRatio: 0.01, // 1% of num entries. + sizeRatio: 0.1, // 10% of the file as window. + fromBeginning: false, + } + + if _, err = vlog.sample(s, discardRatio); err != nil { + return err + } + + if err = vlog.rewrite(lf); err != nil { + return err + } + return nil +} + +//重写 +func (vlog *valueLog) rewrite(f *file.LogFile) error { + vlog.filesLock.RLock() + maxFid := vlog.maxFid + vlog.filesLock.RUnlock() + utils.CondPanic(uint32(f.FID) >= maxFid, fmt.Errorf("fid to move: %d. Current max fid: %d", f.FID, maxFid)) + + wb := make([]*utils.Entry, 0, 1000) + var size int64 + + var count, moved int + fe := func(e *utils.Entry) error { + count++ + if count%100000 == 0 { + fmt.Printf("Processing entry %d\n", count) + } + + vs, err := vlog.db.lsm.Get(e.Key) + if err != nil { + return err + } + if utils.DiscardEntry(e, vs) { + return nil + } + + if len(vs.Value) == 0 { + return errors.Errorf("Empty value: %+v", vs) + } + var vp utils.ValuePtr + vp.Decode(vs.Value) + + if vp.Fid > f.FID { + return nil + } + if vp.Offset > e.Offset { + return nil + } + // 如果从lsm和vlog的同一个位置读取带entry则重新写回,也有可能读取到旧的 + if vp.Fid == f.FID && vp.Offset == e.Offset { + moved++ + // This new entry only contains the key, and a pointer to the value. 
+ ne := new(utils.Entry) + ne.Meta = 0 // Remove all bits. Different keyspace doesn't need these bits. + ne.ExpiresAt = e.ExpiresAt + ne.Key = append([]byte{}, e.Key...) + ne.Value = append([]byte{}, e.Value...) + es := int64(ne.EstimateSize(vlog.db.opt.ValueLogFileSize)) + // Consider size of value as well while considering the total size + // of the batch. There have been reports of high memory usage in + // rewrite because we don't consider the value size. See #1292. + es += int64(len(e.Value)) + + // Ensure length and size of wb is within transaction limits. + if int64(len(wb)+1) >= vlog.opt.MaxBatchCount || + size+es >= vlog.opt.MaxBatchSize { + if err := vlog.db.batchSet(wb); err != nil { + return err + } + size = 0 + wb = wb[:0] + } + wb = append(wb, ne) + size += es + } + return nil + } + + _, err := vlog.iterate(f, 0, func(e *utils.Entry, vp *utils.ValuePtr) error { + return fe(e) + }) + if err != nil { + return err + } + + batchSize := 1024 + var loops int + for i := 0; i < len(wb); { + loops++ + if batchSize == 0 { + return utils.ErrNoRewrite + } + end := i + batchSize + if end > len(wb) { + end = len(wb) + } + if err := vlog.db.batchSet(wb[i:end]); err != nil { + if err == utils.ErrTxnTooBig { + // Decrease the batch size to half. + batchSize = batchSize / 2 + continue + } + return err + } + i += batchSize + } + var deleteFileNow bool + // Entries written to LSM. Remove the older file now. + { + vlog.filesLock.Lock() + // Just a sanity-check. 
+ if _, ok := vlog.filesMap[f.FID]; !ok { + vlog.filesLock.Unlock() + return errors.Errorf("Unable to find fid: %d", f.FID) + } + if vlog.iteratorCount() == 0 { + delete(vlog.filesMap, f.FID) + //deleteFileNow = true + } else { + vlog.filesToBeDeleted = append(vlog.filesToBeDeleted, f.FID) + } + vlog.filesLock.Unlock() + } + + if deleteFileNow { + if err := vlog.deleteLogFile(f); err != nil { + return err + } + } + + return nil +} + +func (vlog *valueLog) iteratorCount() int { + return int(atomic.LoadInt32(&vlog.numActiveIterators)) +} + +// TODO 在迭代器close时,需要调用此函数,关闭已经被判定需要移除的logfile +func (vlog *valueLog) decrIteratorCount() error { + num := atomic.AddInt32(&vlog.numActiveIterators, -1) + if num != 0 { + return nil + } + + vlog.filesLock.Lock() + lfs := make([]*file.LogFile, 0, len(vlog.filesToBeDeleted)) + for _, id := range vlog.filesToBeDeleted { + lfs = append(lfs, vlog.filesMap[id]) + delete(vlog.filesMap, id) + } + vlog.filesToBeDeleted = nil + vlog.filesLock.Unlock() + + for _, lf := range lfs { + if err := vlog.deleteLogFile(lf); err != nil { + return err + } + } + return nil +} + +func (vlog *valueLog) deleteLogFile(lf *file.LogFile) error { + if lf == nil { + return nil + } + lf.Lock.Lock() + defer lf.Lock.Unlock() + utils.Err(lf.Close()) + return os.Remove(lf.FileName()) +} + +// validateWrites 可以检查当前的req是否能写入vlog日志,一个vlog日志最大4GB +func (vlog *valueLog) validateWrites(reqs []*request) error { + vlogOffset := uint64(vlog.woffset()) + for _, req := range reqs { + // calculate size of the request. + size := estimateRequestSize(req) + estimatedVlogOffset := vlogOffset + size + if estimatedVlogOffset > uint64(utils.MaxVlogFileSize) { + return errors.Errorf("Request size offset %d is bigger than maximum offset %d", + estimatedVlogOffset, utils.MaxVlogFileSize) + } + + if estimatedVlogOffset >= uint64(vlog.opt.ValueLogFileSize) { + // We'll create a new vlog file if the estimated offset is greater or equal to + // max vlog size. So, resetting the vlogOffset. 
+ vlogOffset = 0 + continue + } + // Estimated vlog offset will become current vlog offset if the vlog is not rotated. + vlogOffset = estimatedVlogOffset + } + return nil +} + +// estimateRequestSize returns the size that needed to be written for the given request. +func estimateRequestSize(req *request) uint64 { + size := uint64(0) + for _, e := range req.Entries { + size += uint64(utils.MaxHeaderSize + len(e.Key) + len(e.Value) + crc32.Size) + } + return size +} + +// getUnlockCallback will returns a function which unlock the logfile if the logfile is mmaped. +// otherwise, it unlock the logfile and return nil. +func (vlog *valueLog) getUnlockCallback(lf *file.LogFile) func() { + if lf == nil { + return nil + } + return lf.Lock.RUnlock +} + +// readValueBytes return vlog entry slice and read locked log file. Caller should take care of +// logFile unlocking. +func (vlog *valueLog) readValueBytes(vp *utils.ValuePtr) ([]byte, *file.LogFile, error) { + lf, err := vlog.getFileRLocked(vp) + if err != nil { + return nil, nil, err + } + + buf, err := lf.Read(vp) + return buf, lf, err +} + +// Gets the logFile and acquires and RLock() for the mmap. You must call RUnlock on the file +// (if non-nil) +func (vlog *valueLog) getFileRLocked(vp *utils.ValuePtr) (*file.LogFile, error) { + vlog.filesLock.RLock() + defer vlog.filesLock.RUnlock() + ret, ok := vlog.filesMap[vp.Fid] + if !ok { + // log file has gone away, we can't do anything. Return. + return nil, errors.Errorf("file with ID: %d not found", vp.Fid) + } + + // Check for valid offset if we are reading from writable log. 
+ maxFid := vlog.maxFid + if vp.Fid == maxFid { + currentOffset := vlog.woffset() + if vp.Offset >= currentOffset { + return nil, errors.Errorf( + "Invalid value pointer offset: %d greater than current offset: %d", + vp.Offset, currentOffset) + } + } + + ret.Lock.RLock() + return ret, nil +} + +func (vlog *valueLog) woffset() uint32 { + return atomic.LoadUint32(&vlog.writableLogOffset) +} + +func (vlog *valueLog) populateFilesMap() error { + vlog.filesMap = make(map[uint32]*file.LogFile) + + files, err := ioutil.ReadDir(vlog.dirPath) + if err != nil { + return utils.WarpErr(fmt.Sprintf("Unable to open log dir. path[%s]", vlog.dirPath), err) + } + + found := make(map[uint64]struct{}) + for _, f := range files { + if !strings.HasSuffix(f.Name(), ".vlog") { + continue + } + fsz := len(f.Name()) + fid, err := strconv.ParseUint(f.Name()[:fsz-5], 10, 32) + if err != nil { + return utils.WarpErr(fmt.Sprintf("Unable to parse log id. name:[%s]", f.Name()), err) + } + if _, ok := found[fid]; ok { + return utils.WarpErr(fmt.Sprintf("Duplicate file found. Please delete one. 
name:[%s]", f.Name()), errors.New("duplicate fid")) // err is nil here; supply a real error so the duplicate actually aborts the scan
+		}
+		found[fid] = struct{}{}
+
+		lf := &file.LogFile{
+			FID:  uint32(fid),
+			Lock: sync.RWMutex{},
+		}
+		vlog.filesMap[uint32(fid)] = lf
+		if vlog.maxFid < uint32(fid) {
+			vlog.maxFid = uint32(fid)
+		}
+	}
+	return nil
+}
+
+func (vlog *valueLog) createVlogFile(fid uint32) (*file.LogFile, error) {
+	path := vlog.fpath(fid)
+
+	lf := &file.LogFile{
+		FID:  fid,
+		Lock: sync.RWMutex{},
+	}
+
+	var err error
+	utils.Panic2(nil, lf.Open(&file.Options{
+		FID:      uint64(fid),
+		FileName: path,
+		Dir:      vlog.dirPath,
+		Path:     vlog.dirPath,
+		MaxSz:    2 * vlog.db.opt.ValueLogFileSize,
+	}))
+
+	removeFile := func() {
+		// 如果处理出错 则直接删除文件
+		utils.Err(os.Remove(lf.FileName()))
+	}
+
+	if err = lf.Bootstrap(); err != nil {
+		removeFile()
+		return nil, err
+	}
+
+	if err = utils.SyncDir(vlog.dirPath); err != nil {
+		removeFile()
+		return nil, utils.WarpErr(fmt.Sprintf("Sync value log dir[%s]", vlog.dirPath), err)
+	}
+	vlog.filesLock.Lock()
+	vlog.filesMap[fid] = lf
+	vlog.maxFid = fid
+	// 现在header才是0
+	atomic.StoreUint32(&vlog.writableLogOffset, utils.VlogHeaderSize)
+	vlog.numEntriesWritten = 0
+	vlog.filesLock.Unlock()
+	return lf, nil
+}
+
+// sortedFids returns the file id's not pending deletion, sorted. Assumes we have shared access to
+// filesMap.
+func (vlog *valueLog) sortedFids() []uint32 {
+	toBeDeleted := make(map[uint32]struct{})
+	for _, fid := range vlog.filesToBeDeleted {
+		toBeDeleted[fid] = struct{}{}
+	}
+	ret := make([]uint32, 0, len(vlog.filesMap))
+	for fid := range vlog.filesMap {
+		if _, ok := toBeDeleted[fid]; !ok {
+			ret = append(ret, fid)
+		}
+	}
+	sort.Slice(ret, func(i, j int) bool {
+		return ret[i] < ret[j]
+	})
+	return ret
+}
+
+func (vlog *valueLog) replayLog(lf *file.LogFile, offset uint32, replayFn utils.LogEntry) error {
+	// Alright, let's iterate now.
+ endOffset, err := vlog.iterate(lf, offset, replayFn) + if err != nil { + return errors.Wrapf(err, "Unable to replay logfile:[%s]", lf.FileName()) + } + if int64(endOffset) == int64(lf.Size()) { + return nil + } + + // TODO: 如果vlog日志损坏怎么办? 当前默认是截断损坏的数据 + + // The entire file should be truncated (i.e. it should be deleted). + // If fid == maxFid then it's okay to truncate the entire file since it will be + // used for future additions. Also, it's okay if the last file has size zero. + // We mmap 2*opt.ValueLogSize for the last file. See vlog.Open() function + // if endOffset <= vlogHeaderSize && lf.fid != vlog.maxFid { + + if endOffset <= utils.VlogHeaderSize { + if lf.FID != vlog.maxFid { + return utils.ErrDeleteVlogFile + } + return lf.Bootstrap() + } + + fmt.Printf("Truncating vlog file %s to offset: %d\n", lf.FileName(), endOffset) + if err := lf.Truncate(int64(endOffset)); err != nil { + return utils.WarpErr( + fmt.Sprintf("Truncation needed at offset %d. Can be done manually as well.", endOffset), err) + } + return nil +} + +// iterate iterates over log file. It doesn't not allocate new memory for every kv pair. +// Therefore, the kv pair is only valid for the duration of fn call. +func (vlog *valueLog) iterate(lf *file.LogFile, offset uint32, fn utils.LogEntry) (uint32, error) { + if offset == 0 { + offset = utils.VlogHeaderSize + } + if int64(offset) == int64(lf.Size()) { + // We're at the end of the file already. No need to do anything. + return offset, nil + } + + // We're not at the end of the file. Let's Seek to the offset and start reading. 
+ if _, err := lf.Seek(int64(offset), io.SeekStart); err != nil { + return 0, errors.Wrapf(err, "Unable to seek, name:%s", lf.FileName()) + } + + reader := bufio.NewReader(lf.FD()) + read := &safeRead{ + k: make([]byte, 10), + v: make([]byte, 10), + recordOffset: offset, + lf: lf, + } + + var validEndOffset uint32 = offset + +loop: + for { + e, err := read.Entry(reader) + switch { + case err == io.EOF: + break loop + case err == io.ErrUnexpectedEOF || err == utils.ErrTruncate: + break loop + case err != nil: + return 0, err + case e == nil: + continue + } + + var vp utils.ValuePtr + vp.Len = uint32(int(e.Hlen) + len(e.Key) + len(e.Value) + crc32.Size) + read.recordOffset += vp.Len + + vp.Offset = e.Offset + vp.Fid = lf.FID + validEndOffset = read.recordOffset + if err := fn(e, &vp); err != nil { + if err == utils.ErrStop { + break + } + return 0, utils.WarpErr(fmt.Sprintf("Iteration function %s", lf.FileName()), err) + } + } + return validEndOffset, nil +} + +// 这个对象用来重放日志 +type safeRead struct { + k []byte + v []byte + recordOffset uint32 + lf *file.LogFile +} + +// Entry reads an entry from the provided reader. It also validates the checksum for every entry +// read. Returns error on failure. +func (r *safeRead) Entry(reader io.Reader) (*utils.Entry, error) { + tee := utils.NewHashReader(reader) + var h utils.Header + hlen, err := h.DecodeFrom(tee) + if err != nil { + return nil, err + } + if h.KLen > uint32(1<<16) { // Key length must be below uint16. 
+ return nil, utils.ErrTruncate + } + kl := int(h.KLen) + if cap(r.k) < kl { + r.k = make([]byte, 2*kl) + } + vl := int(h.VLen) + if cap(r.v) < vl { + r.v = make([]byte, 2*vl) + } + + e := &utils.Entry{} + e.Offset = r.recordOffset + e.Hlen = hlen + buf := make([]byte, h.KLen+h.VLen) + if _, err := io.ReadFull(tee, buf[:]); err != nil { + if err == io.EOF { + err = utils.ErrTruncate + } + return nil, err + } + + e.Key = buf[:h.KLen] + e.Value = buf[h.KLen:] + var crcBuf [crc32.Size]byte + if _, err := io.ReadFull(reader, crcBuf[:]); err != nil { + if err == io.EOF { + err = utils.ErrTruncate + } + return nil, err + } + crc := utils.BytesToU32(crcBuf[:]) + if crc != tee.Sum32() { + return nil, utils.ErrTruncate + } + e.Meta = h.Meta + e.ExpiresAt = h.ExpiresAt + return e, nil +} + +// 统计脏数据 +func (vlog *valueLog) populateDiscardStats() error { + key := utils.KeyWithTs(lfDiscardStatsKey, math.MaxUint64) + var statsMap map[uint32]int64 + vs, err := vlog.db.Get(key) + if err != nil { + return err + } + // Value doesn't exist. + if vs.Meta == 0 && len(vs.Value) == 0 { + return nil + } + val := vs.Value + // Entry is not stored in the LSM tree. + if utils.IsValuePtr(vs) { + var vp utils.ValuePtr + vp.Decode(val) + // Read entry from the value log. + result, cb, err := vlog.read(&vp) + // Copy it before we release the read lock. 
+ val = utils.SafeCopy(nil, result) + utils.RunCallback(cb) + if err != nil { + return err + } + } + if len(val) == 0 { + return nil + } + if err := json.Unmarshal(val, &statsMap); err != nil { + return errors.Wrapf(err, "failed to unmarshal discard stats") + } + fmt.Printf("Value Log Discard stats: %v\n", statsMap) + vlog.lfDiscardStats.flushChan <- statsMap + return nil +} + +func (vlog *valueLog) fpath(fid uint32) string { + return utils.VlogFilePath(vlog.dirPath, fid) +} + +// initVLog +func (db *DB) initVLog() { + vp, _ := db.getHead() + vlog := &valueLog{ + dirPath: db.opt.WorkDir, + filesToBeDeleted: make([]uint32, 0), + lfDiscardStats: &lfDiscardStats{ + m: make(map[uint32]int64), + closer: utils.NewCloser(), + flushChan: make(chan map[uint32]int64, 16), + }, + } + vlog.db = db + vlog.opt = *db.opt + vlog.garbageCh = make(chan struct{}, 1) + if err := vlog.open(db, vp, db.replayFunction()); err != nil { + utils.Panic(err) + } + db.vlog = vlog +} + +// getHead prints all the head pointer in the DB and return the max value. +func (db *DB) getHead() (*utils.ValuePtr, uint64) { + var vptr utils.ValuePtr + return &vptr, 0 +} +func (db *DB) replayFunction() func(*utils.Entry, *utils.ValuePtr) error { + toLSM := func(k []byte, vs utils.ValueStruct) { + db.lsm.Set(&utils.Entry{ + Key: k, + Value: vs.Value, + ExpiresAt: vs.ExpiresAt, + Meta: vs.Meta, + }) + } + + return func(e *utils.Entry, vp *utils.ValuePtr) error { // Function for replaying. + nk := make([]byte, len(e.Key)) + copy(nk, e.Key) + var nv []byte + meta := e.Meta + if db.shouldWriteValueToLSM(e) { + nv = make([]byte, len(e.Value)) + copy(nv, e.Value) + } else { + nv = vp.Encode() + meta = meta | utils.BitValuePointer + } + // Update vhead. If the crash happens while replay was in progess + // and the head is not updated, we will end up replaying all the + // files starting from file zero, again. 
+		db.updateHead([]*utils.ValuePtr{vp})
+
+		v := utils.ValueStruct{
+			Value:     nv,
+			Meta:      meta,
+			ExpiresAt: e.ExpiresAt,
+		}
+		// This entry is from a rewrite or via SetEntryAt(..).
+		toLSM(nk, v)
+		return nil
+	}
+}
+
+// updateHead should not be called without the db.Lock() since db.vhead is used
+// by the writer go routines and memtable flushing goroutine.
+func (db *DB) updateHead(ptrs []*utils.ValuePtr) {
+	var ptr *utils.ValuePtr
+	for i := len(ptrs) - 1; i >= 0; i-- {
+		p := ptrs[i]
+		if !p.IsZero() {
+			ptr = p
+			break
+		}
+	}
+	if ptr == nil || ptr.IsZero() { // ptr stays nil when every candidate is zero; guard the nil deref
+		return
+	}
+
+	utils.CondPanic(ptr.Less(db.vhead), fmt.Errorf("ptr.Less(db.vhead) is true"))
+	db.vhead = ptr
+}
+
+// sync 同步一下,刷盘
+func (vlog *valueLog) sync(fid uint32) error {
+
+	vlog.filesLock.RLock()
+	maxFid := vlog.maxFid
+	// During replay it is possible to get sync call with fid less than maxFid.
+	// Because older file has already been synced, we can return from here.
+	if fid < maxFid || len(vlog.filesMap) == 0 {
+		vlog.filesLock.RUnlock()
+		return nil
+	}
+	curlf := vlog.filesMap[maxFid]
+	// Sometimes it is possible that vlog.maxFid has been increased but file creation
+	// with same id is still in progress and this function is called. In those cases
+	// entry for the file might not be present in vlog.filesMap.
+	if curlf == nil {
+		vlog.filesLock.RUnlock()
+		return nil
+	}
+	curlf.Lock.RLock()
+	vlog.filesLock.RUnlock()
+
+	err := curlf.Sync()
+	curlf.Lock.RUnlock()
+	return err
+}
+
+// Set
+func (v *valueLog) set(entry *utils.Entry) error {
+	return nil
+}
+
+func (v *valueLog) get(entry *utils.Entry) (*utils.Entry, error) {
+	// valuePtr := utils.ValuePtrDecode(entry.Value)
+	return nil, nil
+}
+
+// lfDiscardStats 记录丢弃key的数据
+// lfDiscardStats keeps track of the amount of data that could be discarded for
+// a given logfile.
+type lfDiscardStats struct { + sync.RWMutex + m map[uint32]int64 + flushChan chan map[uint32]int64 + closer *utils.Closer + updatesSinceFlush int +} + +func (vlog *valueLog) flushDiscardStats() { + defer vlog.lfDiscardStats.closer.Done() + + mergeStats := func(stats map[uint32]int64) ([]byte, error) { + vlog.lfDiscardStats.Lock() + defer vlog.lfDiscardStats.Unlock() + for fid, count := range stats { + vlog.lfDiscardStats.m[fid] += count + vlog.lfDiscardStats.updatesSinceFlush++ + } + + if vlog.lfDiscardStats.updatesSinceFlush > discardStatsFlushThreshold { + encodedDS, err := json.Marshal(vlog.lfDiscardStats.m) + if err != nil { + return nil, err + } + vlog.lfDiscardStats.updatesSinceFlush = 0 + return encodedDS, nil + } + return nil, nil + } + + process := func(stats map[uint32]int64) error { + encodedDS, err := mergeStats(stats) + if err != nil || encodedDS == nil { + return err + } + + entries := []*utils.Entry{{ + Key: utils.KeyWithTs(lfDiscardStatsKey, 1), + Value: encodedDS, + }} + req, err := vlog.db.sendToWriteCh(entries) + // No special handling of ErrBlockedWrites is required as err is just logged in + // for loop below. + if err != nil { + return errors.Wrapf(err, "failed to push discard stats to write channel") + } + return req.Wait() + } + + closer := vlog.lfDiscardStats.closer + for { + select { + case <-closer.CloseSignal: + // For simplicity just return without processing already present in stats in flushChan. 
+ return + case stats := <-vlog.lfDiscardStats.flushChan: + if err := process(stats); err != nil { + utils.Err(fmt.Errorf("unable to process discardstats with error: %s", err)) + } + } + } +} + +// 请求池 +var requestPool = sync.Pool{ + New: func() interface{} { + return new(request) + }, +} + +// request +type request struct { + // Input values + Entries []*utils.Entry + // Output values and wait group stuff below + Ptrs []*utils.ValuePtr + Wg sync.WaitGroup + Err error + ref int32 +} + +func (req *request) reset() { + req.Entries = req.Entries[:0] + req.Ptrs = req.Ptrs[:0] + req.Wg = sync.WaitGroup{} + req.Err = nil + req.ref = 0 +} + +// GC 部分 +// 选择需要gc的log文件 +func (vlog *valueLog) pickLog(head *utils.ValuePtr) (files []*file.LogFile) { + vlog.filesLock.RLock() + defer vlog.filesLock.RUnlock() + fids := vlog.sortedFids() + switch { + // 只有一个log文件那不需要进行GC了 + case len(fids) <= 1: + return nil + // fid 是0说明是初次启动,更不需要gc了 + // TODO 先不处理head + // case head.Fid == 0: + // return nil + } + + // 创建一个候选对象 + candidate := struct { + fid uint32 + discard int64 + }{math.MaxUint32, 0} + // 加锁遍历fids,选择小于等于head fid的列表中discard统计最大的那个log文件 + // discard 就是在compact过程中统计的可丢弃key的数量 + vlog.lfDiscardStats.RLock() + for _, fid := range fids { + if fid >= head.Fid { + break + } + if vlog.lfDiscardStats.m[fid] > candidate.discard { + candidate.fid = fid + candidate.discard = vlog.lfDiscardStats.m[fid] + } + } + vlog.lfDiscardStats.RUnlock() + + // 说明这是一个有效候选 + if candidate.fid != math.MaxUint32 { // Found a candidate + files = append(files, vlog.filesMap[candidate.fid]) + } + + // 再补充一种随机选择的fid,比如应对初次执行时discard的统计不充分的情况 + var idxHead int + for i, fid := range fids { + if fid == head.Fid { + idxHead = i + break + } + } + if idxHead == 0 { // Not found or first file + idxHead = 1 // 开始对 + } + idx := rand.Intn(idxHead) // Don’t include head.Fid. We pick a random file before it. + if idx > 0 { + idx = rand.Intn(idx + 1) // Another level of rand to favor smaller fids. 
+ } + files = append(files, vlog.filesMap[fids[idx]]) + return files +} + +//sampler 采样器 +type sampler struct { + lf *file.LogFile + sizeRatio float64 + countRatio float64 + fromBeginning bool +} + +func (vlog *valueLog) sample(samp *sampler, discardRatio float64) (*reason, error) { + sizePercent := samp.sizeRatio + countPercent := samp.countRatio + fileSize := samp.lf.Size() + // Set up the sampling winxdow sizes. + sizeWindow := float64(fileSize) * sizePercent + sizeWindowM := sizeWindow / (1 << 20) // in MBs. + countWindow := int(float64(vlog.opt.ValueLogMaxEntries) * countPercent) + + var skipFirstM float64 + var err error + // Skip data only if fromBeginning is set to false. Pick a random start point. + if !samp.fromBeginning { + // Pick a random start point for the log. + skipFirstM = float64(rand.Int63n(fileSize)) // Pick a random starting location. + skipFirstM -= sizeWindow // Avoid hitting EOF by moving back by window. + skipFirstM /= float64(utils.Mi) // Convert to MBs. + } + var skipped float64 + + var r reason + start := time.Now() + var numIterations int + // 重放遍历vlog文件 + _, err = vlog.iterate(samp.lf, 0, func(e *utils.Entry, vp *utils.ValuePtr) error { + numIterations++ + esz := float64(vp.Len) / (1 << 20) // in MBs. + if skipped < skipFirstM { + skipped += esz + return nil + } + // Sample until we reach the window sizes or exceed 10 seconds. + if r.count > countWindow { + return utils.ErrStop + } + if r.total > sizeWindowM { + return utils.ErrStop + } + if time.Since(start) > 10*time.Second { + return utils.ErrStop + } + r.total += esz + r.count++ + + entry, err := vlog.db.Get(e.Key) + if err != nil { + return err + } + if utils.DiscardEntry(e, entry) { + r.discard += esz + return nil + } + + // Value is still present in value log. + utils.CondPanic(len(entry.Value) <= 0, fmt.Errorf("len(entry.Value) <= 0")) + vp.Decode(entry.Value) + + if vp.Fid > samp.lf.FID { + // Value is present in a later log. Discard. 
+ r.discard += esz + return nil + } + if vp.Offset > e.Offset { + // Value is present in a later offset, but in the same log. + r.discard += esz + return nil + } + return nil + }) + + if err != nil { + return nil, err + } + fmt.Printf("Fid: %d. Skipped: %5.2fMB Num iterations: %d. Data status=%+v\n", + samp.lf.FID, skipped, numIterations, r) + // If we couldn't sample at least a 1000 KV pairs or at least 75% of the window size, + // and what we can discard is below the threshold, we should skip the rewrite. + if (r.count < countWindow && r.total < sizeWindowM*0.75) || r.discard < discardRatio*r.total { + fmt.Printf("Skipping GC on fid: %d", samp.lf.FID) + return nil, utils.ErrNoRewrite + } + return &r, nil +} +func (vlog *valueLog) waitOnGC(lc *utils.Closer) { + defer lc.Done() + + <-lc.CloseSignal // Wait for lc to be closed. + + // Block any GC in progress to finish, and don't allow any more writes to runGC by filling up + // the channel of size 1. + vlog.garbageCh <- struct{}{} +} + +type reason struct { + total float64 + discard float64 + count int +} diff --git a/vlog/gc.go b/vlog/gc.go deleted file mode 100644 index 96b50b6..0000000 --- a/vlog/gc.go +++ /dev/null @@ -1 +0,0 @@ -package vlog diff --git a/vlog/vlog.go b/vlog/vlog.go deleted file mode 100644 index 96c9da8..0000000 --- a/vlog/vlog.go +++ /dev/null @@ -1,47 +0,0 @@ -package vlog - -import ( - "github.com/hardcore-os/corekv/utils" - "github.com/hardcore-os/corekv/utils/codec" -) - -type Options struct { -} - -// VLog -type VLog struct { - closer *utils.Closer -} - -// Close 关闭资源 -func (v *VLog) Close() error { - return nil -} - -// NewVLog -func NewVLog(opt *Options) *VLog { - v := &VLog{} - v.closer = utils.NewCloser(1) - return v -} - -// StartGC -func (v *VLog) StartGC() { - defer v.closer.Done() - for { - select { - case <-v.closer.Wait(): - } - // gc logic... 
- } -} - -// Set -func (v *VLog) Set(entry *codec.Entry) error { - return nil -} - -func (v *VLog) Get(entry *codec.Entry) (*codec.Entry, error) { - // valuePtr := codec.ValuePtrDecode(entry.Value) - return nil, nil -} diff --git a/vlog_test.go b/vlog_test.go new file mode 100644 index 0000000..8dea322 --- /dev/null +++ b/vlog_test.go @@ -0,0 +1,158 @@ +// Copyright 2021 hardcore-os Project Authors +// +// Licensed under the Apache License, Version 2.0 (the "License") +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +package corekv + +import ( + "bytes" + "math/rand" + "os" + "testing" + + "github.com/hardcore-os/corekv/utils" + "github.com/stretchr/testify/require" +) + +var ( + // 初始化opt + opt = &Options{ + WorkDir: "./work_test", + SSTableMaxSz: 1 << 10, + MemTableSize: 1 << 10, + ValueLogFileSize: 1 << 20, + ValueThreshold: 0, + MaxBatchCount: 10, + MaxBatchSize: 1 << 20, + } +) + +func TestVlogBase(t *testing.T) { + // 清理目录 + clearDir() + // 打开DB + db := Open(opt) + defer db.Close() + log := db.vlog + var err error + // 创建一个简单的kv entry对象 + const val1 = "sampleval012345678901234567890123" + const val2 = "samplevalb012345678901234567890123" + require.True(t, int64(len(val1)) >= db.opt.ValueThreshold) + + e1 := &utils.Entry{ + Key: []byte("samplekey"), + Value: []byte(val1), + Meta: utils.BitValuePointer, + } + e2 := &utils.Entry{ + Key: []byte("samplekeyb"), + Value: []byte(val2), + Meta: utils.BitValuePointer, + } + + // 构建一个批量请求的request + b := new(request) + b.Entries = []*utils.Entry{e1, e2} + + // 直接写入vlog中 + 
log.write([]*request{b}) + require.Len(t, b.Ptrs, 2) + t.Logf("Pointer written: %+v %+v\n", b.Ptrs[0], b.Ptrs[1]) + + // 从vlog中使用 value ptr指针中查询写入的分段vlog文件 + buf1, lf1, err1 := log.readValueBytes(b.Ptrs[0]) + buf2, lf2, err2 := log.readValueBytes(b.Ptrs[1]) + require.NoError(t, err1) + require.NoError(t, err2) + // 关闭会调的锁 + defer utils.RunCallback(log.getUnlockCallback(lf1)) + defer utils.RunCallback((log.getUnlockCallback(lf2))) + e1, err = lf1.DecodeEntry(buf1, b.Ptrs[0].Offset) + require.NoError(t, err) + // 从vlog文件中通过指指针反序列化回 entry对象 + e2, err = lf1.DecodeEntry(buf2, b.Ptrs[1].Offset) + require.NoError(t, err) + + // 比较entry对象是否相等 + readEntries := []utils.Entry{*e1, *e2} + require.EqualValues(t, []utils.Entry{ + { + Key: []byte("samplekey"), + Value: []byte(val1), + Meta: utils.BitValuePointer, + Offset: b.Ptrs[0].Offset, + }, + { + Key: []byte("samplekeyb"), + Value: []byte(val2), + Meta: utils.BitValuePointer, + Offset: b.Ptrs[1].Offset, + }, + }, readEntries) +} + +func clearDir() { + _, err := os.Stat(opt.WorkDir) + if err == nil { + os.RemoveAll(opt.WorkDir) + } + os.Mkdir(opt.WorkDir, os.ModePerm) +} + +func TestValueGC(t *testing.T) { + clearDir() + opt.ValueLogFileSize = 1 << 20 + kv := Open(opt) + defer kv.Close() + sz := 32 << 10 + kvList := []*utils.Entry{} + for i := 0; i < 100; i++ { + e := newRandEntry(sz) + kvList = append(kvList, &utils.Entry{ + Key: e.Key, + Value: e.Value, + Meta: e.Meta, + ExpiresAt: e.ExpiresAt, + }) + require.NoError(t, kv.Set(e)) + } + kv.RunValueLogGC(0.9) + for _, e := range kvList { + item, err := kv.Get(e.Key) + require.NoError(t, err) + val := getItemValue(t, item) + require.NotNil(t, val) + require.True(t, bytes.Equal(item.Key, e.Key), "key not equal: e:%s, v:%s", e.Key, e.Key) + require.True(t, bytes.Equal(item.Value, e.Value), "value not equal: e:%s, v:%s", e.Value, e.Value) + } +} + +func newRandEntry(sz int) *utils.Entry { + v := make([]byte, sz) + rand.Read(v[:rand.Intn(sz)]) + e := utils.BuildEntry() + e.Value 
= v + return e +} +func getItemValue(t *testing.T, item *utils.Entry) (val []byte) { + t.Helper() + if item == nil { + return nil + } + var v []byte + v = append(v, item.Value...) + if v == nil { + return nil + } + return v +} diff --git a/work_test/00001.sst b/work_test/00001.sst deleted file mode 100644 index ba17448..0000000 --- a/work_test/00001.sst +++ /dev/null @@ -1,4 +0,0 @@ -{ - "idx": "hello,0", - "data": "world" -} \ No newline at end of file diff --git a/work_test/manifest b/work_test/manifest deleted file mode 100644 index f48d061..0000000 --- a/work_test/manifest +++ /dev/null @@ -1 +0,0 @@ -00001.sst \ No newline at end of file