diff --git a/.gitignore b/.gitignore index ab81965..1d4f234 100644 --- a/.gitignore +++ b/.gitignore @@ -1,16 +1,3 @@ -# Binaries for programs and plugins -*.exe -*.exe~ -*.dll -*.so -*.dylib - -# Test binary, built with `go test -c` -*.test - -# Output of the go coverage tool, specifically when used with LiteIDE -*.out -.idea -# Dependency directories (remove the comment below to include it) -# vendor/ -.vscode \ No newline at end of file +.vscode +work_test +testdata \ No newline at end of file diff --git a/db.go b/db.go index df86de1..b26c08c 100644 --- a/db.go +++ b/db.go @@ -1,55 +1,104 @@ package corekv import ( - "github.com/hardcore-os/corekv/iterator" + "expvar" + "fmt" + "math" + "sync" + "sync/atomic" + "time" + "github.com/hardcore-os/corekv/lsm" "github.com/hardcore-os/corekv/utils" - "github.com/hardcore-os/corekv/utils/codec" - "github.com/hardcore-os/corekv/vlog" + "github.com/pkg/errors" ) type ( // coreKV对外提供的功能集合 CoreAPI interface { - Set(data *codec.Entry) error - Get(key []byte) (*codec.Entry, error) + Set(data *utils.Entry) error + Get(key []byte) (*utils.Entry, error) Del(key []byte) error - NewIterator(opt *iterator.Options) iterator.Iterator + NewIterator(opt *utils.Options) utils.Iterator Info() *Stats Close() error } // DB 对外暴露的接口对象 全局唯一,持有各种资源句柄 DB struct { - opt *Options - lsm *lsm.LSM - vlog *vlog.VLog - stats *Stats + sync.RWMutex + opt *Options + lsm *lsm.LSM + vlog *valueLog + stats *Stats + flushChan chan flushTask // For flushing memtables. + writeCh chan *request + blockWrites int32 + vhead *utils.ValuePtr + logRotates int32 } ) -func Open(options *Options) *DB { - db := &DB{opt: options} - // 初始化LSM结构 - db.lsm = lsm.NewLSM(&lsm.Options{}) +var ( + head = []byte("!corekv!head") // For storing value offset for replay. 
+) + +/** +SSTableMaxSz: 1024, +MemTableSize: 1024, +BlockSize: 1024, +BloomFalsePositive: 0, +BaseLevelSize: 10 << 20, +LevelSizeMultiplier: 10, +BaseTableSize: 2 << 20, +TableSizeMultiplier: 2, +NumLevelZeroTables: 15, +MaxLevelNum: 7, +NumCompactors: 3, +*/ +// Open DB +// TODO 这里是不是要上一个目录锁比较好,防止多个进程打开同一个目录? +func Open(opt *Options) *DB { + c := utils.NewCloser() + db := &DB{opt: opt} // 初始化vlog结构 - db.vlog = vlog.NewVLog(&vlog.Options{}) + db.initVLog() + // 初始化LSM结构 + db.lsm = lsm.NewLSM(&lsm.Options{ + WorkDir: opt.WorkDir, + MemTableSize: opt.MemTableSize, + SSTableMaxSz: opt.SSTableMaxSz, + BlockSize: 8 * 1024, + BloomFalsePositive: 0, //0.01, + BaseLevelSize: 10 << 20, + LevelSizeMultiplier: 10, + BaseTableSize: 5 << 20, + TableSizeMultiplier: 2, + NumLevelZeroTables: 15, + MaxLevelNum: 7, + NumCompactors: 1, + DiscardStatsCh: &(db.vlog.lfDiscardStats.flushChan), + }) // 初始化统计信息 - db.stats = newStats(options) + db.stats = newStats(opt) // 启动 sstable 的合并压缩过程 - go db.lsm.StartMerge() - // 启动 vlog gc 过程 - go db.vlog.StartGC() + go db.lsm.StartCompacter() + // 准备vlog gc + c.Add(1) + db.writeCh = make(chan *request) + db.flushChan = make(chan flushTask, 16) + go db.doWrites(c) // 启动 info 统计过程 go db.stats.StartStats() return db } func (db *DB) Close() error { + db.vlog.lfDiscardStats.closer.Close() if err := db.lsm.Close(); err != nil { return err } - if err := db.vlog.Close(); err != nil { + if err := db.vlog.close(); err != nil { return err } if err := db.stats.close(); err != nil { @@ -60,48 +109,307 @@ func (db *DB) Close() error { func (db *DB) Del(key []byte) error { // 写入一个值为nil的entry 作为墓碑消息实现删除 - return db.Set(&codec.Entry{ + return db.Set(&utils.Entry{ Key: key, Value: nil, ExpiresAt: 0, }) } -func (db *DB) Set(data *codec.Entry) error { +func (db *DB) Set(data *utils.Entry) error { + if data == nil || len(data.Key) == 0 { + return utils.ErrEmptyKey + } // 做一些必要性的检查 // 如果value 大于一个阈值 则创建值指针,并将其写入vlog中 - var valuePtr *codec.ValuePtr - if 
utils.ValueSize(data.Value) > db.opt.ValueThreshold { - valuePtr = codec.NewValuePtr(data) - // 先写入vlog不会有事务问题,因为如果lsm写入失败,vlog会在GC阶段清理无效的key - if err := db.vlog.Set(data); err != nil { + var ( + vp *utils.ValuePtr + err error + ) + data.Key = utils.KeyWithTs(data.Key, math.MaxUint32) + // 如果value不应该直接写入LSM 则先写入 vlog文件,这时必须保证vlog具有重放功能 + // 以便于崩溃后恢复数据 + if !db.shouldWriteValueToLSM(data) { + if vp, err = db.vlog.newValuePtr(data); err != nil { return err } - } - // 写入LSM, 如果写值指针不空则替换值entry.value的值 - if valuePtr != nil { - data.Value = codec.ValuePtrCodec(valuePtr) + data.Meta |= utils.BitValuePointer + data.Value = vp.Encode() } return db.lsm.Set(data) } -func (db *DB) Get(key []byte) (*codec.Entry, error) { +func (db *DB) Get(key []byte) (*utils.Entry, error) { + if len(key) == 0 { + return nil, utils.ErrEmptyKey + } var ( - entry *codec.Entry + entry *utils.Entry err error ) - // 检查输入 - // 从内存表中读取数据 - if entry, err = db.lsm.Get(key); err == nil { + key = utils.KeyWithTs(key, math.MaxUint32) + // 从LSM中查询entry,这时不确定entry是不是值指针 + if entry, err = db.lsm.Get(key); err != nil { return entry, err } // 检查从lsm拿到的value是否是value ptr,是则从vlog中拿值 - if entry != nil && codec.IsValuePtr(entry) { - if entry, err = db.vlog.Get(entry); err == nil { - return entry, err + if entry != nil && utils.IsValuePtr(entry) { + var vp utils.ValuePtr + vp.Decode(entry.Value) + result, cb, err := db.vlog.read(&vp) + defer utils.RunCallback(cb) + if err != nil { + return nil, err } + entry.Value = utils.SafeCopy(nil, result) } - return nil, nil + + if isDeletedOrExpired(entry) { + return nil, utils.ErrKeyNotFound + } + return entry, nil +} + +// 判断是否过期 是可删除 +func isDeletedOrExpired(e *utils.Entry) bool { + if e.Value == nil { + return true + } + if e.ExpiresAt == 0 { + return false + } + + return e.ExpiresAt <= uint64(time.Now().Unix()) } + func (db *DB) Info() *Stats { // 读取stats结构,打包数据并返回 return db.stats } + +// RunValueLogGC triggers a value log garbage collection. 
+func (db *DB) RunValueLogGC(discardRatio float64) error { + if discardRatio >= 1.0 || discardRatio <= 0.0 { + return utils.ErrInvalidRequest + } + // Find head on disk + headKey := utils.KeyWithTs(head, math.MaxUint64) + val, err := db.lsm.Get(headKey) + if err != nil { + if err == utils.ErrKeyNotFound { + val = &utils.Entry{ + Key: headKey, + Value: []byte{}, + } + } else { + return errors.Wrap(err, "Retrieving head from on-disk LSM") + } + } + + // 内部key head 一定是value ptr 不需要检查内容 + var head utils.ValuePtr + if len(val.Value) > 0 { + head.Decode(val.Value) + } + + // Pick a log file and run GC + return db.vlog.runGC(discardRatio, &head) +} + +func (db *DB) shouldWriteValueToLSM(e *utils.Entry) bool { + return int64(len(e.Value)) < db.opt.ValueThreshold +} + +func (db *DB) sendToWriteCh(entries []*utils.Entry) (*request, error) { + if atomic.LoadInt32(&db.blockWrites) == 1 { + return nil, utils.ErrBlockedWrites + } + var count, size int64 + for _, e := range entries { + size += int64(e.EstimateSize(int(db.opt.ValueThreshold))) + count++ + } + if count >= db.opt.MaxBatchCount || size >= db.opt.MaxBatchSize { + return nil, utils.ErrTxnTooBig + } + + // TODO 尝试使用对象复用,后面entry对象也应该使用 + req := requestPool.Get().(*request) + req.reset() + req.Entries = entries + req.Wg.Add(1) + req.IncrRef() // for db write + db.writeCh <- req // Handled in doWrites. + return req, nil +} + +// Check(kv.BatchSet(entries)) +func (db *DB) batchSet(entries []*utils.Entry) error { + req, err := db.sendToWriteCh(entries) + if err != nil { + return err + } + + return req.Wait() +} + +func (db *DB) doWrites(lc *utils.Closer) { + defer lc.Done() + pendingCh := make(chan struct{}, 1) + + writeRequests := func(reqs []*request) { + if err := db.writeRequests(reqs); err != nil { + utils.Err(fmt.Errorf("writeRequests: %v", err)) + } + <-pendingCh + } + + // This variable tracks the number of pending writes. 
+ reqLen := new(expvar.Int) + + reqs := make([]*request, 0, 10) + for { + var r *request + select { + case r = <-db.writeCh: + case <-lc.CloseSignal: + goto closedCase + } + + for { + reqs = append(reqs, r) + reqLen.Set(int64(len(reqs))) + + if len(reqs) >= 3*utils.KVWriteChCapacity { + pendingCh <- struct{}{} // blocking. + goto writeCase + } + + select { + // Either push to pending, or continue to pick from writeCh. + case r = <-db.writeCh: + case pendingCh <- struct{}{}: + goto writeCase + case <-lc.CloseSignal: + goto closedCase + } + } + + closedCase: + // All the pending request are drained. + // Don't close the writeCh, because it has be used in several places. + for { + select { + case r = <-db.writeCh: + reqs = append(reqs, r) + default: + pendingCh <- struct{}{} // Push to pending before doing a write. + writeRequests(reqs) + return + } + } + + writeCase: + go writeRequests(reqs) + reqs = make([]*request, 0, 10) + reqLen.Set(0) + } +} + +// writeRequests is called serially by only one goroutine. +func (db *DB) writeRequests(reqs []*request) error { + if len(reqs) == 0 { + return nil + } + + done := func(err error) { + for _, r := range reqs { + r.Err = err + r.Wg.Done() + } + } + err := db.vlog.write(reqs) + if err != nil { + done(err) + return err + } + var count int + for _, b := range reqs { + if len(b.Entries) == 0 { + continue + } + count += len(b.Entries) + if err != nil { + done(err) + return errors.Wrap(err, "writeRequests") + } + if err := db.writeToLSM(b); err != nil { + done(err) + return errors.Wrap(err, "writeRequests") + } + db.Lock() + db.updateHead(b.Ptrs) + db.Unlock() + } + done(nil) + return nil +} +func (db *DB) writeToLSM(b *request) error { + if len(b.Ptrs) != len(b.Entries) { + return errors.Errorf("Ptrs and Entries don't match: %+v", b) + } + + for i, entry := range b.Entries { + if db.shouldWriteValueToLSM(entry) { // Will include deletion / tombstone case. 
+ entry.Meta = entry.Meta &^ utils.BitValuePointer + } else { + entry.Meta = entry.Meta | utils.BitValuePointer + entry.Value = b.Ptrs[i].Encode() + } + db.lsm.Set(entry) + } + return nil +} +func (req *request) IncrRef() { + atomic.AddInt32(&req.ref, 1) +} + +func (req *request) DecrRef() { + nRef := atomic.AddInt32(&req.ref, -1) + if nRef > 0 { + return + } + req.Entries = nil + requestPool.Put(req) +} + +func (req *request) Wait() error { + req.Wg.Wait() + err := req.Err + req.DecrRef() // DecrRef after writing to DB. + return err +} + +// 结构体 +type flushTask struct { + mt *utils.Skiplist + vptr *utils.ValuePtr + dropPrefixes [][]byte +} + +func (db *DB) pushHead(ft flushTask) error { + // Ensure we never push a zero valued head pointer. + if ft.vptr.IsZero() { + return errors.New("Head should not be zero") + } + + fmt.Printf("Storing value log head: %+v\n", ft.vptr) + val := ft.vptr.Encode() + + // Pick the max commit ts, so in case of crash, our read ts would be higher than all the + // commits. + headTs := utils.KeyWithTs(head, uint64(time.Now().Unix()/1e9)) + ft.mt.Add(&utils.Entry{ + Key: headTs, + Value: val, + }) + return nil +} diff --git a/db_test.go b/db_test.go index 878a953..a73b56f 100644 --- a/db_test.go +++ b/db_test.go @@ -1,33 +1,60 @@ +// Copyright 2021 hardcore-os Project Authors +// +// Licensed under the Apache License, Version 2.0 (the "License") +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ package corekv import ( - "github.com/hardcore-os/corekv/iterator" - "github.com/hardcore-os/corekv/utils/codec" + "fmt" "testing" "time" + + "github.com/hardcore-os/corekv/utils" ) func TestAPI(t *testing.T) { - opt := NewDefaultOptions() + clearDir() db := Open(opt) defer func() { _ = db.Close() }() // 写入 - e := codec.NewEntry([]byte("hello"), []byte("coreKV")).WithTTL(1 * time.Second) - if err := db.Set(e); err != nil { - t.Fatal(err) + for i := 0; i < 50; i++ { + key, val := fmt.Sprintf("key%d", i), fmt.Sprintf("val%d", i) + e := utils.NewEntry([]byte(key), []byte(val)).WithTTL(1000 * time.Second) + if err := db.Set(e); err != nil { + t.Fatal(err) + } + // 查询 + if entry, err := db.Get([]byte(key)); err != nil { + t.Fatal(err) + } else { + t.Logf("db.Get key=%s, value=%s, expiresAt=%d", entry.Key, entry.Value, entry.ExpiresAt) + } } - // 查询 - if entry, err := db.Get([]byte("hello")); err != nil { - t.Fatal(err) - } else { - t.Logf("db.Get key=%s, value=%s, expiresAt=%d", entry.Key, entry.Value, entry.ExpiresAt) + + for i := 0; i < 40; i++ { + key, _ := fmt.Sprintf("key%d", i), fmt.Sprintf("val%d", i) + if err := db.Del([]byte(key)); err != nil { + t.Fatal(err) + } } + // 迭代器 - iter := db.NewIterator(&iterator.Options{ + iter := db.NewIterator(&utils.Options{ Prefix: []byte("hello"), IsAsc: false, }) defer func() { _ = iter.Close() }() + defer func() { _ = iter.Close() }() for iter.Rewind(); iter.Valid(); iter.Next() { it := iter.Item() t.Logf("db.NewIterator key=%s, value=%s, expiresAt=%d", it.Entry().Key, it.Entry().Value, it.Entry().ExpiresAt) @@ -37,4 +64,19 @@ func TestAPI(t *testing.T) { if err := db.Del([]byte("hello")); err != nil { t.Fatal(err) } + + for i := 0; i < 10; i++ { + key, val := fmt.Sprintf("key%d", i), fmt.Sprintf("val%d", i) + e := utils.NewEntry([]byte(key), []byte(val)).WithTTL(1000 * time.Second) + if err := db.Set(e); err != nil { + t.Fatal(err) + } + // 查询 + if entry, err := db.Get([]byte(key)); err != nil { + t.Fatal(err) + } else { 
+ t.Logf("db.Get key=%s, value=%s, expiresAt=%d", entry.Key, entry.Value, entry.ExpiresAt) + } + } + } diff --git a/debug.sh b/debug.sh new file mode 100755 index 0000000..b6af724 --- /dev/null +++ b/debug.sh @@ -0,0 +1,18 @@ +#!/bin/bash +### + # Copyright 2021 logicrec Project Authors + # + # Licensed under the Apache License, Version 2.0 (the "License") + # you may not use this file except in compliance with the License. + # You may obtain a copy of the License at + # + # http://www.apache.org/licenses/LICENSE-2.0 + # + # Unless required by applicable law or agreed to in writing, software + # distributed under the License is distributed on an "AS IS" BASIS, + # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + # See the License for the specific language governing permissions and + # limitations under the License. +### + +dlv test -test.run=$1 \ No newline at end of file diff --git a/file/file.go b/file/file.go index f86eae2..588e014 100644 --- a/file/file.go +++ b/file/file.go @@ -1,7 +1,39 @@ +// Copyright 2021 hardcore-os Project Authors +// +// Licensed under the Apache License, Version 2.0 (the "License") +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ package file +import "io" + +// Options +type Options struct { + FID uint64 + FileName string + Dir string + Path string + Flag int + MaxSz int +} + type CoreFile interface { - Write(b []byte) (n int, err error) - Read(b []byte) (n int, err error) Close() error + Truncature(n int64) error + ReName(name string) error + NewReader(offset int) io.Reader + Bytes(off, sz int) ([]byte, error) + AllocateSlice(sz, offset int) ([]byte, int, error) + Sync() error + Delete() error + Slice(offset int) []byte } diff --git a/file/manifest.go b/file/manifest.go index f64a70c..5e3e4ae 100644 --- a/file/manifest.go +++ b/file/manifest.go @@ -1,56 +1,392 @@ +// Copyright 2021 hardcore-os Project Authors +// +// Licensed under the Apache License, Version 2.0 (the "License") +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ package file import ( "bufio" - "encoding/csv" + "bytes" + "encoding/binary" + "fmt" + "hash/crc32" "io" + "os" + "path/filepath" + "sync" + "github.com/hardcore-os/corekv/pb" "github.com/hardcore-os/corekv/utils" + "github.com/pkg/errors" ) +// ManifestFile 维护sst文件元信息的文件 +// manifest 比较特殊,不能使用mmap,需要保证实时的写入 +type ManifestFile struct { + opt *Options + f *os.File + lock sync.Mutex + deletionsRewriteThreshold int + manifest *Manifest +} + +// Manifest corekv 元数据状态维护 type Manifest struct { - f CoreFile - tables [][]string // l0-l7 的sst file name + Levels []levelManifest + Tables map[uint64]TableManifest + Creations int + Deletions int +} + +// TableManifest 包含sst的基本信息 +type TableManifest struct { + Level uint8 + Checksum []byte // 方便今后扩展 +} +type levelManifest struct { + Tables map[uint64]struct{} // Set of table id's +} + +//TableMeta sst 的一些元信息 +type TableMeta struct { + ID uint64 + Checksum []byte +} + +// OpenManifestFile 打开manifest文件 +func OpenManifestFile(opt *Options) (*ManifestFile, error) { + path := filepath.Join(opt.Dir, utils.ManifestFilename) + mf := &ManifestFile{lock: sync.Mutex{}, opt: opt} + f, err := os.OpenFile(path, os.O_RDWR, 0) + // 如果打开失败 则尝试创建一个新的 manifest file + if err != nil { + if !os.IsNotExist(err) { + return mf, err + } + m := createManifest() + fp, netCreations, err := helpRewrite(opt.Dir, m) + utils.CondPanic(netCreations == 0, errors.Wrap(err, utils.ErrReWriteFailure.Error())) + if err != nil { + return mf, err + } + mf.f = fp + f = fp + mf.manifest = m + return mf, nil + } + + // 如果打开 则对manifest文件重放 + manifest, truncOffset, err := ReplayManifestFile(f) + if err != nil { + _ = f.Close() + return mf, err + } + // Truncate file so we don't have a half-written entry at the end. 
+ if err := f.Truncate(truncOffset); err != nil { + _ = f.Close() + return mf, err + } + if _, err = f.Seek(0, io.SeekEnd); err != nil { + _ = f.Close() + return mf, err + } + mf.f = f + mf.manifest = manifest + return mf, nil +} + +// ReplayManifestFile 对已经存在的manifest文件重新应用所有状态变更 +func ReplayManifestFile(fp *os.File) (ret *Manifest, truncOffset int64, err error) { + r := &bufReader{reader: bufio.NewReader(fp)} + var magicBuf [8]byte + if _, err := io.ReadFull(r, magicBuf[:]); err != nil { + return &Manifest{}, 0, utils.ErrBadMagic + } + if !bytes.Equal(magicBuf[0:4], utils.MagicText[:]) { + return &Manifest{}, 0, utils.ErrBadMagic + } + version := binary.BigEndian.Uint32(magicBuf[4:8]) + if version != uint32(utils.MagicVersion) { + return &Manifest{}, 0, + fmt.Errorf("manifest has unsupported version: %d (we support %d)", version, utils.MagicVersion) + } + + build := createManifest() + var offset int64 + for { + offset = r.count + var lenCrcBuf [8]byte + _, err := io.ReadFull(r, lenCrcBuf[:]) + if err != nil { + if err == io.EOF || err == io.ErrUnexpectedEOF { + break + } + return &Manifest{}, 0, err + } + length := binary.BigEndian.Uint32(lenCrcBuf[0:4]) + var buf = make([]byte, length) + if _, err := io.ReadFull(r, buf); err != nil { + if err == io.EOF || err == io.ErrUnexpectedEOF { + break + } + return &Manifest{}, 0, err + } + if crc32.Checksum(buf, utils.CastagnoliCrcTable) != binary.BigEndian.Uint32(lenCrcBuf[4:8]) { + return &Manifest{}, 0, utils.ErrBadChecksum + } + + var changeSet pb.ManifestChangeSet + if err := changeSet.Unmarshal(buf); err != nil { + return &Manifest{}, 0, err + } + + if err := applyChangeSet(build, &changeSet); err != nil { + return &Manifest{}, 0, err + } + } + + return build, offset, err +} + +// This is not a "recoverable" error -- opening the KV store fails because the MANIFEST file is +// just plain broken. 
+func applyChangeSet(build *Manifest, changeSet *pb.ManifestChangeSet) error { + for _, change := range changeSet.Changes { + if err := applyManifestChange(build, change); err != nil { + return err + } + } + return nil +} + +func applyManifestChange(build *Manifest, tc *pb.ManifestChange) error { + switch tc.Op { + case pb.ManifestChange_CREATE: + if _, ok := build.Tables[tc.Id]; ok { + return fmt.Errorf("MANIFEST invalid, table %d exists", tc.Id) + } + build.Tables[tc.Id] = TableManifest{ + Level: uint8(tc.Level), + Checksum: append([]byte{}, tc.Checksum...), + } + for len(build.Levels) <= int(tc.Level) { + build.Levels = append(build.Levels, levelManifest{make(map[uint64]struct{})}) + } + build.Levels[tc.Level].Tables[tc.Id] = struct{}{} + build.Creations++ + case pb.ManifestChange_DELETE: + tm, ok := build.Tables[tc.Id] + if !ok { + return fmt.Errorf("MANIFEST removes non-existing table %d", tc.Id) + } + delete(build.Levels[tm.Level].Tables, tc.Id) + delete(build.Tables, tc.Id) + build.Deletions++ + default: + return fmt.Errorf("MANIFEST file has invalid manifestChange op") + } + return nil +} + +func createManifest() *Manifest { + levels := make([]levelManifest, 0) + return &Manifest{ + Levels: levels, + Tables: make(map[uint64]TableManifest), + } +} + +type bufReader struct { + reader *bufio.Reader + count int64 +} + +func (r *bufReader) Read(p []byte) (n int, err error) { + n, err = r.reader.Read(p) + r.count += int64(n) + return +} + +// asChanges returns a sequence of changes that could be used to recreate the Manifest in its +// present state. 
+func (m *Manifest) asChanges() []*pb.ManifestChange { + changes := make([]*pb.ManifestChange, 0, len(m.Tables)) + for id, tm := range m.Tables { + changes = append(changes, newCreateChange(id, int(tm.Level), tm.Checksum)) + } + return changes +} +func newCreateChange(id uint64, level int, checksum []byte) *pb.ManifestChange { + return &pb.ManifestChange{ + Id: id, + Op: pb.ManifestChange_CREATE, + Level: uint32(level), + Checksum: checksum, + } +} + +// Must be called while appendLock is held. +func (mf *ManifestFile) rewrite() error { + // In Windows the files should be closed before doing a Rename. + if err := mf.f.Close(); err != nil { + return err + } + fp, nextCreations, err := helpRewrite(mf.opt.Dir, mf.manifest) + if err != nil { + return err + } + mf.manifest.Creations = nextCreations + mf.manifest.Deletions = 0 + mf.f = fp + return nil +} + +func helpRewrite(dir string, m *Manifest) (*os.File, int, error) { + rewritePath := filepath.Join(dir, utils.ManifestRewriteFilename) + // We explicitly sync. + fp, err := os.OpenFile(rewritePath, utils.DefaultFileFlag, utils.DefaultFileMode) + if err != nil { + return nil, 0, err + } + + buf := make([]byte, 8) + copy(buf[0:4], utils.MagicText[:]) + binary.BigEndian.PutUint32(buf[4:8], uint32(utils.MagicVersion)) + + netCreations := len(m.Tables) + changes := m.asChanges() + set := pb.ManifestChangeSet{Changes: changes} + + changeBuf, err := set.Marshal() + if err != nil { + fp.Close() + return nil, 0, err + } + var lenCrcBuf [8]byte + binary.BigEndian.PutUint32(lenCrcBuf[0:4], uint32(len(changeBuf))) + binary.BigEndian.PutUint32(lenCrcBuf[4:8], crc32.Checksum(changeBuf, utils.CastagnoliCrcTable)) + buf = append(buf, lenCrcBuf[:]...) + buf = append(buf, changeBuf...) + if _, err := fp.Write(buf); err != nil { + fp.Close() + return nil, 0, err + } + if err := fp.Sync(); err != nil { + fp.Close() + return nil, 0, err + } + + // In Windows the files should be closed before doing a Rename. 
+ if err = fp.Close(); err != nil { + return nil, 0, err + } + manifestPath := filepath.Join(dir, utils.ManifestFilename) + if err := os.Rename(rewritePath, manifestPath); err != nil { + return nil, 0, err + } + fp, err = os.OpenFile(manifestPath, utils.DefaultFileFlag, utils.DefaultFileMode) + if err != nil { + return nil, 0, err + } + if _, err := fp.Seek(0, io.SeekEnd); err != nil { + fp.Close() + return nil, 0, err + } + if err := utils.SyncDir(dir); err != nil { + fp.Close() + return nil, 0, err + } + + return fp, netCreations, nil } -// WalFile -func (mf *Manifest) Close() error { +// Close 关闭文件 +func (mf *ManifestFile) Close() error { if err := mf.f.Close(); err != nil { return err } return nil } -// Tables 获取table的list -func (mf *Manifest) Tables() [][]string { - return mf.tables +// AddChanges 对外暴露的写比那更丰富 +func (mf *ManifestFile) AddChanges(changesParam []*pb.ManifestChange) error { + return mf.addChanges(changesParam) } +func (mf *ManifestFile) addChanges(changesParam []*pb.ManifestChange) error { + changes := pb.ManifestChangeSet{Changes: changesParam} + buf, err := changes.Marshal() + if err != nil { + return err + } -// OpenManifest -func OpenManifest(opt *Options) *Manifest { - mf := &Manifest{ - f: OpenMockFile(opt), - tables: make([][]string, utils.MaxLevelNum), + // TODO 锁粒度可以优化 + mf.lock.Lock() + defer mf.lock.Unlock() + if err := applyChangeSet(mf.manifest, &changes); err != nil { + return err } - reader := csv.NewReader(bufio.NewReader(mf.f)) - level := 0 - for { - if level > utils.MaxLevelNum { - break + // Rewrite manifest if it'd shrink by 1/10 and it's big enough to care + if mf.manifest.Deletions > utils.ManifestDeletionsRewriteThreshold && + mf.manifest.Deletions > utils.ManifestDeletionsRatio*(mf.manifest.Creations-mf.manifest.Deletions) { + if err := mf.rewrite(); err != nil { + return err } - line, err := reader.Read() - if err == io.EOF { - break - } else if err != nil { - panic(err) + } else { + var lenCrcBuf [8]byte + 
binary.BigEndian.PutUint32(lenCrcBuf[0:4], uint32(len(buf))) + binary.BigEndian.PutUint32(lenCrcBuf[4:8], crc32.Checksum(buf, utils.CastagnoliCrcTable)) + buf = append(lenCrcBuf[:], buf...) + if _, err := mf.f.Write(buf); err != nil { + return err } - if len(mf.tables[level]) == 0 { - mf.tables[level] = make([]string, len(line)) + } + err = mf.f.Sync() + return err +} + +// AddTableMeta 存储level表到manifest的level中 +func (mf *ManifestFile) AddTableMeta(levelNum int, t *TableMeta) (err error) { + mf.addChanges([]*pb.ManifestChange{ + newCreateChange(t.ID, levelNum, t.Checksum), + }) + return err +} + +// RevertToManifest checks that all necessary table files exist and removes all table files not +// referenced by the manifest. idMap is a set of table file id's that were read from the directory +// listing. +func (mf *ManifestFile) RevertToManifest(idMap map[uint64]struct{}) error { + // 1. Check all files in manifest exist. + for id := range mf.manifest.Tables { + if _, ok := idMap[id]; !ok { + return fmt.Errorf("file does not exist for table %d", id) } - for j, tableName := range line { - mf.tables[level][j] = tableName + } + + // 2. Delete files that shouldn't exist. 
+ for id := range idMap { + if _, ok := mf.manifest.Tables[id]; !ok { + utils.Err(fmt.Errorf("Table file %d not referenced in MANIFEST", id)) + filename := utils.FileNameSSTable(mf.opt.Dir, id) + if err := os.Remove(filename); err != nil { + return errors.Wrapf(err, "While removing table %d", id) + } } - level++ } - return mf + return nil +} + +// GetManifest manifest +func (mf *ManifestFile) GetManifest() *Manifest { + return mf.manifest } diff --git a/file/mmap_darwin.go b/file/mmap_darwin.go new file mode 100644 index 0000000..4ff3a10 --- /dev/null +++ b/file/mmap_darwin.go @@ -0,0 +1,254 @@ +//go:build darwin +// +build darwin + +// Copyright 2021 hardcore-os Project Authors +// +// Licensed under the Apache License, Version 2.0 (the "License") +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package file + +import ( + "encoding/binary" + "fmt" + "io" + "os" + "path/filepath" + + "github.com/hardcore-os/corekv/utils/mmap" + "github.com/pkg/errors" +) + +// MmapFile represents an mmapd file and includes both the buffer to the data and the file descriptor. +type MmapFile struct { + Data []byte + Fd *os.File +} + +// OpenMmapFileUsing os +func OpenMmapFileUsing(fd *os.File, sz int, writable bool) (*MmapFile, error) { + filename := fd.Name() + fi, err := fd.Stat() + if err != nil { + return nil, errors.Wrapf(err, "cannot stat file: %s", filename) + } + + var rerr error + fileSize := fi.Size() + if sz > 0 && fileSize == 0 { + // If file is empty, truncate it to sz. 
+ if err := fd.Truncate(int64(sz)); err != nil { + return nil, errors.Wrapf(err, "error while truncation") + } + fileSize = int64(sz) + } + + // fmt.Printf("Mmaping file: %s with writable: %v filesize: %d\n", fd.Name(), writable, fileSize) + buf, err := mmap.Mmap(fd, writable, fileSize) // Mmap up to file size. + if err != nil { + return nil, errors.Wrapf(err, "while mmapping %s with size: %d", fd.Name(), fileSize) + } + + if fileSize == 0 { + dir, _ := filepath.Split(filename) + go SyncDir(dir) + } + return &MmapFile{ + Data: buf, + Fd: fd, + }, rerr +} + +// OpenMmapFile opens an existing file or creates a new file. If the file is +// created, it would truncate the file to maxSz. In both cases, it would mmap +// the file to maxSz and returned it. In case the file is created, z.NewFile is +// returned. +func OpenMmapFile(filename string, flag int, maxSz int) (*MmapFile, error) { + // fmt.Printf("opening file %s with flag: %v\n", filename, flag) + fd, err := os.OpenFile(filename, flag, 0666) + if err != nil { + return nil, errors.Wrapf(err, "unable to open: %s", filename) + } + writable := true + if flag == os.O_RDONLY { + writable = false + } + return OpenMmapFileUsing(fd, maxSz, writable) +} + +type mmapReader struct { + Data []byte + offset int +} + +func (mr *mmapReader) Read(buf []byte) (int, error) { + if mr.offset > len(mr.Data) { + return 0, io.EOF + } + n := copy(buf, mr.Data[mr.offset:]) + mr.offset += n + if n < len(buf) { + return n, io.EOF + } + return n, nil +} + +func (m *MmapFile) NewReader(offset int) io.Reader { + return &mmapReader{ + Data: m.Data, + offset: offset, + } +} + +// Bytes returns data starting from offset off of size sz. If there's not enough data, it would +// return nil slice and io.EOF. +func (m *MmapFile) Bytes(off, sz int) ([]byte, error) { + if len(m.Data[off:]) < sz { + return nil, io.EOF + } + return m.Data[off : off+sz], nil +} + +// Slice returns the slice at the given offset. 
+func (m *MmapFile) Slice(offset int) []byte { + sz := binary.BigEndian.Uint32(m.Data[offset:]) + start := offset + 4 + next := start + int(sz) + if next > len(m.Data) { + return []byte{} + } + res := m.Data[start:next] + return res +} + +// AllocateSlice allocates a slice of the given size at the given offset. +func (m *MmapFile) AllocateSlice(sz, offset int) ([]byte, int, error) { + start := offset + 4 + + // If the file is too small, double its size or increase it by 1GB, whichever is smaller. + if start+sz > len(m.Data) { + const oneGB = 1 << 30 + growBy := len(m.Data) + if growBy > oneGB { + growBy = oneGB + } + if growBy < sz+4 { + growBy = sz + 4 + } + if err := m.Truncature(int64(len(m.Data) + growBy)); err != nil { + return nil, 0, err + } + } + + binary.BigEndian.PutUint32(m.Data[offset:], uint32(sz)) + return m.Data[start : start+sz], start + sz, nil +} + +const oneGB = 1 << 30 + +// AppendBuffer 向内存中追加一个buffer,如果空间不足则重新映射,扩大空间 +func (m *MmapFile) AppendBuffer(offset uint32, buf []byte) error { + size := len(m.Data) + needSize := len(buf) + end := int(offset) + needSize + if end > size { + growBy := size + if growBy > oneGB { + growBy = oneGB + } + if growBy < needSize { + growBy = needSize + } + if err := m.Truncature(int64(end)); err != nil { + return err + } + } + dLen := copy(m.Data[offset:end], buf) + if dLen != needSize { + return errors.Errorf("dLen != needSize AppendBuffer failed") + } + return nil +} + +func (m *MmapFile) Sync() error { + if m == nil { + return nil + } + return mmap.Msync(m.Data) +} + +func (m *MmapFile) Delete() error { + if m.Fd == nil { + return nil + } + + if err := mmap.Munmap(m.Data); err != nil { + return fmt.Errorf("while munmap file: %s, error: %v\n", m.Fd.Name(), err) + } + m.Data = nil + if err := m.Fd.Truncate(0); err != nil { + return fmt.Errorf("while truncate file: %s, error: %v\n", m.Fd.Name(), err) + } + if err := m.Fd.Close(); err != nil { + return fmt.Errorf("while close file: %s, error: %v\n", m.Fd.Name(), 
err) + } + return os.Remove(m.Fd.Name()) +} + +// Close would close the file. It would also truncate the file if maxSz >= 0. +func (m *MmapFile) Close() error { + if m.Fd == nil { + return nil + } + if err := m.Sync(); err != nil { + return fmt.Errorf("while sync file: %s, error: %v\n", m.Fd.Name(), err) + } + if err := mmap.Munmap(m.Data); err != nil { + return fmt.Errorf("while munmap file: %s, error: %v\n", m.Fd.Name(), err) + } + return m.Fd.Close() +} + +func SyncDir(dir string) error { + df, err := os.Open(dir) + if err != nil { + return errors.Wrapf(err, "while opening %s", dir) + } + if err := df.Sync(); err != nil { + return errors.Wrapf(err, "while syncing %s", dir) + } + if err := df.Close(); err != nil { + return errors.Wrapf(err, "while closing %s", dir) + } + return nil +} + +// Truncature 兼容接口 +func (m *MmapFile) Truncature(maxSz int64) error { + if err := m.Sync(); err != nil { + return fmt.Errorf("while sync file: %s, error: %v\n", m.Fd.Name(), err) + } + if err := mmap.Munmap(m.Data); err != nil { + return fmt.Errorf("while munmap file: %s, error: %v\n", m.Fd.Name(), err) + } + if err := m.Fd.Truncate(maxSz); err != nil { + return fmt.Errorf("while truncate file: %s, error: %v\n", m.Fd.Name(), err) + } + var err error + m.Data, err = mmap.Mmap(m.Fd, true, maxSz) // Mmap up to max size. + return err +} + +// ReName 兼容接口 +func (m *MmapFile) ReName(name string) error { + return nil +} diff --git a/file/mmap_linux.go b/file/mmap_linux.go new file mode 100644 index 0000000..fb2f1b3 --- /dev/null +++ b/file/mmap_linux.go @@ -0,0 +1,255 @@ +// +build linux + +// Copyright 2021 hardcore-os Project Authors +// +// Licensed under the Apache License, Version 2.0 (the "License") +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package file + +import ( + "encoding/binary" + "fmt" + "io" + "os" + "path/filepath" + + "github.com/hardcore-os/corekv/utils/mmap" + "github.com/pkg/errors" +) + +// MmapFile represents an mmapd file and includes both the buffer to the data and the file descriptor. +type MmapFile struct { + Data []byte + Fd *os.File +} + +// OpenMmapFileUsing os +func OpenMmapFileUsing(fd *os.File, sz int, writable bool) (*MmapFile, error) { + filename := fd.Name() + fi, err := fd.Stat() + if err != nil { + return nil, errors.Wrapf(err, "cannot stat file: %s", filename) + } + + var rerr error + fileSize := fi.Size() + if sz > 0 && fileSize == 0 { + // If file is empty, truncate it to sz. + if err := fd.Truncate(int64(sz)); err != nil { + return nil, errors.Wrapf(err, "error while truncation") + } + fileSize = int64(sz) + } + + // fmt.Printf("Mmaping file: %s with writable: %v filesize: %d\n", fd.Name(), writable, fileSize) + buf, err := mmap.Mmap(fd, writable, fileSize) // Mmap up to file size. + if err != nil { + return nil, errors.Wrapf(err, "while mmapping %s with size: %d", fd.Name(), fileSize) + } + + if fileSize == 0 { + dir, _ := filepath.Split(filename) + go SyncDir(dir) + } + return &MmapFile{ + Data: buf, + Fd: fd, + }, rerr +} + +// OpenMmapFile opens an existing file or creates a new file. If the file is +// created, it would truncate the file to maxSz. In both cases, it would mmap +// the file to maxSz and returned it. In case the file is created, z.NewFile is +// returned. 
+func OpenMmapFile(filename string, flag int, maxSz int) (*MmapFile, error) { + // fmt.Printf("opening file %s with flag: %v\n", filename, flag) + fd, err := os.OpenFile(filename, flag, 0666) + if err != nil { + return nil, errors.Wrapf(err, "unable to open: %s", filename) + } + writable := true + if flag == os.O_RDONLY { + writable = false + } + // 如果 sst文件层被打开过,则使用其文件原来的大小 + if fileInfo, err := fd.Stat(); err == nil && fileInfo != nil && fileInfo.Size() > 0 { + maxSz = int(fileInfo.Size()) + } + return OpenMmapFileUsing(fd, maxSz, writable) +} + +type mmapReader struct { + Data []byte + offset int +} + +func (mr *mmapReader) Read(buf []byte) (int, error) { + if mr.offset > len(mr.Data) { + return 0, io.EOF + } + n := copy(buf, mr.Data[mr.offset:]) + mr.offset += n + if n < len(buf) { + return n, io.EOF + } + return n, nil +} + +func (m *MmapFile) NewReader(offset int) io.Reader { + return &mmapReader{ + Data: m.Data, + offset: offset, + } +} + +// Bytes returns data starting from offset off of size sz. If there's not enough data, it would +// return nil slice and io.EOF. +func (m *MmapFile) Bytes(off, sz int) ([]byte, error) { + if len(m.Data[off:]) < sz { + return nil, io.EOF + } + return m.Data[off : off+sz], nil +} + +// Slice returns the slice at the given offset. +func (m *MmapFile) Slice(offset int) []byte { + sz := binary.BigEndian.Uint32(m.Data[offset:]) + start := offset + 4 + next := start + int(sz) + if next > len(m.Data) { + return []byte{} + } + res := m.Data[start:next] + return res +} + +// AllocateSlice allocates a slice of the given size at the given offset. +func (m *MmapFile) AllocateSlice(sz, offset int) ([]byte, int, error) { + start := offset + 4 + + // If the file is too small, double its size or increase it by 1GB, whichever is smaller. 
+ if start+sz > len(m.Data) { + const oneGB = 1 << 30 + growBy := len(m.Data) + if growBy > oneGB { + growBy = oneGB + } + if growBy < sz+4 { + growBy = sz + 4 + } + if err := m.Truncature(int64(len(m.Data) + growBy)); err != nil { + return nil, 0, err + } + } + + binary.BigEndian.PutUint32(m.Data[offset:], uint32(sz)) + return m.Data[start : start+sz], start + sz, nil +} + +const oneGB = 1 << 30 + +// AppendBuffer 向内存中追加一个buffer,如果空间不足则重新映射,扩大空间 +func (m *MmapFile) AppendBuffer(offset uint32, buf []byte) error { + size := len(m.Data) + needSize := len(buf) + end := int(offset) + needSize + if end > size { + growBy := size + if growBy > oneGB { + growBy = oneGB + } + if growBy < needSize { + growBy = needSize + } + if err := m.Truncature(int64(end)); err != nil { + return err + } + } + dLen := copy(m.Data[offset:end], buf) + if dLen != needSize { + return errors.Errorf("dLen != needSize AppendBuffer failed") + } + return nil +} + +func (m *MmapFile) Sync() error { + if m == nil { + return nil + } + return mmap.Msync(m.Data) +} + +func (m *MmapFile) Delete() error { + if m.Fd == nil { + return nil + } + + if err := mmap.Munmap(m.Data); err != nil { + return fmt.Errorf("while munmap file: %s, error: %v\n", m.Fd.Name(), err) + } + m.Data = nil + if err := m.Fd.Truncate(0); err != nil { + return fmt.Errorf("while truncate file: %s, error: %v\n", m.Fd.Name(), err) + } + if err := m.Fd.Close(); err != nil { + return fmt.Errorf("while close file: %s, error: %v\n", m.Fd.Name(), err) + } + return os.Remove(m.Fd.Name()) +} + +// Close would close the file. It would also truncate the file if maxSz >= 0. 
+func (m *MmapFile) Close() error { + if m.Fd == nil { + return nil + } + if err := m.Sync(); err != nil { + return fmt.Errorf("while sync file: %s, error: %v\n", m.Fd.Name(), err) + } + if err := mmap.Munmap(m.Data); err != nil { + return fmt.Errorf("while munmap file: %s, error: %v\n", m.Fd.Name(), err) + } + return m.Fd.Close() +} + +func SyncDir(dir string) error { + df, err := os.Open(dir) + if err != nil { + return errors.Wrapf(err, "while opening %s", dir) + } + if err := df.Sync(); err != nil { + return errors.Wrapf(err, "while syncing %s", dir) + } + if err := df.Close(); err != nil { + return errors.Wrapf(err, "while closing %s", dir) + } + return nil +} + +// Truncature 兼容接口 +func (m *MmapFile) Truncature(maxSz int64) error { + if err := m.Sync(); err != nil { + return fmt.Errorf("while sync file: %s, error: %v\n", m.Fd.Name(), err) + } + if err := m.Fd.Truncate(maxSz); err != nil { + return fmt.Errorf("while truncate file: %s, error: %v\n", m.Fd.Name(), err) + } + + var err error + m.Data, err = mmap.Mremap(m.Data, int(maxSz)) // Mmap up to max size. 
+ return err +} + +// ReName 兼容接口 +func (m *MmapFile) ReName(name string) error { + return nil +} diff --git a/file/mock.go b/file/mock.go deleted file mode 100644 index 10509aa..0000000 --- a/file/mock.go +++ /dev/null @@ -1,43 +0,0 @@ -package file - -import ( - "fmt" - "os" - - "github.com/hardcore-os/corekv/utils" -) - -// MockFile -type MockFile struct { - f *os.File -} - -// Close -func (lf *MockFile) Close() error { - if err := lf.f.Close(); err != nil { - return err - } - return nil -} - -func (lf *MockFile) Write(bytes []byte) (int, error) { - return lf.f.Write(bytes) -} -func (lf *MockFile) Read(bytes []byte) (int, error) { - return lf.f.Read(bytes) -} - -// Options -type Options struct { - Name string - Dir string -} - -// OpenMockFile mock 文件 -func OpenMockFile(opt *Options) *MockFile { - var err error - lf := &MockFile{} - lf.f, err = os.Open(fmt.Sprintf("%s/%s", opt.Dir, opt.Name)) - utils.Panic(err) - return lf -} diff --git a/file/sstable.go b/file/sstable.go deleted file mode 100644 index 2a5f5aa..0000000 --- a/file/sstable.go +++ /dev/null @@ -1,41 +0,0 @@ -package file - -import ( - "encoding/json" - "io/ioutil" - - "github.com/hardcore-os/corekv/utils" -) - -// SSTable 文件的内存封装 -type SSTable struct { - f *MockFile - indexs []byte - fid string -} - -// OpenSStable 打开一个 sst文件 -func OpenSStable(opt *Options) *SSTable { - return &SSTable{f: OpenMockFile(opt), fid: utils.FID(opt.Name)} -} - -// Indexs 获取sst文件索引 -func (ss *SSTable) Indexs() []byte { - if len(ss.indexs) == 0 { - bv, _ := ioutil.ReadAll(ss.f) - m := make(map[string]interface{}, 0) - json.Unmarshal(bv, &m) - if idx, ok := m["idx"]; !ok { - panic("sst idx is nil") - } else { - dataStr, _ := idx.(string) // hello,0 - ss.indexs = []byte(dataStr) - } - } - return ss.indexs -} - -// FID 获取fid -func (ss *SSTable) FID() string { - return ss.fid -} diff --git a/file/sstable_darwin.go b/file/sstable_darwin.go new file mode 100644 index 0000000..2540e4e --- /dev/null +++ b/file/sstable_darwin.go @@ 
-0,0 +1,196 @@ +// +build darwin + +// Copyright 2021 hardcore-os Project Authors +// +// Licensed under the Apache License, Version 2.0 (the "License") +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package file + +import ( + "io" + "os" + "sync" + "syscall" + "time" + + "github.com/golang/protobuf/proto" + "github.com/hardcore-os/corekv/pb" + "github.com/hardcore-os/corekv/utils" + "github.com/pkg/errors" +) + +// SSTable 文件的内存封装 +type SSTable struct { + lock *sync.RWMutex + f *MmapFile + maxKey []byte + minKey []byte + idxTables *pb.TableIndex + hasBloomFilter bool + idxLen int + idxStart int + fid uint64 + createdAt time.Time +} + +// OpenSStable 打开一个 sst文件 +func OpenSStable(opt *Options) *SSTable { + omf, err := OpenMmapFile(opt.FileName, os.O_CREATE|os.O_RDWR, opt.MaxSz) + utils.Err(err) + return &SSTable{f: omf, fid: opt.FID, lock: &sync.RWMutex{}} +} + +// Init 初始化 +func (ss *SSTable) Init() error { + var ko *pb.BlockOffset + var err error + if ko, err = ss.initTable(); err != nil { + return err + } + // 从文件中获取创建时间 + stat, _ := ss.f.Fd.Stat() + statType := stat.Sys().(*syscall.Stat_t) + ss.createdAt = time.Unix(statType.Atimespec.Sec, statType.Atimespec.Nsec) + // init min key + keyBytes := ko.GetKey() + minKey := make([]byte, len(keyBytes)) + copy(minKey, keyBytes) + ss.minKey = minKey + ss.maxKey = minKey + return nil +} + +// SetMaxKey max 需要使用table的迭代器,来获取最后一个block的最后一个key +func (ss *SSTable) SetMaxKey(maxKey []byte) { + ss.maxKey = maxKey +} +func (ss *SSTable) initTable() (bo *pb.BlockOffset, err error) { + 
readPos := len(ss.f.Data) + + // Read checksum len from the last 4 bytes. + readPos -= 4 + buf := ss.readCheckError(readPos, 4) + checksumLen := int(utils.BytesToU32(buf)) + if checksumLen < 0 { + return nil, errors.New("checksum length less than zero. Data corrupted") + } + + // Read checksum. + readPos -= checksumLen + expectedChk := ss.readCheckError(readPos, checksumLen) + + // Read index size from the footer. + readPos -= 4 + buf = ss.readCheckError(readPos, 4) + ss.idxLen = int(utils.BytesToU32(buf)) + + // Read index. + readPos -= ss.idxLen + ss.idxStart = readPos + data := ss.readCheckError(readPos, ss.idxLen) + if err := utils.VerifyChecksum(data, expectedChk); err != nil { + return nil, errors.Wrapf(err, "failed to verify checksum for table: %s", ss.f.Fd.Name()) + } + indexTable := &pb.TableIndex{} + if err := proto.Unmarshal(data, indexTable); err != nil { + return nil, err + } + ss.idxTables = indexTable + + ss.hasBloomFilter = len(indexTable.BloomFilter) > 0 + if len(indexTable.GetOffsets()) > 0 { + return indexTable.GetOffsets()[0], nil + } + return nil, errors.New("read index fail, offset is nil") +} + +// Close 关闭 +func (ss *SSTable) Close() error { + return ss.f.Close() +} + +// Indexs _ +func (ss *SSTable) Indexs() *pb.TableIndex { + return ss.idxTables +} + +// MaxKey 当前最大的key +func (ss *SSTable) MaxKey() []byte { + return ss.maxKey +} + +// MinKey 当前最小的key +func (ss *SSTable) MinKey() []byte { + return ss.minKey +} + +// FID 获取fid +func (ss *SSTable) FID() uint64 { + return ss.fid +} + +// HasBloomFilter _ +func (ss *SSTable) HasBloomFilter() bool { + return ss.hasBloomFilter +} + +func (ss *SSTable) read(off, sz int) ([]byte, error) { + if len(ss.f.Data) > 0 { + if len(ss.f.Data[off:]) < sz { + return nil, io.EOF + } + return ss.f.Data[off : off+sz], nil + } + + res := make([]byte, sz) + _, err := ss.f.Fd.ReadAt(res, int64(off)) + return res, err +} +func (ss *SSTable) readCheckError(off, sz int) []byte { + buf, err := ss.read(off, sz) + 
utils.Panic(err) + return buf +} + +// Bytes returns data starting from offset off of size sz. If there's not enough data, it would +// return nil slice and io.EOF. +func (ss *SSTable) Bytes(off, sz int) ([]byte, error) { + return ss.f.Bytes(off, sz) +} + +// Size 返回底层文件的尺寸 +func (ss *SSTable) Size() int64 { + fileStats, err := ss.f.Fd.Stat() + utils.Panic(err) + return fileStats.Size() +} + +// GetCreatedAt _ +func (ss *SSTable) GetCreatedAt() *time.Time { + return &ss.createdAt +} + +// SetCreatedAt _ +func (ss *SSTable) SetCreatedAt(t *time.Time) { + ss.createdAt = *t +} + +// Detele _ +func (ss *SSTable) Detele() error { + return ss.f.Delete() +} + +// Truncature _ +func (ss *SSTable) Truncature(size int64) error { + return ss.f.Truncature(size) +} diff --git a/file/sstable_linux.go b/file/sstable_linux.go new file mode 100644 index 0000000..fd63f9e --- /dev/null +++ b/file/sstable_linux.go @@ -0,0 +1,196 @@ +// +build linux + +// Copyright 2021 hardcore-os Project Authors +// +// Licensed under the Apache License, Version 2.0 (the "License") +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package file + +import ( + "io" + "os" + "sync" + "syscall" + "time" + + "github.com/golang/protobuf/proto" + "github.com/hardcore-os/corekv/pb" + "github.com/hardcore-os/corekv/utils" + "github.com/pkg/errors" +) + +// SSTable 文件的内存封装 +type SSTable struct { + lock *sync.RWMutex + f *MmapFile + maxKey []byte + minKey []byte + idxTables *pb.TableIndex + hasBloomFilter bool + idxLen int + idxStart int + fid uint64 + createdAt time.Time +} + +// OpenSStable 打开一个 sst文件 +func OpenSStable(opt *Options) *SSTable { + omf, err := OpenMmapFile(opt.FileName, os.O_CREATE|os.O_RDWR, opt.MaxSz) + utils.Err(err) + return &SSTable{f: omf, fid: opt.FID, lock: &sync.RWMutex{}} +} + +// Init 初始化 +func (ss *SSTable) Init() error { + var ko *pb.BlockOffset + var err error + if ko, err = ss.initTable(); err != nil { + return err + } + // 从文件中获取创建时间 + stat, _ := ss.f.Fd.Stat() + statType := stat.Sys().(*syscall.Stat_t) + ss.createdAt = time.Unix(statType.Ctim.Sec, statType.Ctim.Nsec) + // init min key + keyBytes := ko.GetKey() + minKey := make([]byte, len(keyBytes)) + copy(minKey, keyBytes) + ss.minKey = minKey + ss.maxKey = minKey + return nil +} + +// SetMaxKey max 需要使用table的迭代器,来获取最后一个block的最后一个key +func (ss *SSTable) SetMaxKey(maxKey []byte) { + ss.maxKey = maxKey +} +func (ss *SSTable) initTable() (bo *pb.BlockOffset, err error) { + readPos := len(ss.f.Data) + + // Read checksum len from the last 4 bytes. + readPos -= 4 + buf := ss.readCheckError(readPos, 4) + checksumLen := int(utils.BytesToU32(buf)) + if checksumLen < 0 { + return nil, errors.New("checksum length less than zero. Data corrupted") + } + + // Read checksum. + readPos -= checksumLen + expectedChk := ss.readCheckError(readPos, checksumLen) + + // Read index size from the footer. + readPos -= 4 + buf = ss.readCheckError(readPos, 4) + ss.idxLen = int(utils.BytesToU32(buf)) + + // Read index. 
+ readPos -= ss.idxLen + ss.idxStart = readPos + data := ss.readCheckError(readPos, ss.idxLen) + if err := utils.VerifyChecksum(data, expectedChk); err != nil { + return nil, errors.Wrapf(err, "failed to verify checksum for table: %s", ss.f.Fd.Name()) + } + indexTable := &pb.TableIndex{} + if err := proto.Unmarshal(data, indexTable); err != nil { + return nil, err + } + ss.idxTables = indexTable + + ss.hasBloomFilter = len(indexTable.BloomFilter) > 0 + if len(indexTable.GetOffsets()) > 0 { + return indexTable.GetOffsets()[0], nil + } + return nil, errors.New("read index fail, offset is nil") +} + +// Close 关闭 +func (ss *SSTable) Close() error { + return ss.f.Close() +} + +// Indexs _ +func (ss *SSTable) Indexs() *pb.TableIndex { + return ss.idxTables +} + +// MaxKey 当前最大的key +func (ss *SSTable) MaxKey() []byte { + return ss.maxKey +} + +// MinKey 当前最小的key +func (ss *SSTable) MinKey() []byte { + return ss.minKey +} + +// FID 获取fid +func (ss *SSTable) FID() uint64 { + return ss.fid +} + +// HasBloomFilter _ +func (ss *SSTable) HasBloomFilter() bool { + return ss.hasBloomFilter +} + +func (ss *SSTable) read(off, sz int) ([]byte, error) { + if len(ss.f.Data) > 0 { + if len(ss.f.Data[off:]) < sz { + return nil, io.EOF + } + return ss.f.Data[off : off+sz], nil + } + + res := make([]byte, sz) + _, err := ss.f.Fd.ReadAt(res, int64(off)) + return res, err +} +func (ss *SSTable) readCheckError(off, sz int) []byte { + buf, err := ss.read(off, sz) + utils.Panic(err) + return buf +} + +// Bytes returns data starting from offset off of size sz. If there's not enough data, it would +// return nil slice and io.EOF. 
+func (ss *SSTable) Bytes(off, sz int) ([]byte, error) { + return ss.f.Bytes(off, sz) +} + +// Size 返回底层文件的尺寸 +func (ss *SSTable) Size() int64 { + fileStats, err := ss.f.Fd.Stat() + utils.Panic(err) + return fileStats.Size() +} + +// GetCreatedAt _ +func (ss *SSTable) GetCreatedAt() *time.Time { + return &ss.createdAt +} + +// SetCreatedAt _ +func (ss *SSTable) SetCreatedAt(t *time.Time) { + ss.createdAt = *t +} + +// Detele _ +func (ss *SSTable) Detele() error { + return ss.f.Delete() +} + +// Truncature _ +func (ss *SSTable) Truncature(size int64) error { + return ss.f.Truncature(size) +} diff --git a/file/vlog.go b/file/vlog.go index b691ba5..e134f32 100644 --- a/file/vlog.go +++ b/file/vlog.go @@ -1 +1,187 @@ package file + +import ( + "bytes" + "encoding/binary" + "fmt" + "hash/crc32" + "io" + "math" + "os" + "sync" + "sync/atomic" + + "github.com/hardcore-os/corekv/utils" + "github.com/pkg/errors" +) + +type LogFile struct { + Lock sync.RWMutex + FID uint32 + size uint32 + f *MmapFile +} + +func (lf *LogFile) Open(opt *Options) error { + var err error + lf.FID = uint32(opt.FID) + lf.Lock = sync.RWMutex{} + lf.f, err = OpenMmapFile(opt.FileName, os.O_CREATE|os.O_RDWR, opt.MaxSz) + utils.Panic2(nil, err) + fi, err := lf.f.Fd.Stat() + if err != nil { + return utils.WarpErr("Unable to run file.Stat", err) + } + // 获取文件尺寸 + sz := fi.Size() + utils.CondPanic(sz > math.MaxUint32, fmt.Errorf("file size: %d greater than %d", + uint32(sz), uint32(math.MaxUint32))) + lf.size = uint32(sz) + // TODO 是否要在这里弄一个header放一些元数据呢? + return nil +} + +// Acquire lock on mmap/file if you are calling this +func (lf *LogFile) Read(p *utils.ValuePtr) (buf []byte, err error) { + offset := p.Offset + // Do not convert size to uint32, because the lf.fmap can be of size + // 4GB, which overflows the uint32 during conversion to make the size 0, + // causing the read to fail with ErrEOF. See issue #585. 
+ size := int64(len(lf.f.Data)) + valsz := p.Len + lfsz := atomic.LoadUint32(&lf.size) + if int64(offset) >= size || int64(offset+valsz) > size || + // Ensure that the read is within the file's actual size. It might be possible that + // the offset+valsz length is beyond the file's actual size. This could happen when + // dropAll and iterations are running simultaneously. + int64(offset+valsz) > int64(lfsz) { + err = io.EOF + } else { + buf, err = lf.f.Bytes(int(offset), int(valsz)) + } + return buf, err +} + +func (lf *LogFile) DoneWriting(offset uint32) error { + // Sync before acquiring lock. (We call this from write() and thus know we have shared access + // to the fd.) + if err := lf.f.Sync(); err != nil { + return errors.Wrapf(err, "Unable to sync value log: %q", lf.FileName()) + } + + // 写嘛 总是要锁一下的 + lf.Lock.Lock() + defer lf.Lock.Unlock() + + // TODO: Confirm if we need to run a file sync after truncation. + // Truncation must run after unmapping, otherwise Windows would crap itself. + if err := lf.f.Truncature(int64(offset)); err != nil { + return errors.Wrapf(err, "Unable to truncate file: %q", lf.FileName()) + } + + // Reinitialize the log file. This will mmap the entire file. + if err := lf.Init(); err != nil { + return errors.Wrapf(err, "failed to initialize file %s", lf.FileName()) + } + + // Previously we used to close the file after it was written and reopen it in read-only mode. + // We no longer open files in read-only mode. We keep all vlog files open in read-write mode. 
+ return nil +} +func (lf *LogFile) Write(offset uint32, buf []byte) (err error) { + return lf.f.AppendBuffer(offset, buf) +} +func (lf *LogFile) Truncate(offset int64) error { + return lf.f.Truncature(offset) +} +func (lf *LogFile) Close() error { + return lf.f.Close() +} + +func (lf *LogFile) Size() int64 { + return int64(atomic.LoadUint32(&lf.size)) +} +func (lf *LogFile) AddSize(offset uint32) { + atomic.StoreUint32(&lf.size, offset) +} + +// 完成log文件的初始化 +func (lf *LogFile) Bootstrap() error { + // TODO 是否需要初始化一些内容给vlog文件? + return nil +} + +func (lf *LogFile) Init() error { + fstat, err := lf.f.Fd.Stat() + if err != nil { + return errors.Wrapf(err, "Unable to check stat for %q", lf.FileName()) + } + sz := fstat.Size() + if sz == 0 { + // File is empty. We don't need to mmap it. Return. + return nil + } + utils.CondPanic(sz > math.MaxUint32, fmt.Errorf("[LogFile.Init] sz > math.MaxUint32")) + lf.size = uint32(sz) + return nil +} +func (lf *LogFile) FileName() string { + return lf.f.Fd.Name() +} + +func (lf *LogFile) Seek(offset int64, whence int) (ret int64, err error) { + return lf.f.Fd.Seek(offset, whence) +} + +func (lf *LogFile) FD() *os.File { + return lf.f.Fd +} + +// You must hold lf.lock to sync() +func (lf *LogFile) Sync() error { + return lf.f.Sync() +} + +// encodeEntry will encode entry to the buf +// layout of entry +// +--------+-----+-------+-------+ +// | header | key | value | crc32 | +// +--------+-----+-------+-------+ +func (lf *LogFile) EncodeEntry(e *utils.Entry, buf *bytes.Buffer, offset uint32) (int, error) { + h := utils.Header{ + KLen: uint32(len(e.Key)), + VLen: uint32(len(e.Value)), + ExpiresAt: e.ExpiresAt, + Meta: e.Meta, + } + + hash := crc32.New(utils.CastagnoliCrcTable) + writer := io.MultiWriter(buf, hash) + + // encode header. + var headerEnc [utils.MaxHeaderSize]byte + sz := h.Encode(headerEnc[:]) + utils.Panic2(writer.Write(headerEnc[:sz])) + // Encryption is disabled so writing directly to the buffer. 
+ utils.Panic2(writer.Write(e.Key)) + utils.Panic2(writer.Write(e.Value)) + // write crc32 hash. + var crcBuf [crc32.Size]byte + binary.BigEndian.PutUint32(crcBuf[:], hash.Sum32()) + utils.Panic2(buf.Write(crcBuf[:])) + // return encoded length. + return len(headerEnc[:sz]) + len(e.Key) + len(e.Value) + len(crcBuf), nil +} +func (lf *LogFile) DecodeEntry(buf []byte, offset uint32) (*utils.Entry, error) { + var h utils.Header + hlen := h.Decode(buf) + kv := buf[hlen:] + e := &utils.Entry{ + Meta: h.Meta, + ExpiresAt: h.ExpiresAt, + Offset: offset, + Key: kv[:h.KLen], + Value: kv[h.KLen : h.KLen+h.VLen], + } + return e, nil +} diff --git a/file/wal.go b/file/wal.go index 70d745b..b4123ba 100644 --- a/file/wal.go +++ b/file/wal.go @@ -1,24 +1,194 @@ +// Copyright 2021 hardcore-os Project Authors +// +// Licensed under the Apache License, Version 2.0 (the "License") +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ package file -import "github.com/hardcore-os/corekv/utils/codec" +import ( + "bufio" + "bytes" + "fmt" + "hash/crc32" + "io" + "os" + "sync" + + "github.com/hardcore-os/corekv/utils" + "github.com/pkg/errors" +) +// WalFile _ type WalFile struct { - f *MockFile + lock *sync.RWMutex + f *MmapFile + opts *Options + buf *bytes.Buffer + size uint32 + writeAt uint32 } -// WalFile +// Fid _ +func (wf *WalFile) Fid() uint64 { + return wf.opts.FID +} + +// Close _ func (wf *WalFile) Close() error { + fileName := wf.f.Fd.Name() if err := wf.f.Close(); err != nil { return err } - return nil + return os.Remove(fileName) +} + +// Name _ +func (wf *WalFile) Name() string { + return wf.f.Fd.Name() +} + +// Size 当前已经被写入的数据 +func (wf *WalFile) Size() uint32 { + return wf.writeAt } -func OpenWalFile(opt *Options) *WalFile { return &WalFile{f: OpenMockFile(opt)} } -func (wf *WalFile) Write(entry *codec.Entry) error { +// OpenWalFile _ +func OpenWalFile(opt *Options) *WalFile { + omf, err := OpenMmapFile(opt.FileName, os.O_CREATE|os.O_RDWR, opt.MaxSz) + wf := &WalFile{f: omf, lock: &sync.RWMutex{}, opts: opt} + wf.buf = &bytes.Buffer{} + wf.size = uint32(len(wf.f.Data)) + utils.Err(err) + return wf +} + +func (wf *WalFile) Write(entry *utils.Entry) error { // 落预写日志简单的同步写即可 // 序列化为磁盘结构 - walData := codec.WalCodec(entry) - _, err := wf.f.Write(walData) - return err + wf.lock.Lock() + plen := utils.WalCodec(wf.buf, entry) + buf := wf.buf.Bytes() + utils.Panic(wf.f.AppendBuffer(wf.writeAt, buf)) + wf.writeAt += uint32(plen) + wf.lock.Unlock() + return nil +} + +// Iterate 从磁盘中遍历wal,获得数据 +func (wf *WalFile) Iterate(readOnly bool, offset uint32, fn utils.LogEntry) (uint32, error) { + // For now, read directly from file, because it allows + reader := bufio.NewReader(wf.f.NewReader(int(offset))) + read := SafeRead{ + K: make([]byte, 10), + V: make([]byte, 10), + RecordOffset: offset, + LF: wf, + } + var validEndOffset uint32 = offset +loop: + for { + e, err := read.MakeEntry(reader) + 
switch { + case err == io.EOF: + break loop + case err == io.ErrUnexpectedEOF || err == utils.ErrTruncate: + break loop + case err != nil: + return 0, err + case e.IsZero(): + break loop + } + + var vp utils.ValuePtr // 给kv分离的设计留下扩展,可以不用考虑其作用 + size := uint32(int(e.LogHeaderLen()) + len(e.Key) + len(e.Value) + crc32.Size) + read.RecordOffset += size + validEndOffset = read.RecordOffset + if err := fn(e, &vp); err != nil { + if err == utils.ErrStop { + break + } + return 0, errors.WithMessage(err, "Iteration function") + } + } + return validEndOffset, nil +} + +// Truncate _ +// TODO Truncate 函数 +func (wf *WalFile) Truncate(end int64) error { + if end <= 0 { + return nil + } + if fi, err := wf.f.Fd.Stat(); err != nil { + return fmt.Errorf("while file.stat on file: %s, error: %v\n", wf.Name(), err) + } else if fi.Size() == end { + return nil + } + wf.size = uint32(end) + return wf.f.Truncature(end) +} + +// 封装kv分离的读操作 +type SafeRead struct { + K []byte + V []byte + + RecordOffset uint32 + LF *WalFile +} + +// MakeEntry _ +func (r *SafeRead) MakeEntry(reader io.Reader) (*utils.Entry, error) { + tee := utils.NewHashReader(reader) + var h utils.WalHeader + hlen, err := h.Decode(tee) + if err != nil { + return nil, err + } + if h.KeyLen > uint32(1<<16) { // Key length must be below uint16. 
+ return nil, utils.ErrTruncate + } + kl := int(h.KeyLen) + if cap(r.K) < kl { + r.K = make([]byte, 2*kl) + } + vl := int(h.ValueLen) + if cap(r.V) < vl { + r.V = make([]byte, 2*vl) + } + + e := &utils.Entry{} + e.Offset = r.RecordOffset + e.Hlen = hlen + buf := make([]byte, h.KeyLen+h.ValueLen) + if _, err := io.ReadFull(tee, buf[:]); err != nil { + if err == io.EOF { + err = utils.ErrTruncate + } + return nil, err + } + e.Key = buf[:h.KeyLen] + e.Value = buf[h.KeyLen:] + var crcBuf [crc32.Size]byte + if _, err := io.ReadFull(reader, crcBuf[:]); err != nil { + if err == io.EOF { + err = utils.ErrTruncate + } + return nil, err + } + crc := utils.BytesToU32(crcBuf[:]) + if crc != tee.Sum32() { + return nil, utils.ErrTruncate + } + e.ExpiresAt = h.ExpiresAt + return e, nil } diff --git a/gen.sh b/gen.sh new file mode 100755 index 0000000..0b5f449 --- /dev/null +++ b/gen.sh @@ -0,0 +1,17 @@ +#!/usr/bin/env bash +### Copyright hardcore-os Project Authors +### + # Licensed under the Apache License, Version 2.0 (the "License") + # you may not use this file except in compliance with the License. + # You may obtain a copy of the License at + # + # http://www.apache.org/licenses/LICENSE-2.0 + # + # Unless required by applicable law or agreed to in writing, software + # distributed under the License is distributed on an "AS IS" BASIS, + # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + # See the License for the specific language governing permissions and + # limitations under the License. 
+protoDir="pb" +outDir="pb" +protoc -I ${protoDir}/ ${protoDir}/pb.proto --gofast_out=plugins=grpc:${outDir} \ No newline at end of file diff --git a/go.mod b/go.mod index 87c7369..4fd9730 100644 --- a/go.mod +++ b/go.mod @@ -2,4 +2,13 @@ module github.com/hardcore-os/corekv go 1.16 -require github.com/stretchr/testify v1.7.0 +require ( + github.com/cespare/xxhash/v2 v2.1.2 + github.com/davecgh/go-spew v1.1.1 // indirect + github.com/golang/protobuf v1.5.2 + github.com/pkg/errors v0.9.1 + github.com/stretchr/testify v1.7.0 + golang.org/x/sys v0.0.0-20210910150752-751e447fb3d0 + golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1 // indirect + google.golang.org/protobuf v1.27.1 // indirect +) diff --git a/go.sum b/go.sum index b380ae4..14b8728 100644 --- a/go.sum +++ b/go.sum @@ -1,10 +1,30 @@ -github.com/davecgh/go-spew v1.1.0 h1:ZDRjVQ15GmhC3fiQ8ni8+OwkZQO4DARzQgrnXU1Liz8= +github.com/cespare/xxhash/v2 v2.1.2 h1:YRXhKfTDauu4ajMg1TPgFO5jnlC2HCbmLXMcTG5cbYE= +github.com/cespare/xxhash/v2 v2.1.2/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/golang/protobuf v1.5.0/go.mod h1:FsONVRAS9T7sI+LIUmWTfcYkHO4aIWwzhcaSAoJOfIk= +github.com/golang/protobuf v1.5.2 h1:ROPKBNFfQgOUMifHyP+KYbvpjbdoFNs+aK7DXlji0Tw= +github.com/golang/protobuf v1.5.2/go.mod h1:XVQd3VNwM+JqD3oG2Ue2ip4fOMUkwXdXDdiuN0vRsmY= +github.com/google/go-cmp v0.5.5 h1:Khx7svrCpmxxtHBq5j2mp/xVjsi8hQMfNLvJFAlrGgU= +github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= +github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= +github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= github.com/pmezard/go-difflib v1.0.0 
h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/testify v1.7.0 h1:nwc3DEeHmmLAfoZucVR881uASk0Mfjw8xYJ99tb5CcY= github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= +golang.org/x/sys v0.0.0-20210910150752-751e447fb3d0 h1:xrCZDmdtoloIiooiA9q0OQb9r8HejIHYoHGhGCe1pGg= +golang.org/x/sys v0.0.0-20210910150752-751e447fb3d0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1 h1:go1bK/D/BFZV2I8cIQd1NKEZ+0owSTG1fDTci4IqFcE= +golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +google.golang.org/protobuf v1.26.0-rc.1/go.mod h1:jlhhOSvTdKEhbULTjvd4ARK9grFBp09yW+WbY/TyQbw= +google.golang.org/protobuf v1.26.0/go.mod h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQnmE0givc= +google.golang.org/protobuf v1.27.1 h1:SnqbnDw1V7RiZcXPx5MEeqPv2s79L9i7BJUlG/+RurQ= +google.golang.org/protobuf v1.27.1/go.mod h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQnmE0givc= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c h1:dUUwHk2QECo/6vqA44rthZ8ie2QXMNeKRTHCNY2nXvo= gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= diff --git a/iterator.go b/iterator.go index d13ace2..465de15 100644 --- a/iterator.go +++ b/iterator.go @@ -1,39 +1,93 @@ +// Copyright 2021 logicrec Project Authors +// +// Licensed under the Apache License, Version 2.0 (the "License") +// you may not use this file except in 
compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + package corekv import ( - "github.com/hardcore-os/corekv/iterator" - "github.com/hardcore-os/corekv/utils/codec" + "github.com/hardcore-os/corekv/lsm" + "github.com/hardcore-os/corekv/utils" ) type DBIterator struct { - iters []iterator.Iterator + iitr utils.Iterator + vlog *valueLog } type Item struct { - e *codec.Entry + e *utils.Entry } -func (it *Item) Entry() *codec.Entry { +func (it *Item) Entry() *utils.Entry { return it.e } -func (db *DB) NewIterator(opt *iterator.Options) iterator.Iterator { - dbIter := &DBIterator{} - dbIter.iters = make([]iterator.Iterator, 0) - dbIter.iters = append(dbIter.iters, db.lsm.NewIterator(opt)) - return dbIter +func (db *DB) NewIterator(opt *utils.Options) utils.Iterator { + iters := make([]utils.Iterator, 0) + iters = append(iters, db.lsm.NewIterators(opt)...) 
+
+	res := &DBIterator{
+		vlog: db.vlog,
+		iitr: lsm.NewMergeIterator(iters, opt.IsAsc),
+	}
+	return res
 }
 
 func (iter *DBIterator) Next() {
-	iter.iters[0].Next()
+	iter.iitr.Next()
+	for ; iter.Valid() && iter.Item() == nil; iter.iitr.Next() {
+	}
 }
 func (iter *DBIterator) Valid() bool {
-	return iter.iters[0].Valid()
+	return iter.iitr.Valid()
 }
 func (iter *DBIterator) Rewind() {
-	iter.iters[0].Rewind()
+	iter.iitr.Rewind()
+	for ; iter.Valid() && iter.Item() == nil; iter.iitr.Next() {
+	}
 }
-func (iter *DBIterator) Item() iterator.Item {
-	return iter.iters[0].Item()
+func (iter *DBIterator) Item() utils.Item {
+	// If the entry from the LSM holds a value pointer, fetch the real value from the vlog.
+	e := iter.iitr.Item().Entry()
+	var value []byte
+
+	if e != nil && utils.IsValuePtr(e) {
+		var vp utils.ValuePtr
+		vp.Decode(e.Value)
+		result, cb, err := iter.vlog.read(&vp)
+		defer utils.RunCallback(cb)
+		if err != nil {
+			return nil
+		}
+		value = utils.SafeCopy(nil, result)
+	}
+
+	if e == nil || e.IsDeletedOrExpired() || value == nil { // nil guard: merge iterator may yield no entry; avoid nil deref below
+		return nil
+	}
+
+	res := &utils.Entry{
+		Key:          e.Key,
+		Value:        value,
+		ExpiresAt:    e.ExpiresAt,
+		Meta:         e.Meta,
+		Version:      e.Version,
+		Offset:       e.Offset,
+		Hlen:         e.Hlen,
+		ValThreshold: e.ValThreshold,
+	}
+	return res
 }
 
 func (iter *DBIterator) Close() error {
-	return nil
+	return iter.iitr.Close()
+}
+func (iter *DBIterator) Seek(key []byte) {
 }
diff --git a/iterator/iterator.go b/iterator/iterator.go
deleted file mode 100644
index f6c710a..0000000
--- a/iterator/iterator.go
+++ /dev/null
@@ -1,19 +0,0 @@
-package iterator
-
-import "github.com/hardcore-os/corekv/utils/codec"
-
-// 迭代器
-type Iterator interface {
-	Next()
-	Valid() bool
-	Rewind()
-	Item() Item
-	Close() error
-}
-type Item interface {
-	Entry() *codec.Entry
-}
-type Options struct {
-	Prefix []byte
-	IsAsc  bool
-}
diff --git a/lsm/builder.go b/lsm/builder.go
new file mode 100644
index 0000000..a6d9c80
--- /dev/null
+++ b/lsm/builder.go
@@ -0,0 +1,479 @@
+// Copyright 2021 hardcore-os Project Authors
+//
+// 
Licensed under the Apache License, Version 2.0 (the "License") +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +package lsm + +import ( + "bytes" + "errors" + "fmt" + "io" + "math" + "os" + "sort" + "unsafe" + + "github.com/hardcore-os/corekv/file" + "github.com/hardcore-os/corekv/pb" + "github.com/hardcore-os/corekv/utils" +) + +type tableBuilder struct { + sstSize int64 + curBlock *block + opt *Options + blockList []*block + keyCount uint32 + keyHashes []uint32 + maxVersion uint64 + baseKey []byte + staleDataSize int + estimateSz int64 +} +type buildData struct { + blockList []*block + index []byte + checksum []byte + size int +} +type block struct { + offset int //当前block的offset 首地址 + checksum []byte + entriesIndexStart int + chkLen int + data []byte + baseKey []byte + entryOffsets []uint32 + end int + estimateSz int64 +} + +type header struct { + overlap uint16 // Overlap with base key. + diff uint16 // Length of the diff. +} + +const headerSize = uint16(unsafe.Sizeof(header{})) + +// Decode decodes the header. +func (h *header) decode(buf []byte) { + copy(((*[headerSize]byte)(unsafe.Pointer(h))[:]), buf[:headerSize]) +} + +func (h header) encode() []byte { + var b [4]byte + *(*header)(unsafe.Pointer(&b[0])) = h + return b[:] +} + +func (tb *tableBuilder) add(e *utils.Entry, isStale bool) { + key := e.Key + val := utils.ValueStruct{ + Meta: e.Meta, + Value: e.Value, + ExpiresAt: e.ExpiresAt, + } + // 检查是否需要分配一个新的 block + if tb.tryFinishBlock(e) { + if isStale { + // This key will be added to tableIndex and it is stale. 
+ tb.staleDataSize += len(key) + 4 /* len */ + 4 /* offset */ + } + tb.finishBlock() + // Create a new block and start writing. + tb.curBlock = &block{ + data: make([]byte, tb.opt.BlockSize), // TODO 加密block后块的大小会增加,需要预留一些填充位置 + } + } + tb.keyHashes = append(tb.keyHashes, utils.Hash(utils.ParseKey(key))) + + if version := utils.ParseTs(key); version > tb.maxVersion { + tb.maxVersion = version + } + + var diffKey []byte + if len(tb.curBlock.baseKey) == 0 { + tb.curBlock.baseKey = append(tb.curBlock.baseKey[:0], key...) + diffKey = key + } else { + diffKey = tb.keyDiff(key) + } + utils.CondPanic(!(len(key)-len(diffKey) <= math.MaxUint16), fmt.Errorf("tableBuilder.add: len(key)-len(diffKey) <= math.MaxUint16")) + utils.CondPanic(!(len(diffKey) <= math.MaxUint16), fmt.Errorf("tableBuilder.add: len(diffKey) <= math.MaxUint16")) + + h := header{ + overlap: uint16(len(key) - len(diffKey)), + diff: uint16(len(diffKey)), + } + + tb.curBlock.entryOffsets = append(tb.curBlock.entryOffsets, uint32(tb.curBlock.end)) + + tb.append(h.encode()) + tb.append(diffKey) + + dst := tb.allocate(int(val.EncodedSize())) + val.EncodeValue(dst) +} +func newTableBuilerWithSSTSize(opt *Options, size int64) *tableBuilder { + return &tableBuilder{ + opt: opt, + sstSize: size, + } +} +func newTableBuiler(opt *Options) *tableBuilder { + return &tableBuilder{ + opt: opt, + sstSize: opt.SSTableMaxSz, + } +} + +// Empty returns whether it's empty. 
+func (tb *tableBuilder) empty() bool { return len(tb.keyHashes) == 0 }
+
+func (tb *tableBuilder) finish() []byte {
+	bd := tb.done()
+	buf := make([]byte, bd.size)
+	written := bd.Copy(buf)
+	utils.CondPanic(written != len(buf), fmt.Errorf("tableBuilder.finish written != len(buf)")) // panic on short copy, matching the check in flush()
+	return buf
+}
+func (tb *tableBuilder) tryFinishBlock(e *utils.Entry) bool {
+	if tb.curBlock == nil {
+		return true
+	}
+
+	if len(tb.curBlock.entryOffsets) <= 0 {
+		return false
+	}
+	utils.CondPanic(!((uint32(len(tb.curBlock.entryOffsets))+1)*4+4+8+4 < math.MaxUint32), errors.New("Integer overflow"))
+	entriesOffsetsSize := int64((len(tb.curBlock.entryOffsets)+1)*4 +
+		4 + // size of list
+		8 + // Sum64 in checksum proto
+		4) // checksum length
+	tb.curBlock.estimateSz = int64(tb.curBlock.end) + int64(6 /*header size for entry*/) +
+		int64(len(e.Key)) + int64(e.EncodedSize()) + entriesOffsetsSize
+
+	// Integer overflow check for table size.
+	utils.CondPanic(!(uint64(tb.curBlock.end)+uint64(tb.curBlock.estimateSz) < math.MaxUint32), errors.New("Integer overflow"))
+
+	return tb.curBlock.estimateSz > int64(tb.opt.BlockSize)
+}
+
+// AddStaleKey records the space occupied by a stale key, used for compaction decisions.
+func (tb *tableBuilder) AddStaleKey(e *utils.Entry) {
+	// Rough estimate based on how much space it will occupy in the SST.
+	tb.staleDataSize += len(e.Key) + len(e.Value) + 4 /* entry offset */ + 4 /* header size */
+	tb.add(e, true)
+}
+
+// AddKey _
+func (tb *tableBuilder) AddKey(e *utils.Entry) {
+	tb.add(e, false)
+}
+
+// Close closes the TableBuilder.
+func (tb *tableBuilder) Close() {
+	// TODO: integrate with a memory allocator
+}
+func (tb *tableBuilder) finishBlock() {
+	if tb.curBlock == nil || len(tb.curBlock.entryOffsets) == 0 {
+		return
+	}
+	// Append the entryOffsets and its length.
+	tb.append(utils.U32SliceToBytes(tb.curBlock.entryOffsets))
+	tb.append(utils.U32ToBytes(uint32(len(tb.curBlock.entryOffsets))))
+
+	checksum := tb.calculateChecksum(tb.curBlock.data[:tb.curBlock.end])
+
+	// Append the block checksum and its length. 
+ tb.append(checksum) + tb.append(utils.U32ToBytes(uint32(len(checksum)))) + tb.estimateSz += tb.curBlock.estimateSz + tb.blockList = append(tb.blockList, tb.curBlock) + // TODO: 预估整理builder写入磁盘后,sst文件的大小 + tb.keyCount += uint32(len(tb.curBlock.entryOffsets)) + tb.curBlock = nil // 表示当前block 已经被序列化到内存 + return +} + +// append appends to curBlock.data +func (tb *tableBuilder) append(data []byte) { + dst := tb.allocate(len(data)) + utils.CondPanic(len(data) != copy(dst, data), errors.New("tableBuilder.append data")) +} + +func (tb *tableBuilder) allocate(need int) []byte { + bb := tb.curBlock + if len(bb.data[bb.end:]) < need { + // We need to reallocate. + sz := 2 * len(bb.data) + if bb.end+need > sz { + sz = bb.end + need + } + tmp := make([]byte, sz) // todo 这里可以使用内存分配器来提升性能 + copy(tmp, bb.data) + bb.data = tmp + } + bb.end += need + return bb.data[bb.end-need : bb.end] +} + +func (tb *tableBuilder) calculateChecksum(data []byte) []byte { + checkSum := utils.CalculateChecksum(data) + return utils.U64ToBytes(checkSum) +} + +func (tb *tableBuilder) keyDiff(newKey []byte) []byte { + var i int + for i = 0; i < len(newKey) && i < len(tb.curBlock.baseKey); i++ { + if newKey[i] != tb.curBlock.baseKey[i] { + break + } + } + return newKey[i:] +} + +// TODO: 这里存在多次的用户空间拷贝过程,需要优化 +func (tb *tableBuilder) flush(lm *levelManager, tableName string) (t *table, err error) { + bd := tb.done() + t = &table{lm: lm, fid: utils.FID(tableName)} + // 如果没有builder 则创打开一个已经存在的sst文件 + t.ss = file.OpenSStable(&file.Options{ + FileName: tableName, + Dir: lm.opt.WorkDir, + Flag: os.O_CREATE | os.O_RDWR, + MaxSz: int(bd.size)}) + buf := make([]byte, bd.size) + written := bd.Copy(buf) + utils.CondPanic(written != len(buf), fmt.Errorf("tableBuilder.flush written != len(buf)")) + dst, err := t.ss.Bytes(0, bd.size) + if err != nil { + return nil, err + } + copy(dst, buf) + return t, nil +} + +func (bd *buildData) Copy(dst []byte) int { + var written int + for _, bl := range bd.blockList { + written 
+= copy(dst[written:], bl.data[:bl.end]) + } + written += copy(dst[written:], bd.index) + written += copy(dst[written:], utils.U32ToBytes(uint32(len(bd.index)))) + + written += copy(dst[written:], bd.checksum) + written += copy(dst[written:], utils.U32ToBytes(uint32(len(bd.checksum)))) + return written +} + +func (tb *tableBuilder) done() buildData { + tb.finishBlock() + if len(tb.blockList) == 0 { + return buildData{} + } + bd := buildData{ + blockList: tb.blockList, + } + + var f utils.Filter + if tb.opt.BloomFalsePositive > 0 { + bits := utils.BloomBitsPerKey(len(tb.keyHashes), tb.opt.BloomFalsePositive) + f = utils.NewFilter(tb.keyHashes, bits) + } + // TODO 构建 sst的索引 + index, dataSize := tb.buildIndex(f) + checksum := tb.calculateChecksum(index) + bd.index = index + bd.checksum = checksum + bd.size = int(dataSize) + len(index) + len(checksum) + 4 + 4 + return bd +} + +func (tb *tableBuilder) buildIndex(bloom []byte) ([]byte, uint32) { + tableIndex := &pb.TableIndex{} + if len(bloom) > 0 { + tableIndex.BloomFilter = bloom + } + tableIndex.KeyCount = tb.keyCount + tableIndex.MaxVersion = tb.maxVersion + tableIndex.Offsets = tb.writeBlockOffsets(tableIndex) + var dataSize uint32 + for i := range tb.blockList { + dataSize += uint32(tb.blockList[i].end) + } + data, err := tableIndex.Marshal() + utils.Panic(err) + return data, dataSize +} + +func (tb *tableBuilder) writeBlockOffsets(tableIndex *pb.TableIndex) []*pb.BlockOffset { + var startOffset uint32 + var offsets []*pb.BlockOffset + for _, bl := range tb.blockList { + offset := tb.writeBlockOffset(bl, startOffset) + offsets = append(offsets, offset) + startOffset += uint32(bl.end) + } + return offsets +} + +func (b *tableBuilder) writeBlockOffset(bl *block, startOffset uint32) *pb.BlockOffset { + offset := &pb.BlockOffset{} + offset.Key = bl.baseKey + offset.Len = uint32(bl.end) + offset.Offset = startOffset + return offset +} + +// TODO: 如何能更好的预估builder的长度呢? 
+func (b *tableBuilder) ReachedCapacity() bool { + return b.estimateSz > b.sstSize +} + +func (b block) verifyCheckSum() error { + return utils.VerifyChecksum(b.data, b.checksum) +} + +type blockIterator struct { + data []byte + idx int + err error + baseKey []byte + key []byte + val []byte + entryOffsets []uint32 + block *block + + tableID uint64 + blockID int + + prevOverlap uint16 + + it utils.Item +} + +func (itr *blockIterator) setBlock(b *block) { + itr.block = b + itr.err = nil + itr.idx = 0 + itr.baseKey = itr.baseKey[:0] + itr.prevOverlap = 0 + itr.key = itr.key[:0] + itr.val = itr.val[:0] + // Drop the index from the block. We don't need it anymore. + itr.data = b.data[:b.entriesIndexStart] + itr.entryOffsets = b.entryOffsets +} + +// seekToFirst brings us to the first element. +func (itr *blockIterator) seekToFirst() { + itr.setIdx(0) +} +func (itr *blockIterator) seekToLast() { + itr.setIdx(len(itr.entryOffsets) - 1) +} +func (itr *blockIterator) seek(key []byte) { + itr.err = nil + startIndex := 0 // This tells from which index we should start binary search. + + foundEntryIdx := sort.Search(len(itr.entryOffsets), func(idx int) bool { + // If idx is less than start index then just return false. + if idx < startIndex { + return false + } + itr.setIdx(idx) + return utils.CompareKeys(itr.key, key) >= 0 + }) + itr.setIdx(foundEntryIdx) +} + +func (itr *blockIterator) setIdx(i int) { + itr.idx = i + if i >= len(itr.entryOffsets) || i < 0 { + itr.err = io.EOF + return + } + itr.err = nil + startOffset := int(itr.entryOffsets[i]) + + // Set base key. + if len(itr.baseKey) == 0 { + var baseHeader header + baseHeader.decode(itr.data) + itr.baseKey = itr.data[headerSize : headerSize+baseHeader.diff] + } + + var endOffset int + // idx points to the last entry in the block. + if itr.idx+1 == len(itr.entryOffsets) { + endOffset = len(itr.data) + } else { + // idx point to some entry other than the last one in the block. 
+ // EndOffset of the current entry is the start offset of the next entry. + endOffset = int(itr.entryOffsets[itr.idx+1]) + } + defer func() { + if r := recover(); r != nil { + var debugBuf bytes.Buffer + fmt.Fprintf(&debugBuf, "==== Recovered====\n") + fmt.Fprintf(&debugBuf, "Table ID: %d\nBlock ID: %d\nEntry Idx: %d\nData len: %d\n"+ + "StartOffset: %d\nEndOffset: %d\nEntryOffsets len: %d\nEntryOffsets: %v\n", + itr.tableID, itr.blockID, itr.idx, len(itr.data), startOffset, endOffset, + len(itr.entryOffsets), itr.entryOffsets) + panic(debugBuf.String()) + } + }() + + entryData := itr.data[startOffset:endOffset] + var h header + h.decode(entryData) + if h.overlap > itr.prevOverlap { + itr.key = append(itr.key[:itr.prevOverlap], itr.baseKey[itr.prevOverlap:h.overlap]...) + } + + itr.prevOverlap = h.overlap + valueOff := headerSize + h.diff + diffKey := entryData[headerSize:valueOff] + itr.key = append(itr.key[:h.overlap], diffKey...) + e := &utils.Entry{Key: itr.key} + val := &utils.ValueStruct{} + val.DecodeValue(entryData[valueOff:]) + itr.val = val.Value + e.Value = val.Value + e.ExpiresAt = val.ExpiresAt + e.Meta = val.Meta + itr.it = &Item{e: e} +} + +func (itr *blockIterator) Error() error { + return itr.err +} + +func (itr *blockIterator) Next() { + itr.setIdx(itr.idx + 1) +} + +func (itr *blockIterator) Valid() bool { + return itr.err != io.EOF // TODO 这里用err比较好 +} +func (itr *blockIterator) Rewind() bool { + itr.setIdx(0) + return true +} +func (itr *blockIterator) Item() utils.Item { + return itr.it +} +func (itr *blockIterator) Close() error { + return nil +} diff --git a/lsm/cache.go b/lsm/cache.go index 24b54b7..4a83c41 100644 --- a/lsm/cache.go +++ b/lsm/cache.go @@ -1,15 +1,34 @@ +// Copyright 2021 hardcore-os Project Authors +// +// Licensed under the Apache License, Version 2.0 (the "License") +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + package lsm -import "github.com/hardcore-os/corekv/utils" +import ( + coreCache "github.com/hardcore-os/corekv/utils/cache" +) type cache struct { - indexs *utils.CoreMap // key fid, value table - blocks *utils.CoreMap // key cacheID_blockOffset value block []byte + indexs *coreCache.Cache // key fid, value table + blocks *coreCache.Cache // key fid_blockOffset value block []byte } + type blockBuffer struct { b []byte } +const defaultCacheSize = 1024 + // close func (c *cache) close() error { return nil @@ -17,11 +36,10 @@ func (c *cache) close() error { // newCache func newCache(opt *Options) *cache { - return &cache{indexs: utils.NewMap(), blocks: utils.NewMap()} + return &cache{indexs: coreCache.NewCache(defaultCacheSize), blocks: coreCache.NewCache(defaultCacheSize)} } - // TODO fid 使用字符串是不是会有性能损耗 -func (c *cache) addIndex(fid string, t *table) { +func (c *cache) addIndex(fid uint64, t *table) { c.indexs.Set(fid, t) } diff --git a/lsm/compact.go b/lsm/compact.go new file mode 100644 index 0000000..1e9c33e --- /dev/null +++ b/lsm/compact.go @@ -0,0 +1,1164 @@ +// Copyright 2021 hardcore-os Project Authors +// +// Licensed under the Apache License, Version 2.0 (the "License") +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package lsm + +import ( + "bytes" + "errors" + "fmt" + "log" + "math" + "math/rand" + "sort" + "strings" + "sync" + "sync/atomic" + "time" + + "github.com/hardcore-os/corekv/pb" + "github.com/hardcore-os/corekv/utils" +) + +// 归并优先级 +type compactionPriority struct { + level int + score float64 + adjusted float64 + dropPrefixes [][]byte + t targets +} + +// 归并目标 +type targets struct { + baseLevel int + targetSz []int64 + fileSz []int64 +} +type compactDef struct { + compactorId int + t targets + p compactionPriority + thisLevel *levelHandler + nextLevel *levelHandler + + top []*table + bot []*table + + thisRange keyRange + nextRange keyRange + splits []keyRange + + thisSize int64 + + dropPrefixes [][]byte +} + +func (cd *compactDef) lockLevels() { + cd.thisLevel.RLock() + cd.nextLevel.RLock() +} + +func (cd *compactDef) unlockLevels() { + cd.nextLevel.RUnlock() + cd.thisLevel.RUnlock() +} + +// runCompacter 启动一个compacter +func (lm *levelManager) runCompacter(id int) { + defer lm.lsm.closer.Done() + randomDelay := time.NewTimer(time.Duration(rand.Int31n(1000)) * time.Millisecond) + select { + case <-randomDelay.C: + case <-lm.lsm.closer.CloseSignal: + randomDelay.Stop() + return + } + //TODO 这个值有待验证 + ticker := time.NewTicker(50000 * time.Millisecond) + defer ticker.Stop() + for { + select { + // Can add a done channel or other stuff. 
+ case <-ticker.C: + lm.runOnce(id) + case <-lm.lsm.closer.CloseSignal: + return + } + } +} + +// runOnce +func (lm *levelManager) runOnce(id int) bool { + prios := lm.pickCompactLevels() + if id == 0 { + // 0号协程 总是倾向于压缩l0层 + prios = moveL0toFront(prios) + } + for _, p := range prios { + if id == 0 && p.level == 0 { + // 对于l0 无论得分多少都要运行 + } else if p.adjusted < 1.0 { + // 对于其他level 如果等分小于 则不执行 + break + } + if lm.run(id, p) { + return true + } + } + return false +} +func moveL0toFront(prios []compactionPriority) []compactionPriority { + idx := -1 + for i, p := range prios { + if p.level == 0 { + idx = i + break + } + } + // If idx == -1, we didn't find L0. + // If idx == 0, then we don't need to do anything. L0 is already at the front. + if idx > 0 { + out := append([]compactionPriority{}, prios[idx]) + out = append(out, prios[:idx]...) + out = append(out, prios[idx+1:]...) + return out + } + return prios +} + +// run 执行一个优先级指定的合并任务 +func (lm *levelManager) run(id int, p compactionPriority) bool { + err := lm.doCompact(id, p) + switch err { + case nil: + return true + case utils.ErrFillTables: + // 什么也不做,此时合并过程被忽略 + default: + log.Printf("[taskID:%d] While running doCompact: %v\n ", id, err) + } + return false +} + +// doCompact 选择level的某些表合并到目标level +func (lm *levelManager) doCompact(id int, p compactionPriority) error { + l := p.level + utils.CondPanic(l >= lm.opt.MaxLevelNum, errors.New("[doCompact] Sanity check. l >= lm.opt.MaxLevelNum")) // Sanity check. 
+ if p.t.baseLevel == 0 { + p.t = lm.levelTargets() + } + // 创建真正的压缩计划 + cd := compactDef{ + compactorId: id, + p: p, + t: p.t, + thisLevel: lm.levels[l], + dropPrefixes: p.dropPrefixes, + } + + // 如果是第0层 对齐单独填充处理 + if l == 0 { + cd.nextLevel = lm.levels[p.t.baseLevel] + if !lm.fillTablesL0(&cd) { + return utils.ErrFillTables + } + } else { + cd.nextLevel = cd.thisLevel + // 如果不是最后一层,则压缩到下一层即可 + if !cd.thisLevel.isLastLevel() { + cd.nextLevel = lm.levels[l+1] + } + if !lm.fillTables(&cd) { + return utils.ErrFillTables + } + } + // 完成合并后 从合并状态中删除 + defer lm.compactState.delete(cd) // Remove the ranges from compaction status. + + // 执行合并计划 + if err := lm.runCompactDef(id, l, cd); err != nil { + // This compaction couldn't be done successfully. + log.Printf("[Compactor: %d] LOG Compact FAILED with error: %+v: %+v", id, err, cd) + return err + } + + log.Printf("[Compactor: %d] Compaction for level: %d DONE", id, cd.thisLevel.levelNum) + return nil +} + +// pickCompactLevel 选择合适的level执行合并,返回判断的优先级 +func (lm *levelManager) pickCompactLevels() (prios []compactionPriority) { + t := lm.levelTargets() + addPriority := func(level int, score float64) { + pri := compactionPriority{ + level: level, + score: score, + adjusted: score, + t: t, + } + prios = append(prios, pri) + } + + // 根据l0表的table数量来对压缩提权 + addPriority(0, float64(lm.levels[0].numTables())/float64(lm.opt.NumLevelZeroTables)) + + // 非l0 层都根据大小计算优先级 + for i := 1; i < len(lm.levels); i++ { + // 处于压缩状态的sst 不能计算在内 + delSize := lm.compactState.delSize(i) + l := lm.levels[i] + sz := l.getTotalSize() - delSize + // score的计算是 扣除正在合并的表后的尺寸与目标sz的比值 + addPriority(i, float64(sz)/float64(t.targetSz[i])) + } + utils.CondPanic(len(prios) != len(lm.levels), errors.New("[pickCompactLevels] len(prios) != len(lm.levels)")) + + // 调整得分 + var prevLevel int + for level := t.baseLevel; level < len(lm.levels); level++ { + if prios[prevLevel].adjusted >= 1 { + // 避免过大的得分 + const minScore = 0.01 + if prios[level].score >= minScore { + 
prios[prevLevel].adjusted /= prios[level].adjusted + } else { + prios[prevLevel].adjusted /= minScore + } + } + prevLevel = level + } + + // 仅选择得分大于1的压缩内容,并且允许l0到l0的特殊压缩,为了提升查询性能允许l0层独自压缩 + out := prios[:0] + for _, p := range prios[:len(prios)-1] { + if p.score >= 1.0 { + out = append(out, p) + } + } + prios = out + + // 按优先级排序 + sort.Slice(prios, func(i, j int) bool { + return prios[i].adjusted > prios[j].adjusted + }) + return prios +} +func (lm *levelManager) lastLevel() *levelHandler { + return lm.levels[len(lm.levels)-1] +} + +// levelTargets +func (lm *levelManager) levelTargets() targets { + adjust := func(sz int64) int64 { + if sz < lm.opt.BaseLevelSize { + return lm.opt.BaseLevelSize + } + return sz + } + + // 初始化默认都是最大层级 + t := targets{ + targetSz: make([]int64, len(lm.levels)), + fileSz: make([]int64, len(lm.levels)), + } + // 从最后一个level开始计算 + dbSize := lm.lastLevel().getTotalSize() + for i := len(lm.levels) - 1; i > 0; i-- { + leveTargetSize := adjust(dbSize) + t.targetSz[i] = leveTargetSize + // 如果当前的level没有达到合并的要求 + if t.baseLevel == 0 && leveTargetSize <= lm.opt.BaseLevelSize { + t.baseLevel = i + } + dbSize /= int64(lm.opt.LevelSizeMultiplier) + } + + tsz := lm.opt.BaseTableSize + for i := 0; i < len(lm.levels); i++ { + if i == 0 { + // l0选择memtable的size作为文件的尺寸 + t.fileSz[i] = lm.opt.MemTableSize + } else if i <= t.baseLevel { + t.fileSz[i] = tsz + } else { + tsz *= int64(lm.opt.TableSizeMultiplier) + t.fileSz[i] = tsz + } + } + + // 找到最后一个空level作为目标level实现跨level归并,减少写放大 + for i := t.baseLevel + 1; i < len(lm.levels)-1; i++ { + if lm.levels[i].getTotalSize() > 0 { + break + } + t.baseLevel = i + } + + // 如果存在断层,则目标level++ + b := t.baseLevel + lvl := lm.levels + if b < len(lvl)-1 && lvl[b].getTotalSize() == 0 && lvl[b+1].getTotalSize() < t.targetSz[b+1] { + t.baseLevel++ + } + return t +} + +type thisAndNextLevelRLocked struct{} + +func (lm *levelManager) fillTables(cd *compactDef) bool { + cd.lockLevels() + defer cd.unlockLevels() + + tables := 
make([]*table, cd.thisLevel.numTables()) + copy(tables, cd.thisLevel.tables) + if len(tables) == 0 { + return false + } + // We're doing a maxLevel to maxLevel compaction. Pick tables based on the stale data size. + if cd.thisLevel.isLastLevel() { + return lm.fillMaxLevelTables(tables, cd) + } + // We pick tables, so we compact older tables first. This is similar to + // kOldestLargestSeqFirst in RocksDB. + lm.sortByHeuristic(tables, cd) + + for _, t := range tables { + cd.thisSize = t.Size() + cd.thisRange = getKeyRange(t) + // 如果被压缩过了,则什么都不需要做 + if lm.compactState.overlapsWith(cd.thisLevel.levelNum, cd.thisRange) { + continue + } + cd.top = []*table{t} + left, right := cd.nextLevel.overlappingTables(levelHandlerRLocked{}, cd.thisRange) + + cd.bot = make([]*table, right-left) + copy(cd.bot, cd.nextLevel.tables[left:right]) + + if len(cd.bot) == 0 { + cd.bot = []*table{} + cd.nextRange = cd.thisRange + if !lm.compactState.compareAndAdd(thisAndNextLevelRLocked{}, *cd) { + continue + } + return true + } + cd.nextRange = getKeyRange(cd.bot...) + + if lm.compactState.overlapsWith(cd.nextLevel.levelNum, cd.nextRange) { + continue + } + if !lm.compactState.compareAndAdd(thisAndNextLevelRLocked{}, *cd) { + continue + } + return true + } + return false +} + +// compact older tables first. +func (lm *levelManager) sortByHeuristic(tables []*table, cd *compactDef) { + if len(tables) == 0 || cd.nextLevel == nil { + return + } + + // Sort tables by max version. This is what RocksDB does. + sort.Slice(tables, func(i, j int) bool { + return tables[i].ss.Indexs().MaxVersion < tables[j].ss.Indexs().MaxVersion + }) +} +func (lm *levelManager) runCompactDef(id, l int, cd compactDef) (err error) { + if len(cd.t.fileSz) == 0 { + return errors.New("Filesizes cannot be zero. 
Targets are not set") + } + timeStart := time.Now() + + thisLevel := cd.thisLevel + nextLevel := cd.nextLevel + + utils.CondPanic(len(cd.splits) != 0, errors.New("len(cd.splits) != 0")) + if thisLevel == nextLevel { + // l0 to l0 和 lmax to lmax 不做特殊处理 + } else { + lm.addSplits(&cd) + } + // 追加一个空的 + if len(cd.splits) == 0 { + cd.splits = append(cd.splits, keyRange{}) + } + + newTables, decr, err := lm.compactBuildTables(l, cd) + if err != nil { + return err + } + defer func() { + // Only assign to err, if it's not already nil. + if decErr := decr(); err == nil { + err = decErr + } + }() + changeSet := buildChangeSet(&cd, newTables) + + // 删除之前先更新manifest文件 + if err := lm.manifestFile.AddChanges(changeSet.Changes); err != nil { + return err + } + + if err := nextLevel.replaceTables(cd.bot, newTables); err != nil { + return err + } + defer decrRefs(cd.top) + if err := thisLevel.deleteTables(cd.top); err != nil { + return err + } + + from := append(tablesToString(cd.top), tablesToString(cd.bot)...) 
+ to := tablesToString(newTables) + if dur := time.Since(timeStart); dur > 2*time.Second { + var expensive string + if dur > time.Second { + expensive = " [E]" + } + fmt.Printf("[%d]%s LOG Compact %d->%d (%d, %d -> %d tables with %d splits)."+ + " [%s] -> [%s], took %v\n", + id, expensive, thisLevel.levelNum, nextLevel.levelNum, len(cd.top), len(cd.bot), + len(newTables), len(cd.splits), strings.Join(from, " "), strings.Join(to, " "), + dur.Round(time.Millisecond)) + } + return nil +} + +// tablesToString +func tablesToString(tables []*table) []string { + var res []string + for _, t := range tables { + res = append(res, fmt.Sprintf("%05d", t.fid)) + } + res = append(res, ".") + return res +} + +// buildChangeSet _ +func buildChangeSet(cd *compactDef, newTables []*table) pb.ManifestChangeSet { + changes := []*pb.ManifestChange{} + for _, table := range newTables { + changes = append(changes, newCreateChange(table.fid, cd.nextLevel.levelNum)) + } + for _, table := range cd.top { + changes = append(changes, newDeleteChange(table.fid)) + } + for _, table := range cd.bot { + changes = append(changes, newDeleteChange(table.fid)) + } + return pb.ManifestChangeSet{Changes: changes} +} + +// +func newDeleteChange(id uint64) *pb.ManifestChange { + return &pb.ManifestChange{ + Id: id, + Op: pb.ManifestChange_DELETE, + } +} + +// newCreateChange +func newCreateChange(id uint64, level int) *pb.ManifestChange { + return &pb.ManifestChange{ + Id: id, + Op: pb.ManifestChange_CREATE, + Level: uint32(level), + } +} + +// compactBuildTables 合并两个层的sst文件 +func (lm *levelManager) compactBuildTables(lev int, cd compactDef) ([]*table, func() error, error) { + + topTables := cd.top + botTables := cd.bot + iterOpt := &utils.Options{ + IsAsc: true, + } + //numTables := int64(len(topTables) + len(botTables)) + newIterator := func() []utils.Iterator { + // Create iterators across all the tables involved first. 
+ var iters []utils.Iterator + switch { + case lev == 0: + iters = append(iters, iteratorsReversed(topTables, iterOpt)...) + case len(topTables) > 0: + iters = []utils.Iterator{topTables[0].NewIterator(iterOpt)} + } + return append(iters, NewConcatIterator(botTables, iterOpt)) + } + + // 开始并行执行压缩过程 + res := make(chan *table, 3) + inflightBuilders := utils.NewThrottle(8 + len(cd.splits)) + for _, kr := range cd.splits { + // Initiate Do here so we can register the goroutines for buildTables too. + if err := inflightBuilders.Do(); err != nil { + return nil, nil, fmt.Errorf("cannot start subcompaction: %+v", err) + } + // 开启一个协程去处理子压缩 + go func(kr keyRange) { + defer inflightBuilders.Done(nil) + it := NewMergeIterator(newIterator(), false) + defer it.Close() + lm.subcompact(it, kr, cd, inflightBuilders, res) + }(kr) + } + + // mapreduce的方式收集table的句柄 + var newTables []*table + var wg sync.WaitGroup + wg.Add(1) + go func() { + defer wg.Done() + for t := range res { + newTables = append(newTables, t) + } + }() + + // 在这里等待所有的压缩过程完成 + err := inflightBuilders.Finish() + // channel 资源回收 + close(res) + // 等待所有的builder刷到磁盘 + wg.Wait() + + if err == nil { + // 同步刷盘,保证数据一定落盘 + err = utils.SyncDir(lm.opt.WorkDir) + } + + if err != nil { + // 如果出现错误,则删除索引新创建的文件 + _ = decrRefs(newTables) + return nil, nil, fmt.Errorf("while running compactions for: %+v, %v", cd, err) + } + + sort.Slice(newTables, func(i, j int) bool { + return utils.CompareKeys(newTables[i].ss.MaxKey(), newTables[j].ss.MaxKey()) < 0 + }) + return newTables, func() error { return decrRefs(newTables) }, nil +} + +// 并行的运行子压缩情况 +func (lm *levelManager) addSplits(cd *compactDef) { + cd.splits = cd.splits[:0] + + // Let's say we have 10 tables in cd.bot and min width = 3. Then, we'll pick + // 0, 1, 2 (pick), 3, 4, 5 (pick), 6, 7, 8 (pick), 9 (pick, because last table). + // This gives us 4 picks for 10 tables. + // In an edge case, 142 tables in bottom led to 48 splits. 
That's too many splits, because it + // then uses up a lot of memory for table builder. + // We should keep it so we have at max 5 splits. + width := int(math.Ceil(float64(len(cd.bot)) / 5.0)) + if width < 3 { + width = 3 + } + skr := cd.thisRange + skr.extend(cd.nextRange) + + addRange := func(right []byte) { + skr.right = utils.Copy(right) + cd.splits = append(cd.splits, skr) + skr.left = skr.right + } + + for i, t := range cd.bot { + // last entry in bottom table. + if i == len(cd.bot)-1 { + addRange([]byte{}) + return + } + if i%width == width-1 { + // 设置最大值为右区间 + right := utils.KeyWithTs(utils.ParseKey(t.ss.MaxKey()), math.MaxUint64) + addRange(right) + } + } +} + +// sortByStaleData 对表中陈旧数据的数量对sst文件进行排序 +func (lm *levelManager) sortByStaleDataSize(tables []*table, cd *compactDef) { + if len(tables) == 0 || cd.nextLevel == nil { + return + } + // TODO 统计一个 sst文件中陈旧数据的数量,涉及对存储格式的修改 + sort.Slice(tables, func(i, j int) bool { + return tables[i].StaleDataSize() > tables[j].StaleDataSize() + }) +} + +// max level 和 max level 的压缩 +func (lm *levelManager) fillMaxLevelTables(tables []*table, cd *compactDef) bool { + sortedTables := make([]*table, len(tables)) + copy(sortedTables, tables) + lm.sortByStaleDataSize(sortedTables, cd) + + if len(sortedTables) > 0 && sortedTables[0].StaleDataSize() == 0 { + // This is a maxLevel to maxLevel compaction and we don't have any stale data. + return false + } + cd.bot = []*table{} + collectBotTables := func(t *table, needSz int64) { + totalSize := t.Size() + + j := sort.Search(len(tables), func(i int) bool { + return utils.CompareKeys(tables[i].ss.MinKey(), t.ss.MinKey()) >= 0 + }) + utils.CondPanic(tables[j].fid != t.fid, errors.New("tables[j].ID() != t.ID()")) + j++ + // Collect tables until we reach the the required size. 
+ for j < len(tables) { + newT := tables[j] + totalSize += newT.Size() + + if totalSize >= needSz { + break + } + cd.bot = append(cd.bot, newT) + cd.nextRange.extend(getKeyRange(newT)) + j++ + } + } + now := time.Now() + for _, t := range sortedTables { + if now.Sub(*t.GetCreatedAt()) < time.Hour { + // Just created it an hour ago. Don't pick for compaction. + continue + } + // If the stale data size is less than 10 MB, it might not be worth + // rewriting the table. Skip it. + if t.StaleDataSize() < 10<<20 { + continue + } + + cd.thisSize = t.Size() + cd.thisRange = getKeyRange(t) + // Set the next range as the same as the current range. If we don't do + // this, we won't be able to run more than one max level compactions. + cd.nextRange = cd.thisRange + // If we're already compacting this range, don't do anything. + if lm.compactState.overlapsWith(cd.thisLevel.levelNum, cd.thisRange) { + continue + } + + // Found a valid table! + cd.top = []*table{t} + + needFileSz := cd.t.fileSz[cd.thisLevel.levelNum] + // 如果合并的sst size需要的文件尺寸直接终止 + if t.Size() >= needFileSz { + break + } + // TableSize is less than what we want. Collect more tables for compaction. + // If the level has multiple small tables, we collect all of them + // together to form a bigger table. 
+ collectBotTables(t, needFileSz) + if !lm.compactState.compareAndAdd(thisAndNextLevelRLocked{}, *cd) { + cd.bot = cd.bot[:0] + cd.nextRange = keyRange{} + continue + } + return true + } + if len(cd.top) == 0 { + return false + } + + return lm.compactState.compareAndAdd(thisAndNextLevelRLocked{}, *cd) +} + +// fillTablesL0 先尝试从l0 到lbase的压缩,如果失败则对l0自己压缩 +func (lm *levelManager) fillTablesL0(cd *compactDef) bool { + if ok := lm.fillTablesL0ToLbase(cd); ok { + return true + } + return lm.fillTablesL0ToL0(cd) +} + +func (lm *levelManager) fillTablesL0ToLbase(cd *compactDef) bool { + if cd.nextLevel.levelNum == 0 { + utils.Panic(errors.New("base level can be zero")) + } + // 如果优先级低于1 则不执行 + if cd.p.adjusted > 0.0 && cd.p.adjusted < 1.0 { + // Do not compact to Lbase if adjusted score is less than 1.0. + return false + } + cd.lockLevels() + defer cd.unlockLevels() + + top := cd.thisLevel.tables + if len(top) == 0 { + return false + } + + var out []*table + var kr keyRange + // cd.top[0] 是最老的文件,从最老的文件开始 + for _, t := range top { + dkr := getKeyRange(t) + if kr.overlapsWith(dkr) { + out = append(out, t) + kr.extend(dkr) + } else { + // 如果有任何一个不重合的区间存在则直接终止 + break + } + } + // 获取目标range list 的全局 range 对象 + cd.thisRange = getKeyRange(out...) + cd.top = out + + left, right := cd.nextLevel.overlappingTables(levelHandlerRLocked{}, cd.thisRange) + cd.bot = make([]*table, right-left) + copy(cd.bot, cd.nextLevel.tables[left:right]) + + if len(cd.bot) == 0 { + cd.nextRange = cd.thisRange + } else { + cd.nextRange = getKeyRange(cd.bot...) + } + return lm.compactState.compareAndAdd(thisAndNextLevelRLocked{}, *cd) +} + +// fillTablesL0ToL0 l0到l0压缩 +func (lm *levelManager) fillTablesL0ToL0(cd *compactDef) bool { + if cd.compactorId != 0 { + // 只要0号压缩处理器可以执行,避免l0tol0的资源竞争 + return false + } + + cd.nextLevel = lm.levels[0] + cd.nextRange = keyRange{} + cd.bot = nil + + // TODO 这里是否会导致死锁? 
+ utils.CondPanic(cd.thisLevel.levelNum != 0, errors.New("cd.thisLevel.levelNum != 0")) + utils.CondPanic(cd.nextLevel.levelNum != 0, errors.New("cd.nextLevel.levelNum != 0")) + lm.levels[0].RLock() + defer lm.levels[0].RUnlock() + + lm.compactState.Lock() + defer lm.compactState.Unlock() + + top := cd.thisLevel.tables + var out []*table + now := time.Now() + for _, t := range top { + if t.Size() >= 2*cd.t.fileSz[0] { + // 在L0 to L0 的压缩过程中,不要对过大的sst文件压缩,这会造成性能抖动 + continue + } + if now.Sub(*t.GetCreatedAt()) < 10*time.Second { + // 如果sst的创建时间不足10s 也不要回收 + continue + } + // 如果当前的sst 已经在压缩状态 也应该忽略 + if _, beingCompacted := lm.compactState.tables[t.fid]; beingCompacted { + continue + } + out = append(out, t) + } + + if len(out) < 4 { + // 满足条件的sst小于4个那就不压缩了 + return false + } + cd.thisRange = infRange + cd.top = out + + // 在这个过程中避免任何l0到其他层的合并 + thisLevel := lm.compactState.levels[cd.thisLevel.levelNum] + thisLevel.ranges = append(thisLevel.ranges, infRange) + for _, t := range out { + lm.compactState.tables[t.fid] = struct{}{} + } + + // l0 to l0的压缩最终都会压缩为一个文件,这大大减少了l0层文件数量,减少了读放大 + cd.t.fileSz[0] = math.MaxUint32 + return true +} + +// getKeyRange 返回一组sst的区间合并后的最大与最小值 +func getKeyRange(tables ...*table) keyRange { + if len(tables) == 0 { + return keyRange{} + } + minKey := tables[0].ss.MinKey() + maxKey := tables[0].ss.MaxKey() + for i := 1; i < len(tables); i++ { + if utils.CompareKeys(tables[i].ss.MinKey(), minKey) < 0 { + minKey = tables[i].ss.MinKey() + } + if utils.CompareKeys(tables[i].ss.MaxKey(), maxKey) > 0 { + maxKey = tables[i].ss.MaxKey() + } + } + + // We pick all the versions of the smallest and the biggest key. Note that version zero would + // be the rightmost key, considering versions are default sorted in descending order. 
+ return keyRange{ + left: utils.KeyWithTs(utils.ParseKey(minKey), math.MaxUint64), + right: utils.KeyWithTs(utils.ParseKey(maxKey), 0), + } +} + +func iteratorsReversed(th []*table, opt *utils.Options) []utils.Iterator { + out := make([]utils.Iterator, 0, len(th)) + for i := len(th) - 1; i >= 0; i-- { + // This will increment the reference of the table handler. + out = append(out, th[i].NewIterator(opt)) + } + return out +} +func (lm *levelManager) updateDiscardStats(discardStats map[uint32]int64) { + select { + case *lm.lsm.option.DiscardStatsCh <- discardStats: + default: + } +} + +// 真正执行并行压缩的子压缩文件 +func (lm *levelManager) subcompact(it utils.Iterator, kr keyRange, cd compactDef, + inflightBuilders *utils.Throttle, res chan<- *table) { + var lastKey []byte + // 更新 discardStats + discardStats := make(map[uint32]int64) + defer func() { + lm.updateDiscardStats(discardStats) + }() + updateStats := func(e *utils.Entry) { + if e.Meta&utils.BitValuePointer > 0 { + var vp utils.ValuePtr + vp.Decode(e.Value) + discardStats[vp.Fid] += int64(vp.Len) + } + } + addKeys := func(builder *tableBuilder) { + var tableKr keyRange + for ; it.Valid(); it.Next() { + key := it.Item().Entry().Key + //version := utils.ParseTs(key) + isExpired := isDeletedOrExpired(0, it.Item().Entry().ExpiresAt) + if !utils.SameKey(key, lastKey) { + // 如果迭代器返回的key大于当前key的范围就不用执行了 + if len(kr.right) > 0 && utils.CompareKeys(key, kr.right) >= 0 { + break + } + if builder.ReachedCapacity() { + // 如果超过预估的sst文件大小,则直接结束 + break + } + // 把当前的key变为 lastKey + lastKey = utils.SafeCopy(lastKey, key) + //umVersions = 0 + // 如果左边界没有,则当前key给到左边界 + if len(tableKr.left) == 0 { + tableKr.left = utils.SafeCopy(tableKr.left, key) + } + // 更新右边界 + tableKr.right = lastKey + } + // TODO 这里要区分值的指针 + // 判断是否是过期内容,是的话就删除 + switch { + case isExpired: + updateStats(it.Item().Entry()) + builder.AddStaleKey(it.Item().Entry()) + default: + builder.AddKey(it.Item().Entry()) + } + } + } // End of function: addKeys + + //如果 key range 
left还存在 则seek到这里 说明遍历中途停止了 + if len(kr.left) > 0 { + it.Seek(kr.left) + } else { + // + it.Rewind() + } + for it.Valid() { + key := it.Item().Entry().Key + if len(kr.right) > 0 && utils.CompareKeys(key, kr.right) >= 0 { + break + } + // 拼装table创建的参数 + // TODO 这里可能要大改,对open table的参数复制一份opt + builder := newTableBuilerWithSSTSize(lm.opt, cd.t.fileSz[cd.nextLevel.levelNum]) + + // This would do the iteration and add keys to builder. + addKeys(builder) + + // It was true that it.Valid() at least once in the loop above, which means we + // called Add() at least once, and builder is not Empty(). + if builder.empty() { + // Cleanup builder resources: + builder.finish() + builder.Close() + continue + } + if err := inflightBuilders.Do(); err != nil { + // Can't return from here, until I decrRef all the tables that I built so far. + break + } + // 充分发挥 ssd的并行 写入特性 + go func(builder *tableBuilder) { + defer inflightBuilders.Done(nil) + defer builder.Close() + var tbl *table + newFID := atomic.AddUint64(&lm.maxFID, 1) // compact的时候是没有memtable的,这里自增maxFID即可。 + // TODO 这里的sst文件需要根据level大小变化 + sstName := utils.FileNameSSTable(lm.opt.WorkDir, newFID) + tbl = openTable(lm, sstName, builder) + if tbl == nil { + return + } + res <- tbl + }(builder) + } +} + +// checkOverlap 检查是否与下一层存在重合 +func (lm *levelManager) checkOverlap(tables []*table, lev int) bool { + kr := getKeyRange(tables...) + for i, lh := range lm.levels { + if i < lev { // Skip upper levels. 
+ continue + } + lh.RLock() + left, right := lh.overlappingTables(levelHandlerRLocked{}, kr) + lh.RUnlock() + if right-left > 0 { + return true + } + } + return false +} + +// 判断是否过期 是可删除 +func isDeletedOrExpired(meta byte, expiresAt uint64) bool { + if expiresAt == 0 { + return false + } + return expiresAt <= uint64(time.Now().Unix()) +} + +// compactStatus +type compactStatus struct { + sync.RWMutex + levels []*levelCompactStatus + tables map[uint64]struct{} +} + +func (lsm *LSM) newCompactStatus() *compactStatus { + cs := &compactStatus{ + levels: make([]*levelCompactStatus, 0), + tables: make(map[uint64]struct{}), + } + for i := 0; i < lsm.option.MaxLevelNum; i++ { + cs.levels = append(cs.levels, &levelCompactStatus{}) + } + return cs +} + +func (cs *compactStatus) overlapsWith(level int, this keyRange) bool { + cs.RLock() + defer cs.RUnlock() + + thisLevel := cs.levels[level] + return thisLevel.overlapsWith(this) +} + +func (cs *compactStatus) delSize(l int) int64 { + cs.RLock() + defer cs.RUnlock() + return cs.levels[l].delSize +} + +func (cs *compactStatus) delete(cd compactDef) { + cs.Lock() + defer cs.Unlock() + + tl := cd.thisLevel.levelNum + + thisLevel := cs.levels[cd.thisLevel.levelNum] + nextLevel := cs.levels[cd.nextLevel.levelNum] + + thisLevel.delSize -= cd.thisSize + found := thisLevel.remove(cd.thisRange) + // The following check makes sense only if we're compacting more than one + // table. In case of the max level, we might rewrite a single table to + // remove stale data. 
+ if cd.thisLevel != cd.nextLevel && !cd.nextRange.isEmpty() { + found = nextLevel.remove(cd.nextRange) && found + } + + if !found { + this := cd.thisRange + next := cd.nextRange + fmt.Printf("Looking for: %s in this level %d.\n", this, tl) + fmt.Printf("This Level:\n%s\n", thisLevel.debug()) + fmt.Println() + fmt.Printf("Looking for: %s in next level %d.\n", next, cd.nextLevel.levelNum) + fmt.Printf("Next Level:\n%s\n", nextLevel.debug()) + log.Fatal("keyRange not found") + } + for _, t := range append(cd.top, cd.bot...) { + _, ok := cs.tables[t.fid] + utils.CondPanic(!ok, fmt.Errorf("cs.tables is nil")) + delete(cs.tables, t.fid) + } +} + +func (cs *compactStatus) compareAndAdd(_ thisAndNextLevelRLocked, cd compactDef) bool { + cs.Lock() + defer cs.Unlock() + + tl := cd.thisLevel.levelNum + utils.CondPanic(tl >= len(cs.levels), fmt.Errorf("Got level %d. Max levels: %d", tl, len(cs.levels))) + thisLevel := cs.levels[cd.thisLevel.levelNum] + nextLevel := cs.levels[cd.nextLevel.levelNum] + + if thisLevel.overlapsWith(cd.thisRange) { + return false + } + if nextLevel.overlapsWith(cd.nextRange) { + return false + } + // Check whether this level really needs compaction or not. Otherwise, we'll end up + // running parallel compactions for the same level. + // Update: We should not be checking size here. Compaction priority already did the size checks. + // Here we should just be executing the wish of others. + + thisLevel.ranges = append(thisLevel.ranges, cd.thisRange) + nextLevel.ranges = append(nextLevel.ranges, cd.nextRange) + thisLevel.delSize += cd.thisSize + for _, t := range append(cd.top, cd.bot...) 
{ + cs.tables[t.fid] = struct{}{} + } + return true +} + +// levelCompactStatus +type levelCompactStatus struct { + ranges []keyRange + delSize int64 +} + +func (lcs *levelCompactStatus) overlapsWith(dst keyRange) bool { + for _, r := range lcs.ranges { + if r.overlapsWith(dst) { + return true + } + } + return false +} +func (lcs *levelCompactStatus) remove(dst keyRange) bool { + final := lcs.ranges[:0] + var found bool + for _, r := range lcs.ranges { + if !r.equals(dst) { + final = append(final, r) + } else { + found = true + } + } + lcs.ranges = final + return found +} + +func (lcs *levelCompactStatus) debug() string { + var b bytes.Buffer + for _, r := range lcs.ranges { + b.WriteString(r.String()) + } + return b.String() +} + +// keyRange +type keyRange struct { + left []byte + right []byte + inf bool + size int64 // size is used for Key splits. +} + +func (r keyRange) isEmpty() bool { + return len(r.left) == 0 && len(r.right) == 0 && !r.inf +} + +var infRange = keyRange{inf: true} + +func (r keyRange) String() string { + return fmt.Sprintf("[left=%x, right=%x, inf=%v]", r.left, r.right, r.inf) +} + +func (r keyRange) equals(dst keyRange) bool { + return bytes.Equal(r.left, dst.left) && + bytes.Equal(r.right, dst.right) && + r.inf == dst.inf +} + +func (r *keyRange) extend(kr keyRange) { + // TODO(ibrahim): Is this needed? + if kr.isEmpty() { + return + } + if r.isEmpty() { + *r = kr + } + if len(r.left) == 0 || utils.CompareKeys(kr.left, r.left) < 0 { + r.left = kr.left + } + if len(r.right) == 0 || utils.CompareKeys(kr.right, r.right) > 0 { + r.right = kr.right + } + if kr.inf { + r.inf = true + } +} + +func (r keyRange) overlapsWith(dst keyRange) bool { + // Empty keyRange always overlaps. + if r.isEmpty() { + return true + } + // TODO(ibrahim): Do you need this? + // Empty dst doesn't overlap with anything. + if dst.isEmpty() { + return false + } + if r.inf || dst.inf { + return true + } + + // [dst.left, dst.right] ... 
[r.left, r.right] + // If my left is greater than dst right, we have no overlap. + if utils.CompareKeys(r.left, dst.right) > 0 { + return false + } + // [r.left, r.right] ... [dst.left, dst.right] + // If my right is less than dst left, we have no overlap. + if utils.CompareKeys(r.right, dst.left) < 0 { + return false + } + // We have overlap. + return true +} diff --git a/lsm/iterator.go b/lsm/iterator.go index e6010ec..dcb515a 100644 --- a/lsm/iterator.go +++ b/lsm/iterator.go @@ -1,32 +1,49 @@ +// Copyright 2021 logicrec Project Authors +// +// Licensed under the Apache License, Version 2.0 (the "License") +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ package lsm import ( - "github.com/hardcore-os/corekv/iterator" - "github.com/hardcore-os/corekv/utils/codec" + "bytes" + "fmt" + "sort" + + "github.com/hardcore-os/corekv/utils" ) type Iterator struct { - it iterator.Item - iters []iterator.Iterator + it Item + iters []utils.Iterator } type Item struct { - e *codec.Entry + e *utils.Entry } -func (it *Item) Entry() *codec.Entry { +func (it *Item) Entry() *utils.Entry { return it.e } // 创建迭代器 -func (lsm *LSM) NewIterator(opt *iterator.Options) iterator.Iterator { +func (lsm *LSM) NewIterators(opt *utils.Options) []utils.Iterator { iter := &Iterator{} - iter.iters = make([]iterator.Iterator, 0) + iter.iters = make([]utils.Iterator, 0) iter.iters = append(iter.iters, lsm.memTable.NewIterator(opt)) for _, imm := range lsm.immutables { iter.iters = append(iter.iters, imm.NewIterator(opt)) } - iter.iters = append(iter.iters, lsm.levels.NewIterator(opt)) - return iter + iter.iters = append(iter.iters, lsm.levels.iterators()...) + return iter.iters } func (iter *Iterator) Next() { iter.iters[0].Next() @@ -37,19 +54,22 @@ func (iter *Iterator) Valid() bool { func (iter *Iterator) Rewind() { iter.iters[0].Rewind() } -func (iter *Iterator) Item() iterator.Item { +func (iter *Iterator) Item() utils.Item { return iter.iters[0].Item() } func (iter *Iterator) Close() error { return nil } +func (iter *Iterator) Seek(key []byte) { +} + // 内存表迭代器 type memIterator struct { - innerIter iterator.Iterator + innerIter utils.Iterator } -func (m *memTable) NewIterator(opt *iterator.Options) iterator.Iterator { +func (m *memTable) NewIterator(opt *utils.Options) utils.Iterator { return &memIterator{innerIter: m.sl.NewSkipListIterator()} } func (iter *memIterator) Next() { @@ -61,21 +81,23 @@ func (iter *memIterator) Valid() bool { func (iter *memIterator) Rewind() { iter.innerIter.Rewind() } -func (iter *memIterator) Item() iterator.Item { +func (iter *memIterator) Item() utils.Item { return iter.innerIter.Item() } func (iter 
*memIterator) Close() error { return iter.innerIter.Close() } +func (iter *memIterator) Seek(key []byte) { +} // levelManager上的迭代器 type levelIterator struct { - it *iterator.Item + it *utils.Item iters []*Iterator } -func (lm *levelManager) NewIterator(options *iterator.Options) iterator.Iterator { - return &levelIterator{} +func (lm *levelManager) NewIterators(options *utils.Options) []utils.Iterator { + return lm.iterators() } func (iter *levelIterator) Next() { } @@ -85,9 +107,333 @@ func (iter *levelIterator) Valid() bool { func (iter *levelIterator) Rewind() { } -func (iter *levelIterator) Item() iterator.Item { +func (iter *levelIterator) Item() utils.Item { return &Item{} } func (iter *levelIterator) Close() error { return nil } + +func (iter *levelIterator) Seek(key []byte) { +} + +// ConcatIterator 将table 数组链接成一个迭代器,这样迭代效率更高 +type ConcatIterator struct { + idx int // Which iterator is active now. + cur utils.Iterator + iters []utils.Iterator // Corresponds to tables. + tables []*table // Disregarding reversed, this is in ascending order. + options *utils.Options // Valid options are REVERSED and NOCACHE. +} + +// NewConcatIterator creates a new concatenated iterator +func NewConcatIterator(tbls []*table, opt *utils.Options) *ConcatIterator { + iters := make([]utils.Iterator, len(tbls)) + return &ConcatIterator{ + options: opt, + iters: iters, + tables: tbls, + idx: -1, // Not really necessary because s.it.Valid()=false, but good to have. 
+ } +} + +func (s *ConcatIterator) setIdx(idx int) { + s.idx = idx + if idx < 0 || idx >= len(s.iters) { + s.cur = nil + return + } + if s.iters[idx] == nil { + s.iters[idx] = s.tables[idx].NewIterator(s.options) + } + s.cur = s.iters[s.idx] +} + +// Rewind implements Interface +func (s *ConcatIterator) Rewind() { + if len(s.iters) == 0 { + return + } + if !s.options.IsAsc { + s.setIdx(0) + } else { + s.setIdx(len(s.iters) - 1) + } + s.cur.Rewind() +} + +// Valid implements y.Interface +func (s *ConcatIterator) Valid() bool { + return s.cur != nil && s.cur.Valid() +} + +// Item _ +func (s *ConcatIterator) Item() utils.Item { + return s.cur.Item() +} + +// Seek brings us to element >= key if reversed is false. Otherwise, <= key. +func (s *ConcatIterator) Seek(key []byte) { + var idx int + if s.options.IsAsc { + idx = sort.Search(len(s.tables), func(i int) bool { + return utils.CompareKeys(s.tables[i].ss.MaxKey(), key) >= 0 + }) + } else { + n := len(s.tables) + idx = n - 1 - sort.Search(n, func(i int) bool { + return utils.CompareKeys(s.tables[n-1-i].ss.MinKey(), key) <= 0 + }) + } + if idx >= len(s.tables) || idx < 0 { + s.setIdx(-1) + return + } + // For reversed=false, we know s.tables[i-1].Biggest() < key. Thus, the + // previous table cannot possibly contain key. + s.setIdx(idx) + s.cur.Seek(key) +} + +// Next advances our concat iterator. +func (s *ConcatIterator) Next() { + s.cur.Next() + if s.cur.Valid() { + // Nothing to do. Just stay with the current table. + return + } + for { // In case there are empty tables. + if !s.options.IsAsc { + s.setIdx(s.idx + 1) + } else { + s.setIdx(s.idx - 1) + } + if s.cur == nil { + // End of list. Valid will become false. + return + } + s.cur.Rewind() + if s.cur.Valid() { + break + } + } +} + +// Close implements y.Interface. 
+func (s *ConcatIterator) Close() error { + for _, it := range s.iters { + if it == nil { + continue + } + if err := it.Close(); err != nil { + return fmt.Errorf("ConcatIterator:%+v", err) + } + } + return nil +} + +// MergeIterator 多路合并迭代器 +// NOTE: MergeIterator owns the array of iterators and is responsible for closing them. +type MergeIterator struct { + left node + right node + small *node + + curKey []byte + reverse bool +} + +type node struct { + valid bool + entry *utils.Entry + iter utils.Iterator + + // The two iterators are type asserted from `y.Iterator`, used to inline more function calls. + // Calling functions on concrete types is much faster (about 25-30%) than calling the + // interface's function. + merge *MergeIterator + concat *ConcatIterator +} + +func (n *node) setIterator(iter utils.Iterator) { + n.iter = iter + // It's okay if the type assertion below fails and n.merge/n.concat are set to nil. + // We handle the nil values of merge and concat in all the methods. + n.merge, _ = iter.(*MergeIterator) + n.concat, _ = iter.(*ConcatIterator) +} + +func (n *node) setKey() { + switch { + case n.merge != nil: + n.valid = n.merge.small.valid + if n.valid { + n.entry = n.merge.small.entry + } + case n.concat != nil: + n.valid = n.concat.Valid() + if n.valid { + n.entry = n.concat.Item().Entry() + } + default: + n.valid = n.iter.Valid() + if n.valid { + n.entry = n.iter.Item().Entry() + } + } +} + +func (n *node) next() { + switch { + case n.merge != nil: + n.merge.Next() + case n.concat != nil: + n.concat.Next() + default: + n.iter.Next() + } + n.setKey() +} + +func (n *node) rewind() { + n.iter.Rewind() + n.setKey() +} + +func (n *node) seek(key []byte) { + n.iter.Seek(key) + n.setKey() +} + +func (mi *MergeIterator) fix() { + if !mi.bigger().valid { + return + } + if !mi.small.valid { + mi.swapSmall() + return + } + cmp := utils.CompareKeys(mi.small.entry.Key, mi.bigger().entry.Key) + switch { + case cmp == 0: // Both the keys are equal. 
+ // In case of same keys, move the right iterator ahead. + mi.right.next() + if &mi.right == mi.small { + mi.swapSmall() + } + return + case cmp < 0: // Small is less than bigger(). + if mi.reverse { + mi.swapSmall() + } else { + // we don't need to do anything. Small already points to the smallest. + } + return + default: // bigger() is less than small. + if mi.reverse { + // Do nothing since we're iterating in reverse. Small currently points to + // the bigger key and that's okay in reverse iteration. + } else { + mi.swapSmall() + } + return + } +} + +func (mi *MergeIterator) bigger() *node { + if mi.small == &mi.left { + return &mi.right + } + return &mi.left +} + +func (mi *MergeIterator) swapSmall() { + if mi.small == &mi.left { + mi.small = &mi.right + return + } + if mi.small == &mi.right { + mi.small = &mi.left + return + } +} + +// Next returns the next element. If it is the same as the current key, ignore it. +func (mi *MergeIterator) Next() { + for mi.Valid() { + if !bytes.Equal(mi.small.entry.Key, mi.curKey) { + break + } + mi.small.next() + mi.fix() + } + mi.setCurrent() +} + +func (mi *MergeIterator) setCurrent() { + utils.CondPanic(mi.small.entry == nil && mi.small.valid == true, fmt.Errorf("mi.small.entry is nil")) + if mi.small.valid { + mi.curKey = append(mi.curKey[:0], mi.small.entry.Key...) + } +} + +// Rewind seeks to first element (or last element for reverse iterator). +func (mi *MergeIterator) Rewind() { + mi.left.rewind() + mi.right.rewind() + mi.fix() + mi.setCurrent() +} + +// Seek brings us to element with key >= given key. +func (mi *MergeIterator) Seek(key []byte) { + mi.left.seek(key) + mi.right.seek(key) + mi.fix() + mi.setCurrent() +} + +// Valid returns whether the MergeIterator is at a valid element. +func (mi *MergeIterator) Valid() bool { + return mi.small.valid +} + +// Key returns the key associated with the current iterator. 
+func (mi *MergeIterator) Item() utils.Item { + return mi.small.iter.Item() +} + +// Close implements Iterator. +func (mi *MergeIterator) Close() error { + err1 := mi.left.iter.Close() + err2 := mi.right.iter.Close() + if err1 != nil { + return utils.WarpErr("MergeIterator", err1) + } + return utils.WarpErr("MergeIterator", err2) +} + +// NewMergeIterator creates a merge iterator. +func NewMergeIterator(iters []utils.Iterator, reverse bool) utils.Iterator { + switch len(iters) { + case 0: + return &Iterator{} + case 1: + return iters[0] + case 2: + mi := &MergeIterator{ + reverse: reverse, + } + mi.left.setIterator(iters[0]) + mi.right.setIterator(iters[1]) + // Assign left iterator randomly. This will be fixed when user calls rewind/seek. + mi.small = &mi.left + return mi + } + mid := len(iters) / 2 + return NewMergeIterator( + []utils.Iterator{ + NewMergeIterator(iters[:mid], reverse), + NewMergeIterator(iters[mid:], reverse), + }, reverse) +} diff --git a/lsm/levels.go b/lsm/levels.go index 9125ee7..4f8bf41 100644 --- a/lsm/levels.go +++ b/lsm/levels.go @@ -1,41 +1,43 @@ package lsm import ( + "bytes" + "sort" + "sync" + "sync/atomic" + "github.com/hardcore-os/corekv/file" "github.com/hardcore-os/corekv/utils" - "github.com/hardcore-os/corekv/utils/codec" ) -type levelManager struct { - opt *Options - cache *cache - manifest *file.Manifest - levels []*levelHandler -} - -type levelHandler struct { - levelNum int - tables []*table +// initLevelManager 初始化函数 +func (lsm *LSM) initLevelManager(opt *Options) *levelManager { + lm := &levelManager{lsm: lsm} // 反引用 + lm.compactState = lsm.newCompactStatus() + lm.opt = opt + // 读取manifest文件构建管理器 + if err := lm.loadManifest(); err != nil { + panic(err) + } + lm.build() + return lm } -func (lh *levelHandler) close() error { - return nil +type levelManager struct { + maxFID uint64 // 已经分配出去的最大fid,只要创建了memtable 就算已分配 + opt *Options + cache *cache + manifestFile *file.ManifestFile + levels []*levelHandler + lsm *LSM + 
compactState *compactStatus } -func (lh *levelHandler) Get(key []byte) (*codec.Entry, error) { - // 如果是第0层文件则进行特殊处理 - if lh.levelNum == 0 { - // logic... - } else { - // logic... - } - return nil, nil -} func (lm *levelManager) close() error { if err := lm.cache.close(); err != nil { return err } - if err := lm.manifest.Close(); err != nil { + if err := lm.manifestFile.Close(); err != nil { return err } for i := range lm.levels { @@ -46,51 +48,18 @@ func (lm *levelManager) close() error { return nil } -func newLevelManager(opt *Options) *levelManager { - lm := &levelManager{} - lm.opt = opt - // 读取manifest文件构建管理器 - lm.loadManifest() - lm.build() - return lm -} -func (lm *levelManager) loadCache() { - lm.cache = newCache(lm.opt) - // 添加 idx cache +func (lm *levelManager) iterators() []utils.Iterator { + + itrs := make([]utils.Iterator, 0, len(lm.levels)) for _, level := range lm.levels { - for _, table := range level.tables { - lm.cache.addIndex(table.ss.FID(), table) - } - } -} -func (lm *levelManager) loadManifest() { - lm.manifest = file.OpenManifest(&file.Options{Name: "manifest", Dir: lm.opt.WorkDir}) -} -func (lm *levelManager) build() { - // 如果manifest文件是空的 则进行初始化 - lm.levels = make([]*levelHandler, utils.MaxLevelNum) - tables := lm.manifest.Tables() - for num := 0; num < utils.MaxLevelNum; num++ { - lm.levels[num] = &levelHandler{levelNum: num} - lm.levels[num].tables = make([]*table, len(tables[num])) - for i := range tables[num] { - lm.levels[num].tables[i] = openTable(lm.opt, tables[num][i]) - } + itrs = append(itrs, level.iterators()...) 
} - // 逐一加载sstable 的index block 构建cache - lm.loadCache() + return itrs } -// 向L0层flush一个sstable -func (lm *levelManager) flush(immutable *memTable) error { - // flush 跳表中的数据转化为sst文件 - // 删除wal文件并创建一个新的wal文件 - return nil -} - -func (lm *levelManager) Get(key []byte) (*codec.Entry, error) { +func (lm *levelManager) Get(key []byte) (*utils.Entry, error) { var ( - entry *codec.Entry + entry *utils.Entry err error ) // L0层查询 @@ -98,11 +67,290 @@ func (lm *levelManager) Get(key []byte) (*codec.Entry, error) { return entry, err } // L1-7层查询 - for level := 1; level < utils.MaxLevelNum; level++ { + for level := 1; level < lm.opt.MaxLevelNum; level++ { ld := lm.levels[level] if entry, err = ld.Get(key); entry != nil { return entry, err } } - return entry, nil + return entry, utils.ErrKeyNotFound +} + +func (lm *levelManager) loadCache() { + +} +func (lm *levelManager) loadManifest() (err error) { + lm.manifestFile, err = file.OpenManifestFile(&file.Options{Dir: lm.opt.WorkDir}) + return err +} +func (lm *levelManager) build() error { + lm.levels = make([]*levelHandler, 0, lm.opt.MaxLevelNum) + for i := 0; i < lm.opt.MaxLevelNum; i++ { + lm.levels = append(lm.levels, &levelHandler{ + levelNum: i, + tables: make([]*table, 0), + lm: lm, + }) + } + + manifest := lm.manifestFile.GetManifest() + // 对比manifest 文件的正确性 + if err := lm.manifestFile.RevertToManifest(utils.LoadIDMap(lm.opt.WorkDir)); err != nil { + return err + } + // 逐一加载sstable 的index block 构建cache + lm.cache = newCache(lm.opt) + // TODO 初始化的时候index 结构放在了table中,相当于全部加载到了内存,减少了一次读磁盘,但增加了内存消耗 + var maxFID uint64 + for fID, tableInfo := range manifest.Tables { + fileName := utils.FileNameSSTable(lm.opt.WorkDir, fID) + if fID > maxFID { + maxFID = fID + } + t := openTable(lm, fileName, nil) + lm.levels[tableInfo.Level].add(t) + lm.levels[tableInfo.Level].addSize(t) // 记录一个level的文件总大小 + } + // 对每一层进行排序 + for i := 0; i < lm.opt.MaxLevelNum; i++ { + lm.levels[i].Sort() + } + // 得到最大的fid值 + atomic.AddUint64(&lm.maxFID, maxFID) 
+ return nil +} + +// 向L0层flush一个sstable +func (lm *levelManager) flush(immutable *memTable) (err error) { + // 分配一个fid + fid := immutable.wal.Fid() + sstName := utils.FileNameSSTable(lm.opt.WorkDir, fid) + + // 构建一个 builder + builder := newTableBuiler(lm.opt) + iter := immutable.sl.NewSkipListIterator() + for iter.Rewind(); iter.Valid(); iter.Next() { + entry := iter.Item().Entry() + builder.add(entry, false) + } + // 创建一个 table 对象 + table := openTable(lm, sstName, builder) + err = lm.manifestFile.AddTableMeta(0, &file.TableMeta{ + ID: fid, + Checksum: []byte{'m', 'o', 'c', 'k'}, + }) + // manifest写入失败直接panic + utils.Panic(err) + // 更新manifest文件 + lm.levels[0].add(table) + return +} + +//--------- level处理器 ------- +type levelHandler struct { + sync.RWMutex + levelNum int + tables []*table + totalSize int64 + totalStaleSize int64 + lm *levelManager +} + +func (lh *levelHandler) close() error { + for i := range lh.tables { + if err := lh.tables[i].ss.Close(); err != nil { + return err + } + } + return nil +} +func (lh *levelHandler) add(t *table) { + lh.Lock() + defer lh.Unlock() + lh.tables = append(lh.tables, t) +} +func (lh *levelHandler) addBatch(ts []*table) { + lh.Lock() + defer lh.Unlock() + lh.tables = append(lh.tables, ts...) +} + +func (lh *levelHandler) getTotalSize() int64 { + lh.RLock() + defer lh.RUnlock() + return lh.totalSize +} + +func (lh *levelHandler) addSize(t *table) { + lh.totalSize += t.Size() + lh.totalStaleSize += int64(t.StaleDataSize()) +} + +func (lh *levelHandler) subtractSize(t *table) { + lh.totalSize -= t.Size() + lh.totalStaleSize -= int64(t.StaleDataSize()) +} + +func (lh *levelHandler) numTables() int { + lh.RLock() + defer lh.RUnlock() + return len(lh.tables) +} + +func (lh *levelHandler) Get(key []byte) (*utils.Entry, error) { + // 如果是第0层文件则进行特殊处理 + if lh.levelNum == 0 { + // TODO: logic... + // 获取可能存在key的sst + return lh.searchL0SST(key) + } else { + // TODO: logic... 
+		return lh.searchLNSST(key)
+	}
+}
+
+func (lh *levelHandler) Sort() {
+	lh.Lock()
+	defer lh.Unlock()
+	if lh.levelNum == 0 {
+		// Key range will overlap. Just sort by fileID in ascending order
+		// because newer tables are at the end of level 0.
+		sort.Slice(lh.tables, func(i, j int) bool {
+			return lh.tables[i].fid < lh.tables[j].fid
+		})
+	} else {
+		// Sort tables by keys.
+		sort.Slice(lh.tables, func(i, j int) bool {
+			return utils.CompareKeys(lh.tables[i].ss.MinKey(), lh.tables[j].ss.MinKey()) < 0
+		})
+	}
+}
+
+func (lh *levelHandler) searchL0SST(key []byte) (*utils.Entry, error) {
+	var version uint64
+	for i := len(lh.tables) - 1; i >= 0; i-- { // newest tables sit at the end of L0 (see Sort): search newest-first
+		if entry, err := lh.tables[i].Serach(key, &version); err == nil {
+			return entry, nil
+		}
+	}
+	return nil, utils.ErrKeyNotFound
+}
+func (lh *levelHandler) searchLNSST(key []byte) (*utils.Entry, error) {
+	table := lh.getTable(key)
+	var version uint64
+	if table == nil {
+		return nil, utils.ErrKeyNotFound
+	}
+	if entry, err := table.Serach(key, &version); err == nil {
+		return entry, nil
+	}
+	return nil, utils.ErrKeyNotFound
+}
+func (lh *levelHandler) getTable(key []byte) *table {
+	for i := len(lh.tables) - 1; i >= 0; i-- {
+		if bytes.Compare(key, lh.tables[i].ss.MinKey()) > -1 &&
+			bytes.Compare(key, lh.tables[i].ss.MaxKey()) < 1 {
+			return lh.tables[i]
+		}
+	}
+	return nil
+}
+func (lh *levelHandler) isLastLevel() bool {
+	return lh.levelNum == lh.lm.opt.MaxLevelNum-1
+}
+
+type levelHandlerRLocked struct{}
+
+// overlappingTables returns the tables that intersect with key range. Returns a half-interval.
+// This function should already have acquired a read lock, and this is so important the caller must
+// pass an empty parameter declaring such.
+func (lh *levelHandler) overlappingTables(_ levelHandlerRLocked, kr keyRange) (int, int) {
+	if len(kr.left) == 0 || len(kr.right) == 0 {
+		return 0, 0
+	}
+	left := sort.Search(len(lh.tables), func(i int) bool {
+		return utils.CompareKeys(kr.left, lh.tables[i].ss.MaxKey()) <= 0
+	})
+	right := sort.Search(len(lh.tables), func(i int) bool {
+		return utils.CompareKeys(kr.right, lh.tables[i].ss.MaxKey()) < 0
+	})
+	return left, right
+}
+
+// replaceTables will replace tables[left:right] with newTables. Note this EXCLUDES tables[right].
+// You must call decr() to delete the old tables _after_ writing the update to the manifest.
+func (lh *levelHandler) replaceTables(toDel, toAdd []*table) error {
+	// Need to re-search the range of tables in this level to be replaced as other goroutines might
+	// be changing it as well. (They can't touch our tables, but if they add/remove other tables,
+	// the indices get shifted around.)
+	lh.Lock() // We s.Unlock() below.
+
+	toDelMap := make(map[uint64]struct{})
+	for _, t := range toDel {
+		toDelMap[t.fid] = struct{}{}
+	}
+	var newTables []*table
+	for _, t := range lh.tables {
+		_, found := toDelMap[t.fid]
+		if !found {
+			newTables = append(newTables, t)
+			continue
+		}
+		lh.subtractSize(t)
+	}
+
+	// Increase totalSize first.
+	for _, t := range toAdd {
+		lh.addSize(t)
+		t.IncrRef()
+		newTables = append(newTables, t)
+	}
+
+	// Assign tables.
+	lh.tables = newTables
+	sort.Slice(lh.tables, func(i, j int) bool {
+		return utils.CompareKeys(lh.tables[i].ss.MinKey(), lh.tables[j].ss.MinKey()) < 0 // compare i vs j, not i vs i
+	})
+	lh.Unlock() // s.Unlock before we DecrRef tables -- that can be slow.
+	return decrRefs(toDel)
+}
+
+// deleteTables remove tables idx0, ..., idx1-1.
+func (lh *levelHandler) deleteTables(toDel []*table) error {
+	lh.Lock() // s.Unlock() below
+
+	toDelMap := make(map[uint64]struct{})
+	for _, t := range toDel {
+		toDelMap[t.fid] = struct{}{}
+	}
+
+	// Make a copy as iterators might be keeping a slice of tables.
+ var newTables []*table + for _, t := range lh.tables { + _, found := toDelMap[t.fid] + if !found { + newTables = append(newTables, t) + continue + } + lh.subtractSize(t) + } + lh.tables = newTables + + lh.Unlock() // Unlock s _before_ we DecrRef our tables, which can be slow. + + return decrRefs(toDel) +} + +func (lh *levelHandler) iterators() []utils.Iterator { + lh.RLock() + defer lh.RUnlock() + topt := &utils.Options{IsAsc: true} + if lh.levelNum == 0 { + return iteratorsReversed(lh.tables, topt) + } + + if len(lh.tables) == 0 { + return nil + } + return []utils.Iterator{NewConcatIterator(lh.tables, topt)} } diff --git a/lsm/lsm.go b/lsm/lsm.go index 7cec49b..ba13ecc 100644 --- a/lsm/lsm.go +++ b/lsm/lsm.go @@ -2,27 +2,50 @@ package lsm import ( "github.com/hardcore-os/corekv/utils" - "github.com/hardcore-os/corekv/utils/codec" ) +// LSM _ type LSM struct { memTable *memTable immutables []*memTable levels *levelManager option *Options closer *utils.Closer + maxMemFID uint32 } -//Options +//Options _ type Options struct { WorkDir string MemTableSize int64 + SSTableMaxSz int64 + // BlockSize is the size of each block inside SSTable in bytes. + BlockSize int + // BloomFalsePositive is the false positive probabiltiy of bloom filter. 
+ BloomFalsePositive float64 + + // compact + NumCompactors int + BaseLevelSize int64 + LevelSizeMultiplier int // 决定level之间期望的size比例 + TableSizeMultiplier int + BaseTableSize int64 + NumLevelZeroTables int + MaxLevelNum int + + DiscardStatsCh *chan map[uint32]int64 } -// 关闭lsm +// Close _ func (lsm *LSM) Close() error { - if err := lsm.memTable.close(); err != nil { - return err + // 等待全部合并过程的结束 + // 等待全部api调用过程结束 + lsm.closer.Close() + // TODO 需要加锁保证并发安全 + if lsm.memTable != nil { + if err := lsm.memTable.close(); err != nil { + return err + } } for i := range lsm.immutables { if err := lsm.immutables[i].close(); err != nil { @@ -32,70 +55,102 @@ func (lsm *LSM) Close() error { if err := lsm.levels.close(); err != nil { return err } - // 等待合并过程的结束 - lsm.closer.Close() return nil } -// NewLSM +// NewLSM _ func NewLSM(opt *Options) *LSM { lsm := &LSM{option: opt} - // 启动DB恢复过程加载wal,如果没有恢复内容则创建新的内存表 - lsm.memTable, lsm.immutables = recovery(opt) // 初始化levelManager - lsm.levels = newLevelManager(opt) + lsm.levels = lsm.initLevelManager(opt) + // 启动DB恢复过程加载wal,如果没有恢复内容则创建新的内存表 + lsm.memTable, lsm.immutables = lsm.recovery() // 初始化closer 用于资源回收的信号控制 - lsm.closer = utils.NewCloser(1) + lsm.closer = utils.NewCloser() return lsm } -// StartMerge -func (lsm *LSM) StartMerge() { - defer lsm.closer.Done() - for { - select { - case <-lsm.closer.Wait(): - } - // 处理并发的合并过程 +// StartCompacter _ +func (lsm *LSM) StartCompacter() { + n := lsm.option.NumCompactors + lsm.closer.Add(n) + for i := 0; i < n; i++ { + go lsm.levels.runCompacter(i) } } -func (lsm *LSM) Set(entry *codec.Entry) (err error) { +// Set _ +func (lsm *LSM) Set(entry *utils.Entry) (err error) { + if entry == nil || len(entry.Key) == 0 { + return utils.ErrEmptyKey + } + // 优雅关闭 + lsm.closer.Add(1) + defer lsm.closer.Done() // 检查当前memtable是否写满,是的话创建新的memtable,并将当前内存表写到immutables中 // 否则写入当前memtable中 - if lsm.memTable.Size() > lsm.option.MemTableSize { - lsm.immutables = append(lsm.immutables, lsm.memTable) - if 
lsm.memTable, err = NewMemtable(); err != nil { - return err - } + if int64(lsm.memTable.wal.Size())+ + int64(utils.EstimateWalCodecSize(entry)) > lsm.option.MemTableSize { + lsm.Rotate() } - if err := lsm.memTable.set(entry); err != nil { + if err = lsm.memTable.set(entry); err != nil { return err } // 检查是否存在immutable需要刷盘, for _, immutable := range lsm.immutables { - if err := lsm.levels.flush(immutable); err != nil { + if err = lsm.levels.flush(immutable); err != nil { return err } + // TODO 这里问题很大,应该是用引用计数的方式回收 + err = immutable.close() + utils.Panic(err) } - return nil + if len(lsm.immutables) != 0 { + // TODO 将lsm的immutables队列置空,这里可以优化一下节省内存空间,还可以限制一下immut table的大小为固定值 + lsm.immutables = make([]*memTable, 0) + } + return err } -func (lsm *LSM) Get(key []byte) (*codec.Entry, error) { +// Get _ +func (lsm *LSM) Get(key []byte) (*utils.Entry, error) { + if len(key) == 0 { + return nil, utils.ErrEmptyKey + } + lsm.closer.Add(1) + defer lsm.closer.Done() var ( - entry *codec.Entry + entry *utils.Entry err error ) // 从内存表中查询,先查活跃表,在查不变表 - if entry, err = lsm.memTable.Get(key); entry != nil { + if entry, err = lsm.memTable.Get(key); entry != nil && entry.Value != nil { return entry, err } - for _, imm := range lsm.immutables { - if entry, err = imm.Get(key); entry != nil { + + for i := len(lsm.immutables) - 1; i >= 0; i-- { + if entry, err = lsm.immutables[i].Get(key); entry != nil && entry.Value != nil { return entry, err } } // 从level manger查询 return lsm.levels.Get(key) } + +func (lsm *LSM) MemSize() int64 { + return lsm.memTable.Size() +} + +func (lsm *LSM) MemTableIsNil() bool { + return lsm.memTable == nil +} + +func (lsm *LSM) GetSkipListFromMemTable() *utils.Skiplist { + return lsm.memTable.sl +} + +func (lsm *LSM) Rotate() { + lsm.immutables = append(lsm.immutables, lsm.memTable) + lsm.memTable = lsm.NewMemtable() +} diff --git a/lsm/lsm_test.go b/lsm/lsm_test.go index ed5eb1b..4f4c5e0 100644 --- a/lsm/lsm_test.go +++ b/lsm/lsm_test.go @@ -1,56 +1,344 @@ +// 
Copyright 2021 hardcore-os Project Authors +// +// Licensed under the Apache License, Version 2.0 (the "License") +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + package lsm import ( + "bytes" + "fmt" + "os" "testing" + "time" - "github.com/hardcore-os/corekv/file" "github.com/hardcore-os/corekv/utils" - "github.com/hardcore-os/corekv/utils/codec" - "github.com/stretchr/testify/assert" ) -// 对level 管理器的功能测试 -func TestLevels(t *testing.T) { - entrys := []*codec.Entry{ - {Key: []byte("hello0"), Value: []byte("world0"), ExpiresAt: uint64(0)}, - {Key: []byte("hello1"), Value: []byte("world1"), ExpiresAt: uint64(0)}, - {Key: []byte("hello2"), Value: []byte("world2"), ExpiresAt: uint64(0)}, - {Key: []byte("hello3"), Value: []byte("world3"), ExpiresAt: uint64(0)}, - {Key: []byte("hello4"), Value: []byte("world4"), ExpiresAt: uint64(0)}, - {Key: []byte("hello5"), Value: []byte("world5"), ExpiresAt: uint64(0)}, - {Key: []byte("hello6"), Value: []byte("world6"), ExpiresAt: uint64(0)}, - {Key: []byte("hello7"), Value: []byte("world"), ExpiresAt: uint64(0)}, - } +var ( // 初始化opt - opt := &Options{ - "../work_test", - } - levelLive := func() { - // 初始化 - levels := newLevelManager(opt) - defer func() { _ = levels.close() }() - // 构建内存表 - imm := &memTable{ - wal: file.OpenWalFile(&file.Options{}), - sl: utils.NewSkipList(), + opt = &Options{ + WorkDir: "../work_test", + SSTableMaxSz: 1024, + MemTableSize: 1024, + BlockSize: 1024, + BloomFalsePositive: 0, + BaseLevelSize: 10 << 20, + LevelSizeMultiplier: 10, + BaseTableSize: 2 << 20, + 
TableSizeMultiplier: 2, + NumLevelZeroTables: 15, + MaxLevelNum: 7, + NumCompactors: 3, + } +) + +// TestBase 正确性测试 +func TestBase(t *testing.T) { + clearDir() + lsm := buildLSM() + test := func() { + // 基准测试 + baseTest(t, lsm, 128) + } + // 运行N次测试多个sst的影响 + runTest(1, test) +} + +// TestRecovery 数据库恢复测试 +func TestRecovery(t *testing.T) { + clearDir() + recovery := func() { + // 每次运行都是相当于意外重启 + lsm := buildLSM() + // 测试正确性 + baseTest(t, lsm, 128) + } + // 允许两次就能实现恢复 + runTest(5, recovery) +} + +// TestClose 测试优雅关闭 +func TestClose(t *testing.T) { + clearDir() + lsm := buildLSM() + lsm.StartCompacter() + test := func() { + baseTest(t, lsm, 128) + utils.Err(lsm.Close()) + // 重启后可正常工作才算成功 + lsm = buildLSM() + baseTest(t, lsm, 128) + } + // 运行N次测试多个sst的影响 + runTest(1, test) +} + +// 命中不同存储介质的逻辑分支测试 +func TestHitStorage(t *testing.T) { + clearDir() + lsm := buildLSM() + e := utils.BuildEntry() + lsm.Set(e) + // 命中内存表 + hitMemtable := func() { + v, err := lsm.memTable.Get(e.Key) + utils.Err(err) + utils.CondPanic(!bytes.Equal(v.Value, e.Value), fmt.Errorf("[hitMemtable] !equal(v.Value, e.Value)")) + } + // 命中L0层 + hitL0 := func() { + // baseTest的测试就包含 在命中L0的sst查询 + baseTest(t, lsm, 128) + } + // 命中非L0层 + hitNotL0 := func() { + // 通过压缩将compact生成非L0数据, 会命中l6层 + lsm.levels.runOnce(0) + baseTest(t, lsm, 128) + } + // 命中bf + hitBloom := func() { + ee := utils.BuildEntry() + // 查询不存在的key 如果命中则说明一定不存在 + v, err := lsm.levels.levels[0].tables[0].Serach(ee.Key, &ee.Version) + utils.CondPanic(v != nil, fmt.Errorf("[hitBloom] v != nil")) + utils.CondPanic(err != utils.ErrKeyNotFound, fmt.Errorf("[hitBloom] err != utils.ErrKeyNotFound")) + } + + runTest(1, hitMemtable, hitL0, hitNotL0, hitBloom) +} + +// Testparameter 测试异常参数 +func TestPsarameter(t *testing.T) { + clearDir() + lsm := buildLSM() + testNil := func() { + utils.CondPanic(lsm.Set(nil) != utils.ErrEmptyKey, fmt.Errorf("[testNil] lsm.Set(nil) != err")) + _, err := lsm.Get(nil) + utils.CondPanic(err != utils.ErrEmptyKey, 
fmt.Errorf("[testNil] lsm.Set(nil) != err")) + } + // TODO p2 优先级的case先忽略 + runTest(1, testNil) +} + +// TestCompact 测试L0到Lmax压缩 +func TestCompact(t *testing.T) { + clearDir() + lsm := buildLSM() + ok := false + l0TOLMax := func() { + // 正常触发即可 + baseTest(t, lsm, 128) + // 直接触发压缩执行 + fid := lsm.levels.maxFID + 1 + lsm.levels.runOnce(1) + for _, t := range lsm.levels.levels[6].tables { + if t.fid == fid { + ok = true + } } - for _, entry := range entrys { - imm.set(entry) + utils.CondPanic(!ok, fmt.Errorf("[l0TOLMax] fid not found")) + } + l0ToL0 := func() { + // 先写一些数据进来 + baseTest(t, lsm, 128) + fid := lsm.levels.maxFID + 1 + cd := buildCompactDef(lsm, 0, 0, 0) + // 非常tricky的处理方法,为了能通过检查 + tricky(cd.thisLevel.tables) + ok := lsm.levels.fillTablesL0ToL0(cd) + utils.CondPanic(!ok, fmt.Errorf("[l0ToL0] lsm.levels.fillTablesL0ToL0(cd) ret == false")) + err := lsm.levels.runCompactDef(0, 0, *cd) + // 删除全局状态,便于下游测试逻辑 + lsm.levels.compactState.delete(*cd) + utils.Err(err) + ok = false + for _, t := range lsm.levels.levels[0].tables { + if t.fid == fid { + ok = true + } } - // 测试 flush - assert.Nil(t, levels.flush(imm)) - // 从levels中进行GET - v, err := levels.Get([]byte("Hello")) - assert.Nil(t, err) - assert.Equal(t, codec.Entry{Value: []byte("Corekv")}.Value, v) - t.Logf("levels.Get key=%s, value=%s, expiresAt=%d", v.Key, v.Value, v.Value) - // 关闭levels - assert.Nil(t, levels.close()) + utils.CondPanic(!ok, fmt.Errorf("[l0ToL0] fid not found")) + } + nextCompact := func() { + baseTest(t, lsm, 128) + fid := lsm.levels.maxFID + 1 + cd := buildCompactDef(lsm, 0, 0, 1) + // 非常tricky的处理方法,为了能通过检查 + tricky(cd.thisLevel.tables) + ok := lsm.levels.fillTables(cd) + utils.CondPanic(!ok, fmt.Errorf("[nextCompact] lsm.levels.fillTables(cd) ret == false")) + err := lsm.levels.runCompactDef(0, 0, *cd) + // 删除全局状态,便于下游测试逻辑 + lsm.levels.compactState.delete(*cd) + utils.Err(err) + ok = false + for _, t := range lsm.levels.levels[1].tables { + if t.fid == fid { + ok = true + } + } + 
utils.CondPanic(!ok, fmt.Errorf("[nextCompact] fid not found")) + } + + maxToMax := func() { + baseTest(t, lsm, 128) + fid := lsm.levels.maxFID + 1 + cd := buildCompactDef(lsm, 6, 6, 6) + // 非常tricky的处理方法,为了能通过检查 + tricky(cd.thisLevel.tables) + ok := lsm.levels.fillTables(cd) + utils.CondPanic(!ok, fmt.Errorf("[maxToMax] lsm.levels.fillTables(cd) ret == false")) + err := lsm.levels.runCompactDef(0, 6, *cd) + // 删除全局状态,便于下游测试逻辑 + lsm.levels.compactState.delete(*cd) + utils.Err(err) + ok = false + for _, t := range lsm.levels.levels[6].tables { + if t.fid == fid { + ok = true + } + } + utils.CondPanic(!ok, fmt.Errorf("[maxToMax] fid not found")) + } + parallerCompact := func() { + baseTest(t, lsm, 128) + cd := buildCompactDef(lsm, 0, 0, 1) + // 非常tricky的处理方法,为了能通过检查 + tricky(cd.thisLevel.tables) + ok := lsm.levels.fillTables(cd) + utils.CondPanic(!ok, fmt.Errorf("[parallerCompact] lsm.levels.fillTables(cd) ret == false")) + // 构建完全相同两个压缩计划的执行,以便于百分比构建 压缩冲突 + go lsm.levels.runCompactDef(0, 0, *cd) + lsm.levels.runCompactDef(0, 0, *cd) + // 检查compact status状态查看是否在执行并行压缩 + isParaller := false + for _, state := range lsm.levels.compactState.levels { + if len(state.ranges) != 0 { + isParaller = true + } + } + utils.CondPanic(!isParaller, fmt.Errorf("[parallerCompact] not is paralle")) } // 运行N次测试多个sst的影响 - for i := 0; i < 10; i++ { - levelLive() + runTest(1, l0TOLMax, l0ToL0, nextCompact, maxToMax, parallerCompact) +} + +// 正确性测试 +func baseTest(t *testing.T, lsm *LSM, n int) { + // 用来跟踪调试的 + e := &utils.Entry{ + Key: []byte("CRTS😁硬核课堂MrGSBtL12345678"), + Value: []byte("我草了"), + ExpiresAt: 123, + } + //caseList := make([]*utils.Entry, 0) + //caseList = append(caseList, e) + + // 随机构建数据进行测试 + lsm.Set(e) + for i := 1; i < n; i++ { + ee := utils.BuildEntry() + lsm.Set(ee) + // caseList = append(caseList, ee) } + // 从levels中进行GET + v, err := lsm.Get(e.Key) + utils.Panic(err) + utils.CondPanic(!bytes.Equal(e.Value, v.Value), fmt.Errorf("lsm.Get(e.Key) value not equal !!!")) + 
// TODO range功能待完善 + //retList := make([]*utils.Entry, 0) + // testRange := func(isAsc bool) { + // // Range 确保写入进去的每个lsm都可以被读取到 + // iter := lsm.NewIterator(&utils.Options{IsAsc: true}) + // for iter.Rewind(); iter.Valid(); iter.Next() { + // e := iter.Item().Entry() + // retList = append(retList, e) + // } + // utils.CondPanic(len(retList) != len(caseList), fmt.Errorf("len(retList) != len(caseList)")) + // sort.Slice(retList, func(i, j int) bool { + // return utils.CompareKeys(retList[i].Key, retList[j].Key) > 1 + // }) + // for i := 0; i < len(caseList); i++ { + // a, b := caseList[i], retList[i] + // if !equal(a.Key, b.Key) || !equal(a.Value, b.Value) || a.ExpiresAt != b.ExpiresAt { + // utils.Panic(fmt.Errorf("lsm.Get(e.Key) kv disagreement !!!")) + // } + // } + // } + // // 测试升序 + // testRange(true) + // // 测试降序 + // testRange(false) } -// 对level管理器的性能测试 +// 驱动模块 +func buildLSM() *LSM { + // init DB Basic Test + c := make(chan map[uint32]int64, 16) + opt.DiscardStatsCh = &c + lsm := NewLSM(opt) + return lsm +} + +// 运行测试用例 +func runTest(n int, testFunList ...func()) { + for _, f := range testFunList { + for i := 0; i < n; i++ { + f() + } + } +} + +// 构建compactDef对象 +func buildCompactDef(lsm *LSM, id, thisLevel, nextLevel int) *compactDef { + t := targets{ + targetSz: []int64{0, 10485760, 10485760, 10485760, 10485760, 10485760, 10485760}, + fileSz: []int64{1024, 2097152, 2097152, 2097152, 2097152, 2097152, 2097152}, + baseLevel: nextLevel, + } + def := &compactDef{ + compactorId: id, + thisLevel: lsm.levels.levels[thisLevel], + nextLevel: lsm.levels.levels[nextLevel], + t: t, + p: buildCompactionPriority(lsm, thisLevel, t), + } + return def +} + +// 构建CompactionPriority对象 +func buildCompactionPriority(lsm *LSM, thisLevel int, t targets) compactionPriority { + return compactionPriority{ + level: thisLevel, + score: 8.6, + adjusted: 860, + t: t, + } +} + +func tricky(tables []*table) { + // 非常tricky的处理方法,为了能通过检查,检查所有逻辑分支 + for _, table := range tables { + 
table.ss.Indexs().StaleDataSize = 10 << 20 + t, _ := time.Parse("2006-01-02 15:04:05", "1995-08-10 00:00:00") + table.ss.SetCreatedAt(&t) + } +} +func clearDir() { + _, err := os.Stat(opt.WorkDir) + if err == nil { + os.RemoveAll(opt.WorkDir) + } + os.Mkdir(opt.WorkDir, os.ModePerm) +} diff --git a/lsm/memtable.go b/lsm/memtable.go index ee3fb51..3771caf 100644 --- a/lsm/memtable.go +++ b/lsm/memtable.go @@ -1,21 +1,57 @@ +// Copyright 2021 hardcore-os Project Authors +// +// Licensed under the Apache License, Version 2.0 (the "License") +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + package lsm import ( + "bytes" + "fmt" + "io/ioutil" + "os" + "path/filepath" + "sort" + "strconv" + "strings" + "sync/atomic" + "github.com/hardcore-os/corekv/file" "github.com/hardcore-os/corekv/utils" - "github.com/hardcore-os/corekv/utils/codec" + "github.com/pkg/errors" ) +const walFileExt string = ".wal" + // MemTable type memTable struct { - wal *file.WalFile - sl *utils.SkipList + lsm *LSM + wal *file.WalFile + sl *utils.Skiplist + buf *bytes.Buffer + maxVersion uint64 } -//todo: mock, need to add real logic -func NewMemtable() (*memTable, error) { - - return nil, nil +// NewMemtable _ +func (lsm *LSM) NewMemtable() *memTable { + newFid := atomic.AddUint64(&(lsm.levels.maxFID), 1) + fileOpt := &file.Options{ + Dir: lsm.option.WorkDir, + Flag: os.O_CREATE | os.O_RDWR, + MaxSz: int(lsm.option.MemTableSize), //TODO wal 要设置多大比较合理? 
姑且跟sst一样大 + FID: newFid, + FileName: mtFilePath(lsm.option.WorkDir, newFid), + } + return &memTable{wal: file.OpenWalFile(fileOpt), sl: utils.NewSkiplist(int64(1 << 20)), lsm: lsm} } // Close @@ -23,36 +59,132 @@ func (m *memTable) close() error { if err := m.wal.Close(); err != nil { return err } - if err := m.sl.Close(); err != nil { - return err - } + return nil } -func (m *memTable) set(entry *codec.Entry) error { +func (m *memTable) set(entry *utils.Entry) error { // 写到wal 日志中,防止崩溃 if err := m.wal.Write(entry); err != nil { return err } // 写到memtable中 - if err := m.sl.Add(entry); err != nil { - return err - } + m.sl.Add(entry) return nil } -func (m *memTable) Get(key []byte) (*codec.Entry, error) { +func (m *memTable) Get(key []byte) (*utils.Entry, error) { // 索引检查当前的key是否在表中 O(1) 的时间复杂度 // 从内存表中获取数据 - return m.sl.Search(key), nil + vs := m.sl.Search(key) + + e := &utils.Entry{ + Key: key, + Value: vs.Value, + ExpiresAt: vs.ExpiresAt, + Meta: vs.Meta, + Version: vs.Version, + } + + return e, nil + } func (m *memTable) Size() int64 { - return m.sl.Size() + return m.sl.MemSize() } //recovery -func recovery(opt *Options) (*memTable, []*memTable) { - fileOpt := &file.Options{} - return &memTable{wal: file.OpenWalFile(fileOpt), sl: utils.NewSkipList()}, []*memTable{} +func (lsm *LSM) recovery() (*memTable, []*memTable) { + // 从 工作目录中获取所有文件 + files, err := ioutil.ReadDir(lsm.option.WorkDir) + if err != nil { + utils.Panic(err) + return nil, nil + } + var fids []uint64 + maxFid := lsm.levels.maxFID + // 识别 后缀为.wal的文件 + for _, file := range files { + if !strings.HasSuffix(file.Name(), walFileExt) { + continue + } + fsz := len(file.Name()) + fid, err := strconv.ParseUint(file.Name()[:fsz-len(walFileExt)], 10, 64) + // 考虑 wal文件的存在 更新maxFid + if maxFid < fid { + maxFid = fid + } + if err != nil { + utils.Panic(err) + return nil, nil + } + fids = append(fids, fid) + } + // 排序一下子 + sort.Slice(fids, func(i, j int) bool { + return fids[i] < fids[j] + }) + imms := 
[]*memTable{} + // 遍历fid 做处理 + for _, fid := range fids { + mt, err := lsm.openMemTable(fid) + utils.CondPanic(err != nil, err) + if mt.sl.MemSize() == 0 { + // mt.DecrRef() + continue + } + // TODO 如果最后一个跳表没写满会怎么样?这不就浪费空间了吗 + imms = append(imms, mt) + } + // 更新最终的maxfid,初始化一定是串行执行的,因此不需要原子操作 + lsm.levels.maxFID = maxFid + return lsm.NewMemtable(), imms +} + +func (lsm *LSM) openMemTable(fid uint64) (*memTable, error) { + fileOpt := &file.Options{ + Dir: lsm.option.WorkDir, + Flag: os.O_CREATE | os.O_RDWR, + MaxSz: int(lsm.option.MemTableSize), + FID: fid, + FileName: mtFilePath(lsm.option.WorkDir, fid), + } + s := utils.NewSkiplist(int64(1 << 20)) + mt := &memTable{ + sl: s, + buf: &bytes.Buffer{}, + lsm: lsm, + } + mt.wal = file.OpenWalFile(fileOpt) + err := mt.UpdateSkipList() + utils.CondPanic(err != nil, errors.WithMessage(err, "while updating skiplist")) + return mt, nil +} +func mtFilePath(dir string, fid uint64) string { + return filepath.Join(dir, fmt.Sprintf("%05d%s", fid, walFileExt)) +} + +func (m *memTable) UpdateSkipList() error { + if m.wal == nil || m.sl == nil { + return nil + } + endOff, err := m.wal.Iterate(true, 0, m.replayFunction(m.lsm.option)) + if err != nil { + return errors.WithMessage(err, fmt.Sprintf("while iterating wal: %s", m.wal.Name())) + } + // if endOff < m.wal.Size() { + // return errors.WithMessage(utils.ErrTruncate, fmt.Sprintf("end offset: %d < size: %d", endOff, m.wal.Size())) + // } + return m.wal.Truncate(int64(endOff)) +} + +func (m *memTable) replayFunction(opt *Options) func(*utils.Entry, *utils.ValuePtr) error { + return func(e *utils.Entry, _ *utils.ValuePtr) error { // Function for replaying. 
+ if ts := utils.ParseTs(e.Key); ts > m.maxVersion { + m.maxVersion = ts + } + m.sl.Add(e) + return nil + } } diff --git a/lsm/merge.go b/lsm/merge.go deleted file mode 100644 index 4b01a4e..0000000 --- a/lsm/merge.go +++ /dev/null @@ -1 +0,0 @@ -package lsm diff --git a/lsm/table.go b/lsm/table.go index c5b1d02..b2f26f3 100644 --- a/lsm/table.go +++ b/lsm/table.go @@ -1,15 +1,399 @@ +// Copyright 2021 hardcore-os Project Authors +// +// Licensed under the Apache License, Version 2.0 (the "License") +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + package lsm -import "github.com/hardcore-os/corekv/file" +import ( + "encoding/binary" + "fmt" + "io" + "math" + "os" + "sort" + "strings" + "sync/atomic" + "time" + + "github.com/hardcore-os/corekv/file" + "github.com/hardcore-os/corekv/pb" + "github.com/hardcore-os/corekv/utils" + "github.com/pkg/errors" +) type table struct { - ss *file.SSTable - idxs []byte + ss *file.SSTable + lm *levelManager + fid uint64 + ref int32 // For file garbage collection. Atomic. 
} -func openTable(opt *Options, tableName string) *table { - t := &table{ss: file.OpenSStable(&file.Options{Name: tableName, Dir: opt.WorkDir})} - // 加载ss文件 索引 - t.idxs = t.ss.Indexs() +func openTable(lm *levelManager, tableName string, builder *tableBuilder) *table { + sstSize := int(lm.opt.SSTableMaxSz) + if builder != nil { + sstSize = int(builder.done().size) + } + var ( + t *table + err error + ) + fid := utils.FID(tableName) + // 对builder存在的情况 把buf flush到磁盘 + if builder != nil { + if t, err = builder.flush(lm, tableName); err != nil { + utils.Err(err) + return nil + } + } else { + t = &table{lm: lm, fid: fid} + // 如果没有builder 则创打开一个已经存在的sst文件 + t.ss = file.OpenSStable(&file.Options{ + FileName: tableName, + Dir: lm.opt.WorkDir, + Flag: os.O_CREATE | os.O_RDWR, + MaxSz: int(sstSize)}) + } + // 先要引用一下,否则后面使用迭代器会导致引用状态错误 + t.IncrRef() + // 初始化sst文件,把index加载进来 + if err := t.ss.Init(); err != nil { + utils.Err(err) + return nil + } + + // 获取sst的最大key 需要使用迭代器 + itr := t.NewIterator(&utils.Options{}) // 默认是降序 + defer itr.Close() + // 定位到初始位置就是最大的key + itr.Rewind() + utils.CondPanic(!itr.Valid(), errors.Errorf("failed to read index, form maxKey")) + maxKey := itr.Item().Entry().Key + t.ss.SetMaxKey(maxKey) + return t } + +// Serach 从table中查找key +func (t *table) Serach(key []byte, maxVs *uint64) (entry *utils.Entry, err error) { + t.IncrRef() + defer t.DecrRef() + // 获取索引 + idx := t.ss.Indexs() + // 检查key是否存在 + bloomFilter := utils.Filter(idx.BloomFilter) + if t.ss.HasBloomFilter() && !bloomFilter.MayContainKey(key) { + return nil, utils.ErrKeyNotFound + } + iter := t.NewIterator(&utils.Options{}) + defer iter.Close() + + iter.Seek(key) + if !iter.Valid() { + return nil, utils.ErrKeyNotFound + } + + if utils.SameKey(key, iter.Item().Entry().Key) { + if version := utils.ParseTs(iter.Item().Entry().Key); *maxVs < version { + *maxVs = version + return iter.Item().Entry(), nil + } + } + return nil, utils.ErrKeyNotFound +} + +func (t *table) indexKey() uint64 { + return 
t.fid +} +func (t *table) getEntry(key, block []byte, idx int) (entry *utils.Entry, err error) { + if len(block) == 0 { + return nil, utils.ErrKeyNotFound + } + dataStr := string(block) + blocks := strings.Split(dataStr, ",") + if idx >= 0 && idx < len(blocks) { + return &utils.Entry{ + Key: key, + Value: []byte(blocks[idx]), + }, nil + } + return nil, utils.ErrKeyNotFound +} + +// 去加载sst对应的block +func (t *table) block(idx int) (*block, error) { + utils.CondPanic(idx < 0, fmt.Errorf("idx=%d", idx)) + if idx >= len(t.ss.Indexs().Offsets) { + return nil, errors.New("block out of index") + } + var b *block + key := t.blockCacheKey(idx) + blk, ok := t.lm.cache.blocks.Get(key) + if ok && blk != nil { + b, _ = blk.(*block) + return b, nil + } + + var ko pb.BlockOffset + utils.CondPanic(!t.offsets(&ko, idx), fmt.Errorf("block t.offset id=%d", idx)) + b = &block{ + offset: int(ko.GetOffset()), + } + + var err error + if b.data, err = t.read(b.offset, int(ko.GetLen())); err != nil { + return nil, errors.Wrapf(err, + "failed to read from sstable: %d at offset: %d, len: %d", + t.ss.FID(), b.offset, ko.GetLen()) + } + + readPos := len(b.data) - 4 // First read checksum length. + b.chkLen = int(utils.BytesToU32(b.data[readPos : readPos+4])) + + if b.chkLen > len(b.data) { + return nil, errors.New("invalid checksum length. 
Either the data is " + + "corrupted or the table options are incorrectly set") + } + + readPos -= b.chkLen + b.checksum = b.data[readPos : readPos+b.chkLen] + + readPos -= 4 + numEntries := int(utils.BytesToU32(b.data[readPos : readPos+4])) + entriesIndexStart := readPos - (numEntries * 4) + entriesIndexEnd := entriesIndexStart + numEntries*4 + + b.entryOffsets = utils.BytesToU32Slice(b.data[entriesIndexStart:entriesIndexEnd]) + + b.entriesIndexStart = entriesIndexStart + + b.data = b.data[:readPos+4] + + if err = b.verifyCheckSum(); err != nil { + return nil, err + } + + t.lm.cache.blocks.Set(key, b) + + return b, nil +} + +func (t *table) read(off, sz int) ([]byte, error) { + return t.ss.Bytes(off, sz) +} + +// blockCacheKey is used to store blocks in the block cache. +func (t *table) blockCacheKey(idx int) []byte { + utils.CondPanic(t.fid >= math.MaxUint32, fmt.Errorf("t.fid >= math.MaxUint32")) + utils.CondPanic(uint32(idx) >= math.MaxUint32, fmt.Errorf("uint32(idx) >= math.MaxUint32")) + + buf := make([]byte, 8) + // Assume t.ID does not overflow uint32. 
+ binary.BigEndian.PutUint32(buf[:4], uint32(t.fid)) + binary.BigEndian.PutUint32(buf[4:], uint32(idx)) + return buf +} + +type tableIterator struct { + it utils.Item + opt *utils.Options + t *table + blockPos int + bi *blockIterator + err error +} + +func (t *table) NewIterator(options *utils.Options) utils.Iterator { + t.IncrRef() + return &tableIterator{ + opt: options, + t: t, + bi: &blockIterator{}, + } +} +func (it *tableIterator) Next() { + it.err = nil + + if it.blockPos >= len(it.t.ss.Indexs().GetOffsets()) { + it.err = io.EOF + return + } + + if len(it.bi.data) == 0 { + block, err := it.t.block(it.blockPos) + if err != nil { + it.err = err + return + } + it.bi.tableID = it.t.fid + it.bi.blockID = it.blockPos + it.bi.setBlock(block) + it.bi.seekToFirst() + it.err = it.bi.Error() + return + } + + it.bi.Next() + if !it.bi.Valid() { + it.blockPos++ + it.bi.data = nil + it.Next() + return + } + it.it = it.bi.it +} +func (it *tableIterator) Valid() bool { + return it.err != io.EOF // 如果没有的时候 则是EOF +} +func (it *tableIterator) Rewind() { + if it.opt.IsAsc { + it.seekToFirst() + } else { + it.seekToLast() + } +} +func (it *tableIterator) Item() utils.Item { + return it.it +} +func (it *tableIterator) Close() error { + it.bi.Close() + return it.t.DecrRef() +} +func (it *tableIterator) seekToFirst() { + numBlocks := len(it.t.ss.Indexs().Offsets) + if numBlocks == 0 { + it.err = io.EOF + return + } + it.blockPos = 0 + block, err := it.t.block(it.blockPos) + if err != nil { + it.err = err + return + } + it.bi.tableID = it.t.fid + it.bi.blockID = it.blockPos + it.bi.setBlock(block) + it.bi.seekToFirst() + it.it = it.bi.Item() + it.err = it.bi.Error() +} + +func (it *tableIterator) seekToLast() { + numBlocks := len(it.t.ss.Indexs().Offsets) + if numBlocks == 0 { + it.err = io.EOF + return + } + it.blockPos = numBlocks - 1 + block, err := it.t.block(it.blockPos) + if err != nil { + it.err = err + return + } + it.bi.tableID = it.t.fid + it.bi.blockID = it.blockPos + 
it.bi.setBlock(block) + it.bi.seekToLast() + it.it = it.bi.Item() + it.err = it.bi.Error() +} + +// Seek +// 二分法搜索 offsets +// 如果idx == 0 说明key只能在第一个block中 block[0].MinKey <= key +// 否则 block[0].MinKey > key +// 如果在 idx-1 的block中未找到key 那才可能在 idx 中 +// 如果都没有,则当前key不再此table +func (it *tableIterator) Seek(key []byte) { + var ko pb.BlockOffset + idx := sort.Search(len(it.t.ss.Indexs().GetOffsets()), func(idx int) bool { + utils.CondPanic(!it.t.offsets(&ko, idx), fmt.Errorf("tableutils.Seek idx < 0 || idx > len(index.GetOffsets()")) + return utils.CompareKeys(ko.GetKey(), key) > 0 + }) + if idx == 0 { + it.seekHelper(0, key) + return + } + it.seekHelper(idx-1, key) + if it.err == io.EOF { + if idx == len(it.t.ss.Indexs().Offsets) { + return + } + it.seekHelper(idx, key) + } +} + +func (it *tableIterator) seekHelper(blockIdx int, key []byte) { + it.blockPos = blockIdx + block, err := it.t.block(blockIdx) + if err != nil { + it.err = err + return + } + it.bi.tableID = it.t.fid + it.bi.blockID = it.blockPos + it.bi.setBlock(block) + it.bi.seek(key) + it.err = it.bi.Error() + it.it = it.bi.Item() +} + +func (t *table) offsets(ko *pb.BlockOffset, i int) bool { + index := t.ss.Indexs() + if i < 0 || i > len(index.GetOffsets()) { + return false + } + *ko = *index.GetOffsets()[i] + return true +} + +// Size is its file size in bytes +func (t *table) Size() int64 { return int64(t.ss.Size()) } + +// GetCreatedAt +func (t *table) GetCreatedAt() *time.Time { + return t.ss.GetCreatedAt() +} +func (t *table) Delete() error { + return t.ss.Detele() +} + +// StaleDataSize is the amount of stale data (that can be dropped by a compaction )in this SST. 
+func (t *table) StaleDataSize() uint32 { return t.ss.Indexs().StaleDataSize }
+
+// DecrRef decrements the refcount and possibly deletes the table
+func (t *table) DecrRef() error {
+	newRef := atomic.AddInt32(&t.ref, -1)
+	if newRef == 0 {
+		// TODO remove from the cache (the loop below already evicts this
+		// table's blocks — this TODO looks stale; confirm and drop it)
+		for i := 0; i < len(t.ss.Indexs().GetOffsets()); i++ {
+			t.lm.cache.blocks.Del(t.blockCacheKey(i))
+		}
+		if err := t.Delete(); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+// IncrRef atomically takes one reference on the table.
+func (t *table) IncrRef() {
+	atomic.AddInt32(&t.ref, 1)
+}
+
+// decrRefs drops one reference on each table, stopping at the first error.
+func decrRefs(tables []*table) error {
+	for _, table := range tables {
+		if err := table.DecrRef(); err != nil {
+			return err
+		}
+	}
+	return nil
+}
diff --git a/options.go b/options.go
index 279595d..aed8994 100644
--- a/options.go
+++ b/options.go
@@ -1,17 +1,43 @@
+// Copyright 2021 hardcore-o Project Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License")
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License. 
+ package corekv import "github.com/hardcore-os/corekv/utils" // Options corekv 总的配置文件 type Options struct { - ValueThreshold int64 - WorkDir string - MemTableSize int64 + ValueThreshold int64 + WorkDir string + MemTableSize int64 + SSTableMaxSz int64 + MaxBatchCount int64 + MaxBatchSize int64 // max batch size in bytes + ValueLogFileSize int + VerifyValueChecksum bool + ValueLogMaxEntries uint32 + LogRotatesToFlush int32 + MaxTableSize int64 } // NewDefaultOptions 返回默认的options func NewDefaultOptions() *Options { - opt := &Options{} + opt := &Options{ + WorkDir: "./work_test", + MemTableSize: 1024, + SSTableMaxSz: 1 << 30, + } opt.ValueThreshold = utils.DefaultValueThreshold return opt } diff --git a/pb/pb.pb.go b/pb/pb.pb.go new file mode 100644 index 0000000..1e6e150 --- /dev/null +++ b/pb/pb.pb.go @@ -0,0 +1,1915 @@ +// Code generated by protoc-gen-gogo. DO NOT EDIT. +// source: pb.proto + +package pb + +import ( + fmt "fmt" + proto "github.com/golang/protobuf/proto" + io "io" + math "math" + math_bits "math/bits" +) + +// Reference imports to suppress errors if they are not otherwise used. +var _ = proto.Marshal +var _ = fmt.Errorf +var _ = math.Inf + +// This is a compile-time assertion to ensure that this generated file +// is compatible with the proto package it is being compiled against. +// A compilation error at this line likely means your copy of the +// proto package needs to be updated. 
+const _ = proto.ProtoPackageIsVersion3 // please upgrade the proto package + +type ManifestChange_Operation int32 + +const ( + ManifestChange_CREATE ManifestChange_Operation = 0 + ManifestChange_DELETE ManifestChange_Operation = 1 +) + +var ManifestChange_Operation_name = map[int32]string{ + 0: "CREATE", + 1: "DELETE", +} + +var ManifestChange_Operation_value = map[string]int32{ + "CREATE": 0, + "DELETE": 1, +} + +func (x ManifestChange_Operation) String() string { + return proto.EnumName(ManifestChange_Operation_name, int32(x)) +} + +func (ManifestChange_Operation) EnumDescriptor() ([]byte, []int) { + return fileDescriptor_f80abaa17e25ccc8, []int{3, 0} +} + +type KV struct { + Key []byte `protobuf:"bytes,1,opt,name=key,proto3" json:"key,omitempty"` + Value []byte `protobuf:"bytes,2,opt,name=value,proto3" json:"value,omitempty"` + UserMeta []byte `protobuf:"bytes,3,opt,name=user_meta,json=userMeta,proto3" json:"user_meta,omitempty"` + Version uint64 `protobuf:"varint,4,opt,name=version,proto3" json:"version,omitempty"` + ExpiresAt uint64 `protobuf:"varint,5,opt,name=expires_at,json=expiresAt,proto3" json:"expires_at,omitempty"` + Meta []byte `protobuf:"bytes,6,opt,name=meta,proto3" json:"meta,omitempty"` + // Stream id is used to identify which stream the KV came from. 
+ StreamId uint32 `protobuf:"varint,10,opt,name=stream_id,json=streamId,proto3" json:"stream_id,omitempty"` + XXX_NoUnkeyedLiteral struct{} `json:"-"` + XXX_unrecognized []byte `json:"-"` + XXX_sizecache int32 `json:"-"` +} + +func (m *KV) Reset() { *m = KV{} } +func (m *KV) String() string { return proto.CompactTextString(m) } +func (*KV) ProtoMessage() {} +func (*KV) Descriptor() ([]byte, []int) { + return fileDescriptor_f80abaa17e25ccc8, []int{0} +} +func (m *KV) XXX_Unmarshal(b []byte) error { + return m.Unmarshal(b) +} +func (m *KV) XXX_Marshal(b []byte, deterministic bool) ([]byte, error) { + if deterministic { + return xxx_messageInfo_KV.Marshal(b, m, deterministic) + } else { + b = b[:cap(b)] + n, err := m.MarshalToSizedBuffer(b) + if err != nil { + return nil, err + } + return b[:n], nil + } +} +func (m *KV) XXX_Merge(src proto.Message) { + xxx_messageInfo_KV.Merge(m, src) +} +func (m *KV) XXX_Size() int { + return m.Size() +} +func (m *KV) XXX_DiscardUnknown() { + xxx_messageInfo_KV.DiscardUnknown(m) +} + +var xxx_messageInfo_KV proto.InternalMessageInfo + +func (m *KV) GetKey() []byte { + if m != nil { + return m.Key + } + return nil +} + +func (m *KV) GetValue() []byte { + if m != nil { + return m.Value + } + return nil +} + +func (m *KV) GetUserMeta() []byte { + if m != nil { + return m.UserMeta + } + return nil +} + +func (m *KV) GetVersion() uint64 { + if m != nil { + return m.Version + } + return 0 +} + +func (m *KV) GetExpiresAt() uint64 { + if m != nil { + return m.ExpiresAt + } + return 0 +} + +func (m *KV) GetMeta() []byte { + if m != nil { + return m.Meta + } + return nil +} + +func (m *KV) GetStreamId() uint32 { + if m != nil { + return m.StreamId + } + return 0 +} + +type KVList struct { + Kv []*KV `protobuf:"bytes,1,rep,name=kv,proto3" json:"kv,omitempty"` + XXX_NoUnkeyedLiteral struct{} `json:"-"` + XXX_unrecognized []byte `json:"-"` + XXX_sizecache int32 `json:"-"` +} + +func (m *KVList) Reset() { *m = KVList{} } +func (m *KVList) String() 
string { return proto.CompactTextString(m) } +func (*KVList) ProtoMessage() {} +func (*KVList) Descriptor() ([]byte, []int) { + return fileDescriptor_f80abaa17e25ccc8, []int{1} +} +func (m *KVList) XXX_Unmarshal(b []byte) error { + return m.Unmarshal(b) +} +func (m *KVList) XXX_Marshal(b []byte, deterministic bool) ([]byte, error) { + if deterministic { + return xxx_messageInfo_KVList.Marshal(b, m, deterministic) + } else { + b = b[:cap(b)] + n, err := m.MarshalToSizedBuffer(b) + if err != nil { + return nil, err + } + return b[:n], nil + } +} +func (m *KVList) XXX_Merge(src proto.Message) { + xxx_messageInfo_KVList.Merge(m, src) +} +func (m *KVList) XXX_Size() int { + return m.Size() +} +func (m *KVList) XXX_DiscardUnknown() { + xxx_messageInfo_KVList.DiscardUnknown(m) +} + +var xxx_messageInfo_KVList proto.InternalMessageInfo + +func (m *KVList) GetKv() []*KV { + if m != nil { + return m.Kv + } + return nil +} + +type ManifestChangeSet struct { + // A set of changes that are applied atomically. 
+ Changes []*ManifestChange `protobuf:"bytes,1,rep,name=changes,proto3" json:"changes,omitempty"` + XXX_NoUnkeyedLiteral struct{} `json:"-"` + XXX_unrecognized []byte `json:"-"` + XXX_sizecache int32 `json:"-"` +} + +func (m *ManifestChangeSet) Reset() { *m = ManifestChangeSet{} } +func (m *ManifestChangeSet) String() string { return proto.CompactTextString(m) } +func (*ManifestChangeSet) ProtoMessage() {} +func (*ManifestChangeSet) Descriptor() ([]byte, []int) { + return fileDescriptor_f80abaa17e25ccc8, []int{2} +} +func (m *ManifestChangeSet) XXX_Unmarshal(b []byte) error { + return m.Unmarshal(b) +} +func (m *ManifestChangeSet) XXX_Marshal(b []byte, deterministic bool) ([]byte, error) { + if deterministic { + return xxx_messageInfo_ManifestChangeSet.Marshal(b, m, deterministic) + } else { + b = b[:cap(b)] + n, err := m.MarshalToSizedBuffer(b) + if err != nil { + return nil, err + } + return b[:n], nil + } +} +func (m *ManifestChangeSet) XXX_Merge(src proto.Message) { + xxx_messageInfo_ManifestChangeSet.Merge(m, src) +} +func (m *ManifestChangeSet) XXX_Size() int { + return m.Size() +} +func (m *ManifestChangeSet) XXX_DiscardUnknown() { + xxx_messageInfo_ManifestChangeSet.DiscardUnknown(m) +} + +var xxx_messageInfo_ManifestChangeSet proto.InternalMessageInfo + +func (m *ManifestChangeSet) GetChanges() []*ManifestChange { + if m != nil { + return m.Changes + } + return nil +} + +type ManifestChange struct { + Id uint64 `protobuf:"varint,1,opt,name=Id,proto3" json:"Id,omitempty"` + Op ManifestChange_Operation `protobuf:"varint,2,opt,name=Op,proto3,enum=pb.ManifestChange_Operation" json:"Op,omitempty"` + Level uint32 `protobuf:"varint,3,opt,name=Level,proto3" json:"Level,omitempty"` + Checksum []byte `protobuf:"bytes,4,opt,name=Checksum,proto3" json:"Checksum,omitempty"` + XXX_NoUnkeyedLiteral struct{} `json:"-"` + XXX_unrecognized []byte `json:"-"` + XXX_sizecache int32 `json:"-"` +} + +func (m *ManifestChange) Reset() { *m = ManifestChange{} } +func (m 
*ManifestChange) String() string { return proto.CompactTextString(m) } +func (*ManifestChange) ProtoMessage() {} +func (*ManifestChange) Descriptor() ([]byte, []int) { + return fileDescriptor_f80abaa17e25ccc8, []int{3} +} +func (m *ManifestChange) XXX_Unmarshal(b []byte) error { + return m.Unmarshal(b) +} +func (m *ManifestChange) XXX_Marshal(b []byte, deterministic bool) ([]byte, error) { + if deterministic { + return xxx_messageInfo_ManifestChange.Marshal(b, m, deterministic) + } else { + b = b[:cap(b)] + n, err := m.MarshalToSizedBuffer(b) + if err != nil { + return nil, err + } + return b[:n], nil + } +} +func (m *ManifestChange) XXX_Merge(src proto.Message) { + xxx_messageInfo_ManifestChange.Merge(m, src) +} +func (m *ManifestChange) XXX_Size() int { + return m.Size() +} +func (m *ManifestChange) XXX_DiscardUnknown() { + xxx_messageInfo_ManifestChange.DiscardUnknown(m) +} + +var xxx_messageInfo_ManifestChange proto.InternalMessageInfo + +func (m *ManifestChange) GetId() uint64 { + if m != nil { + return m.Id + } + return 0 +} + +func (m *ManifestChange) GetOp() ManifestChange_Operation { + if m != nil { + return m.Op + } + return ManifestChange_CREATE +} + +func (m *ManifestChange) GetLevel() uint32 { + if m != nil { + return m.Level + } + return 0 +} + +func (m *ManifestChange) GetChecksum() []byte { + if m != nil { + return m.Checksum + } + return nil +} + +type TableIndex struct { + Offsets []*BlockOffset `protobuf:"bytes,1,rep,name=offsets,proto3" json:"offsets,omitempty"` + BloomFilter []byte `protobuf:"bytes,2,opt,name=bloomFilter,proto3" json:"bloomFilter,omitempty"` + MaxVersion uint64 `protobuf:"varint,3,opt,name=maxVersion,proto3" json:"maxVersion,omitempty"` + KeyCount uint32 `protobuf:"varint,4,opt,name=keyCount,proto3" json:"keyCount,omitempty"` + StaleDataSize uint32 `protobuf:"varint,5,opt,name=staleDataSize,proto3" json:"staleDataSize,omitempty"` + XXX_NoUnkeyedLiteral struct{} `json:"-"` + XXX_unrecognized []byte `json:"-"` + XXX_sizecache 
int32 `json:"-"` +} + +func (m *TableIndex) Reset() { *m = TableIndex{} } +func (m *TableIndex) String() string { return proto.CompactTextString(m) } +func (*TableIndex) ProtoMessage() {} +func (*TableIndex) Descriptor() ([]byte, []int) { + return fileDescriptor_f80abaa17e25ccc8, []int{4} +} +func (m *TableIndex) XXX_Unmarshal(b []byte) error { + return m.Unmarshal(b) +} +func (m *TableIndex) XXX_Marshal(b []byte, deterministic bool) ([]byte, error) { + if deterministic { + return xxx_messageInfo_TableIndex.Marshal(b, m, deterministic) + } else { + b = b[:cap(b)] + n, err := m.MarshalToSizedBuffer(b) + if err != nil { + return nil, err + } + return b[:n], nil + } +} +func (m *TableIndex) XXX_Merge(src proto.Message) { + xxx_messageInfo_TableIndex.Merge(m, src) +} +func (m *TableIndex) XXX_Size() int { + return m.Size() +} +func (m *TableIndex) XXX_DiscardUnknown() { + xxx_messageInfo_TableIndex.DiscardUnknown(m) +} + +var xxx_messageInfo_TableIndex proto.InternalMessageInfo + +func (m *TableIndex) GetOffsets() []*BlockOffset { + if m != nil { + return m.Offsets + } + return nil +} + +func (m *TableIndex) GetBloomFilter() []byte { + if m != nil { + return m.BloomFilter + } + return nil +} + +func (m *TableIndex) GetMaxVersion() uint64 { + if m != nil { + return m.MaxVersion + } + return 0 +} + +func (m *TableIndex) GetKeyCount() uint32 { + if m != nil { + return m.KeyCount + } + return 0 +} + +func (m *TableIndex) GetStaleDataSize() uint32 { + if m != nil { + return m.StaleDataSize + } + return 0 +} + +type BlockOffset struct { + Key []byte `protobuf:"bytes,1,opt,name=key,proto3" json:"key,omitempty"` + Offset uint32 `protobuf:"varint,2,opt,name=offset,proto3" json:"offset,omitempty"` + Len uint32 `protobuf:"varint,3,opt,name=len,proto3" json:"len,omitempty"` + XXX_NoUnkeyedLiteral struct{} `json:"-"` + XXX_unrecognized []byte `json:"-"` + XXX_sizecache int32 `json:"-"` +} + +func (m *BlockOffset) Reset() { *m = BlockOffset{} } +func (m *BlockOffset) String() string 
{ return proto.CompactTextString(m) } +func (*BlockOffset) ProtoMessage() {} +func (*BlockOffset) Descriptor() ([]byte, []int) { + return fileDescriptor_f80abaa17e25ccc8, []int{5} +} +func (m *BlockOffset) XXX_Unmarshal(b []byte) error { + return m.Unmarshal(b) +} +func (m *BlockOffset) XXX_Marshal(b []byte, deterministic bool) ([]byte, error) { + if deterministic { + return xxx_messageInfo_BlockOffset.Marshal(b, m, deterministic) + } else { + b = b[:cap(b)] + n, err := m.MarshalToSizedBuffer(b) + if err != nil { + return nil, err + } + return b[:n], nil + } +} +func (m *BlockOffset) XXX_Merge(src proto.Message) { + xxx_messageInfo_BlockOffset.Merge(m, src) +} +func (m *BlockOffset) XXX_Size() int { + return m.Size() +} +func (m *BlockOffset) XXX_DiscardUnknown() { + xxx_messageInfo_BlockOffset.DiscardUnknown(m) +} + +var xxx_messageInfo_BlockOffset proto.InternalMessageInfo + +func (m *BlockOffset) GetKey() []byte { + if m != nil { + return m.Key + } + return nil +} + +func (m *BlockOffset) GetOffset() uint32 { + if m != nil { + return m.Offset + } + return 0 +} + +func (m *BlockOffset) GetLen() uint32 { + if m != nil { + return m.Len + } + return 0 +} + +func init() { + proto.RegisterEnum("pb.ManifestChange_Operation", ManifestChange_Operation_name, ManifestChange_Operation_value) + proto.RegisterType((*KV)(nil), "pb.KV") + proto.RegisterType((*KVList)(nil), "pb.KVList") + proto.RegisterType((*ManifestChangeSet)(nil), "pb.ManifestChangeSet") + proto.RegisterType((*ManifestChange)(nil), "pb.ManifestChange") + proto.RegisterType((*TableIndex)(nil), "pb.TableIndex") + proto.RegisterType((*BlockOffset)(nil), "pb.BlockOffset") +} + +func init() { proto.RegisterFile("pb.proto", fileDescriptor_f80abaa17e25ccc8) } + +var fileDescriptor_f80abaa17e25ccc8 = []byte{ + // 485 bytes of a gzipped FileDescriptorProto + 0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0xff, 0x6c, 0x52, 0x5d, 0x6e, 0xda, 0x4c, + 0x14, 0xcd, 0x18, 0x62, 0xe0, 0x12, 0xf3, 0xf1, 0x8d, 0xaa, 
0xc8, 0xea, 0x0f, 0xb2, 0xdc, 0x3e, + 0x50, 0x29, 0xe2, 0x21, 0x5d, 0x01, 0x21, 0x54, 0xb2, 0x20, 0x42, 0x9a, 0x20, 0x5e, 0xd1, 0x18, + 0x5f, 0x1a, 0xcb, 0xbf, 0xf2, 0x0c, 0x16, 0xe9, 0x4a, 0xba, 0x81, 0xae, 0xa0, 0x8f, 0xdd, 0x40, + 0x1f, 0xbb, 0x84, 0x8a, 0x6e, 0xa4, 0x9a, 0xc1, 0x20, 0x50, 0xfb, 0x76, 0xcf, 0xb9, 0x77, 0xce, + 0x9c, 0x39, 0x77, 0xa0, 0x99, 0xfb, 0x83, 0xbc, 0xc8, 0x64, 0x46, 0x8d, 0xdc, 0x77, 0xbf, 0x11, + 0x30, 0x26, 0x0b, 0xda, 0x85, 0x5a, 0x84, 0xcf, 0x36, 0x71, 0x48, 0xff, 0x8a, 0xa9, 0x92, 0xbe, + 0x80, 0xcb, 0x92, 0xc7, 0x1b, 0xb4, 0x0d, 0xcd, 0xed, 0x01, 0x7d, 0x05, 0xad, 0x8d, 0xc0, 0x62, + 0x99, 0xa0, 0xe4, 0x76, 0x4d, 0x77, 0x9a, 0x8a, 0x78, 0x40, 0xc9, 0xa9, 0x0d, 0x8d, 0x12, 0x0b, + 0x11, 0x66, 0xa9, 0x5d, 0x77, 0x48, 0xbf, 0xce, 0x0e, 0x90, 0xbe, 0x01, 0xc0, 0x6d, 0x1e, 0x16, + 0x28, 0x96, 0x5c, 0xda, 0x97, 0xba, 0xd9, 0xaa, 0x98, 0xa1, 0xa4, 0x14, 0xea, 0x5a, 0xd0, 0xd4, + 0x82, 0xba, 0x56, 0x37, 0x09, 0x59, 0x20, 0x4f, 0x96, 0x61, 0x60, 0x83, 0x43, 0xfa, 0x16, 0x6b, + 0xee, 0x09, 0x2f, 0x70, 0x1d, 0x30, 0x27, 0x8b, 0x69, 0x28, 0x24, 0xbd, 0x06, 0x23, 0x2a, 0x6d, + 0xe2, 0xd4, 0xfa, 0xed, 0x5b, 0x73, 0x90, 0xfb, 0x83, 0xc9, 0x82, 0x19, 0x51, 0xe9, 0x0e, 0xe1, + 0xff, 0x07, 0x9e, 0x86, 0x6b, 0x14, 0x72, 0xf4, 0xc4, 0xd3, 0x4f, 0xf8, 0x88, 0x92, 0xde, 0x40, + 0x63, 0xa5, 0x81, 0xa8, 0x4e, 0x50, 0x75, 0xe2, 0x7c, 0x8e, 0x1d, 0x46, 0xdc, 0xaf, 0x04, 0x3a, + 0xe7, 0x3d, 0xda, 0x01, 0xc3, 0x0b, 0x74, 0x4a, 0x75, 0x66, 0x78, 0x01, 0xbd, 0x01, 0x63, 0x96, + 0xeb, 0x84, 0x3a, 0xb7, 0xaf, 0xff, 0xd6, 0x1a, 0xcc, 0x72, 0x2c, 0xb8, 0x0c, 0xb3, 0x94, 0x19, + 0xb3, 0x5c, 0x45, 0x3a, 0xc5, 0x12, 0x63, 0x1d, 0x9c, 0xc5, 0xf6, 0x80, 0xbe, 0x84, 0xe6, 0xe8, + 0x09, 0x57, 0x91, 0xd8, 0x24, 0x3a, 0xb6, 0x2b, 0x76, 0xc4, 0xee, 0x5b, 0x68, 0x1d, 0x25, 0x28, + 0x80, 0x39, 0x62, 0xe3, 0xe1, 0x7c, 0xdc, 0xbd, 0x50, 0xf5, 0xfd, 0x78, 0x3a, 0x9e, 0x8f, 0xbb, + 0xc4, 0xfd, 0x4e, 0x00, 0xe6, 0xdc, 0x8f, 0xd1, 0x4b, 0x03, 0xdc, 0xd2, 0xf7, 0xd0, 0xc8, 0xd6, + 
0x6b, 0x81, 0xf2, 0xf0, 0xc8, 0xff, 0x94, 0xb1, 0xbb, 0x38, 0x5b, 0x45, 0x33, 0xcd, 0xb3, 0x43, + 0x9f, 0x3a, 0xd0, 0xf6, 0xe3, 0x2c, 0x4b, 0x3e, 0x86, 0xb1, 0xc4, 0xa2, 0xda, 0xf4, 0x29, 0x45, + 0x7b, 0x00, 0x09, 0xdf, 0x2e, 0xaa, 0xad, 0xd6, 0xf4, 0xc3, 0x4f, 0x18, 0x65, 0x3e, 0xc2, 0xe7, + 0x51, 0xb6, 0x49, 0xa5, 0x36, 0x6f, 0xb1, 0x23, 0xa6, 0xef, 0xc0, 0x12, 0x92, 0xc7, 0x78, 0xcf, + 0x25, 0x7f, 0x0c, 0x3f, 0xa3, 0xde, 0xbb, 0xc5, 0xce, 0x49, 0xd7, 0x83, 0xf6, 0x89, 0xb7, 0x7f, + 0x7c, 0xc4, 0x6b, 0x30, 0xf7, 0x7e, 0xb5, 0x3f, 0x8b, 0x55, 0x48, 0x4d, 0xc6, 0x98, 0x56, 0x59, + 0xaa, 0xf2, 0xae, 0xfb, 0x63, 0xd7, 0x23, 0x3f, 0x77, 0x3d, 0xf2, 0x6b, 0xd7, 0x23, 0x5f, 0x7e, + 0xf7, 0x2e, 0x7c, 0x53, 0x7f, 0xf4, 0x0f, 0x7f, 0x02, 0x00, 0x00, 0xff, 0xff, 0x9e, 0xf4, 0xb3, + 0x68, 0xf4, 0x02, 0x00, 0x00, +} + +func (m *KV) Marshal() (dAtA []byte, err error) { + size := m.Size() + dAtA = make([]byte, size) + n, err := m.MarshalToSizedBuffer(dAtA[:size]) + if err != nil { + return nil, err + } + return dAtA[:n], nil +} + +func (m *KV) MarshalTo(dAtA []byte) (int, error) { + size := m.Size() + return m.MarshalToSizedBuffer(dAtA[:size]) +} + +func (m *KV) MarshalToSizedBuffer(dAtA []byte) (int, error) { + i := len(dAtA) + _ = i + var l int + _ = l + if m.XXX_unrecognized != nil { + i -= len(m.XXX_unrecognized) + copy(dAtA[i:], m.XXX_unrecognized) + } + if m.StreamId != 0 { + i = encodeVarintPb(dAtA, i, uint64(m.StreamId)) + i-- + dAtA[i] = 0x50 + } + if len(m.Meta) > 0 { + i -= len(m.Meta) + copy(dAtA[i:], m.Meta) + i = encodeVarintPb(dAtA, i, uint64(len(m.Meta))) + i-- + dAtA[i] = 0x32 + } + if m.ExpiresAt != 0 { + i = encodeVarintPb(dAtA, i, uint64(m.ExpiresAt)) + i-- + dAtA[i] = 0x28 + } + if m.Version != 0 { + i = encodeVarintPb(dAtA, i, uint64(m.Version)) + i-- + dAtA[i] = 0x20 + } + if len(m.UserMeta) > 0 { + i -= len(m.UserMeta) + copy(dAtA[i:], m.UserMeta) + i = encodeVarintPb(dAtA, i, uint64(len(m.UserMeta))) + i-- + dAtA[i] = 0x1a + } + if len(m.Value) > 0 { + 
i -= len(m.Value) + copy(dAtA[i:], m.Value) + i = encodeVarintPb(dAtA, i, uint64(len(m.Value))) + i-- + dAtA[i] = 0x12 + } + if len(m.Key) > 0 { + i -= len(m.Key) + copy(dAtA[i:], m.Key) + i = encodeVarintPb(dAtA, i, uint64(len(m.Key))) + i-- + dAtA[i] = 0xa + } + return len(dAtA) - i, nil +} + +func (m *KVList) Marshal() (dAtA []byte, err error) { + size := m.Size() + dAtA = make([]byte, size) + n, err := m.MarshalToSizedBuffer(dAtA[:size]) + if err != nil { + return nil, err + } + return dAtA[:n], nil +} + +func (m *KVList) MarshalTo(dAtA []byte) (int, error) { + size := m.Size() + return m.MarshalToSizedBuffer(dAtA[:size]) +} + +func (m *KVList) MarshalToSizedBuffer(dAtA []byte) (int, error) { + i := len(dAtA) + _ = i + var l int + _ = l + if m.XXX_unrecognized != nil { + i -= len(m.XXX_unrecognized) + copy(dAtA[i:], m.XXX_unrecognized) + } + if len(m.Kv) > 0 { + for iNdEx := len(m.Kv) - 1; iNdEx >= 0; iNdEx-- { + { + size, err := m.Kv[iNdEx].MarshalToSizedBuffer(dAtA[:i]) + if err != nil { + return 0, err + } + i -= size + i = encodeVarintPb(dAtA, i, uint64(size)) + } + i-- + dAtA[i] = 0xa + } + } + return len(dAtA) - i, nil +} + +func (m *ManifestChangeSet) Marshal() (dAtA []byte, err error) { + size := m.Size() + dAtA = make([]byte, size) + n, err := m.MarshalToSizedBuffer(dAtA[:size]) + if err != nil { + return nil, err + } + return dAtA[:n], nil +} + +func (m *ManifestChangeSet) MarshalTo(dAtA []byte) (int, error) { + size := m.Size() + return m.MarshalToSizedBuffer(dAtA[:size]) +} + +func (m *ManifestChangeSet) MarshalToSizedBuffer(dAtA []byte) (int, error) { + i := len(dAtA) + _ = i + var l int + _ = l + if m.XXX_unrecognized != nil { + i -= len(m.XXX_unrecognized) + copy(dAtA[i:], m.XXX_unrecognized) + } + if len(m.Changes) > 0 { + for iNdEx := len(m.Changes) - 1; iNdEx >= 0; iNdEx-- { + { + size, err := m.Changes[iNdEx].MarshalToSizedBuffer(dAtA[:i]) + if err != nil { + return 0, err + } + i -= size + i = encodeVarintPb(dAtA, i, uint64(size)) + } + i-- 
+ dAtA[i] = 0xa + } + } + return len(dAtA) - i, nil +} + +func (m *ManifestChange) Marshal() (dAtA []byte, err error) { + size := m.Size() + dAtA = make([]byte, size) + n, err := m.MarshalToSizedBuffer(dAtA[:size]) + if err != nil { + return nil, err + } + return dAtA[:n], nil +} + +func (m *ManifestChange) MarshalTo(dAtA []byte) (int, error) { + size := m.Size() + return m.MarshalToSizedBuffer(dAtA[:size]) +} + +func (m *ManifestChange) MarshalToSizedBuffer(dAtA []byte) (int, error) { + i := len(dAtA) + _ = i + var l int + _ = l + if m.XXX_unrecognized != nil { + i -= len(m.XXX_unrecognized) + copy(dAtA[i:], m.XXX_unrecognized) + } + if len(m.Checksum) > 0 { + i -= len(m.Checksum) + copy(dAtA[i:], m.Checksum) + i = encodeVarintPb(dAtA, i, uint64(len(m.Checksum))) + i-- + dAtA[i] = 0x22 + } + if m.Level != 0 { + i = encodeVarintPb(dAtA, i, uint64(m.Level)) + i-- + dAtA[i] = 0x18 + } + if m.Op != 0 { + i = encodeVarintPb(dAtA, i, uint64(m.Op)) + i-- + dAtA[i] = 0x10 + } + if m.Id != 0 { + i = encodeVarintPb(dAtA, i, uint64(m.Id)) + i-- + dAtA[i] = 0x8 + } + return len(dAtA) - i, nil +} + +func (m *TableIndex) Marshal() (dAtA []byte, err error) { + size := m.Size() + dAtA = make([]byte, size) + n, err := m.MarshalToSizedBuffer(dAtA[:size]) + if err != nil { + return nil, err + } + return dAtA[:n], nil +} + +func (m *TableIndex) MarshalTo(dAtA []byte) (int, error) { + size := m.Size() + return m.MarshalToSizedBuffer(dAtA[:size]) +} + +func (m *TableIndex) MarshalToSizedBuffer(dAtA []byte) (int, error) { + i := len(dAtA) + _ = i + var l int + _ = l + if m.XXX_unrecognized != nil { + i -= len(m.XXX_unrecognized) + copy(dAtA[i:], m.XXX_unrecognized) + } + if m.StaleDataSize != 0 { + i = encodeVarintPb(dAtA, i, uint64(m.StaleDataSize)) + i-- + dAtA[i] = 0x28 + } + if m.KeyCount != 0 { + i = encodeVarintPb(dAtA, i, uint64(m.KeyCount)) + i-- + dAtA[i] = 0x20 + } + if m.MaxVersion != 0 { + i = encodeVarintPb(dAtA, i, uint64(m.MaxVersion)) + i-- + dAtA[i] = 0x18 + } + if 
len(m.BloomFilter) > 0 { + i -= len(m.BloomFilter) + copy(dAtA[i:], m.BloomFilter) + i = encodeVarintPb(dAtA, i, uint64(len(m.BloomFilter))) + i-- + dAtA[i] = 0x12 + } + if len(m.Offsets) > 0 { + for iNdEx := len(m.Offsets) - 1; iNdEx >= 0; iNdEx-- { + { + size, err := m.Offsets[iNdEx].MarshalToSizedBuffer(dAtA[:i]) + if err != nil { + return 0, err + } + i -= size + i = encodeVarintPb(dAtA, i, uint64(size)) + } + i-- + dAtA[i] = 0xa + } + } + return len(dAtA) - i, nil +} + +func (m *BlockOffset) Marshal() (dAtA []byte, err error) { + size := m.Size() + dAtA = make([]byte, size) + n, err := m.MarshalToSizedBuffer(dAtA[:size]) + if err != nil { + return nil, err + } + return dAtA[:n], nil +} + +func (m *BlockOffset) MarshalTo(dAtA []byte) (int, error) { + size := m.Size() + return m.MarshalToSizedBuffer(dAtA[:size]) +} + +func (m *BlockOffset) MarshalToSizedBuffer(dAtA []byte) (int, error) { + i := len(dAtA) + _ = i + var l int + _ = l + if m.XXX_unrecognized != nil { + i -= len(m.XXX_unrecognized) + copy(dAtA[i:], m.XXX_unrecognized) + } + if m.Len != 0 { + i = encodeVarintPb(dAtA, i, uint64(m.Len)) + i-- + dAtA[i] = 0x18 + } + if m.Offset != 0 { + i = encodeVarintPb(dAtA, i, uint64(m.Offset)) + i-- + dAtA[i] = 0x10 + } + if len(m.Key) > 0 { + i -= len(m.Key) + copy(dAtA[i:], m.Key) + i = encodeVarintPb(dAtA, i, uint64(len(m.Key))) + i-- + dAtA[i] = 0xa + } + return len(dAtA) - i, nil +} + +func encodeVarintPb(dAtA []byte, offset int, v uint64) int { + offset -= sovPb(v) + base := offset + for v >= 1<<7 { + dAtA[offset] = uint8(v&0x7f | 0x80) + v >>= 7 + offset++ + } + dAtA[offset] = uint8(v) + return base +} +func (m *KV) Size() (n int) { + if m == nil { + return 0 + } + var l int + _ = l + l = len(m.Key) + if l > 0 { + n += 1 + l + sovPb(uint64(l)) + } + l = len(m.Value) + if l > 0 { + n += 1 + l + sovPb(uint64(l)) + } + l = len(m.UserMeta) + if l > 0 { + n += 1 + l + sovPb(uint64(l)) + } + if m.Version != 0 { + n += 1 + sovPb(uint64(m.Version)) + } + if 
m.ExpiresAt != 0 { + n += 1 + sovPb(uint64(m.ExpiresAt)) + } + l = len(m.Meta) + if l > 0 { + n += 1 + l + sovPb(uint64(l)) + } + if m.StreamId != 0 { + n += 1 + sovPb(uint64(m.StreamId)) + } + if m.XXX_unrecognized != nil { + n += len(m.XXX_unrecognized) + } + return n +} + +func (m *KVList) Size() (n int) { + if m == nil { + return 0 + } + var l int + _ = l + if len(m.Kv) > 0 { + for _, e := range m.Kv { + l = e.Size() + n += 1 + l + sovPb(uint64(l)) + } + } + if m.XXX_unrecognized != nil { + n += len(m.XXX_unrecognized) + } + return n +} + +func (m *ManifestChangeSet) Size() (n int) { + if m == nil { + return 0 + } + var l int + _ = l + if len(m.Changes) > 0 { + for _, e := range m.Changes { + l = e.Size() + n += 1 + l + sovPb(uint64(l)) + } + } + if m.XXX_unrecognized != nil { + n += len(m.XXX_unrecognized) + } + return n +} + +func (m *ManifestChange) Size() (n int) { + if m == nil { + return 0 + } + var l int + _ = l + if m.Id != 0 { + n += 1 + sovPb(uint64(m.Id)) + } + if m.Op != 0 { + n += 1 + sovPb(uint64(m.Op)) + } + if m.Level != 0 { + n += 1 + sovPb(uint64(m.Level)) + } + l = len(m.Checksum) + if l > 0 { + n += 1 + l + sovPb(uint64(l)) + } + if m.XXX_unrecognized != nil { + n += len(m.XXX_unrecognized) + } + return n +} + +func (m *TableIndex) Size() (n int) { + if m == nil { + return 0 + } + var l int + _ = l + if len(m.Offsets) > 0 { + for _, e := range m.Offsets { + l = e.Size() + n += 1 + l + sovPb(uint64(l)) + } + } + l = len(m.BloomFilter) + if l > 0 { + n += 1 + l + sovPb(uint64(l)) + } + if m.MaxVersion != 0 { + n += 1 + sovPb(uint64(m.MaxVersion)) + } + if m.KeyCount != 0 { + n += 1 + sovPb(uint64(m.KeyCount)) + } + if m.StaleDataSize != 0 { + n += 1 + sovPb(uint64(m.StaleDataSize)) + } + if m.XXX_unrecognized != nil { + n += len(m.XXX_unrecognized) + } + return n +} + +func (m *BlockOffset) Size() (n int) { + if m == nil { + return 0 + } + var l int + _ = l + l = len(m.Key) + if l > 0 { + n += 1 + l + sovPb(uint64(l)) + } + if m.Offset != 0 { 
+ n += 1 + sovPb(uint64(m.Offset)) + } + if m.Len != 0 { + n += 1 + sovPb(uint64(m.Len)) + } + if m.XXX_unrecognized != nil { + n += len(m.XXX_unrecognized) + } + return n +} + +func sovPb(x uint64) (n int) { + return (math_bits.Len64(x|1) + 6) / 7 +} +func sozPb(x uint64) (n int) { + return sovPb(uint64((x << 1) ^ uint64((int64(x) >> 63)))) +} +func (m *KV) Unmarshal(dAtA []byte) error { + l := len(dAtA) + iNdEx := 0 + for iNdEx < l { + preIndex := iNdEx + var wire uint64 + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowPb + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + wire |= uint64(b&0x7F) << shift + if b < 0x80 { + break + } + } + fieldNum := int32(wire >> 3) + wireType := int(wire & 0x7) + if wireType == 4 { + return fmt.Errorf("proto: KV: wiretype end group for non-group") + } + if fieldNum <= 0 { + return fmt.Errorf("proto: KV: illegal tag %d (wire type %d)", fieldNum, wire) + } + switch fieldNum { + case 1: + if wireType != 2 { + return fmt.Errorf("proto: wrong wireType = %d for field Key", wireType) + } + var byteLen int + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowPb + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + byteLen |= int(b&0x7F) << shift + if b < 0x80 { + break + } + } + if byteLen < 0 { + return ErrInvalidLengthPb + } + postIndex := iNdEx + byteLen + if postIndex < 0 { + return ErrInvalidLengthPb + } + if postIndex > l { + return io.ErrUnexpectedEOF + } + m.Key = append(m.Key[:0], dAtA[iNdEx:postIndex]...) 
+ if m.Key == nil { + m.Key = []byte{} + } + iNdEx = postIndex + case 2: + if wireType != 2 { + return fmt.Errorf("proto: wrong wireType = %d for field Value", wireType) + } + var byteLen int + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowPb + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + byteLen |= int(b&0x7F) << shift + if b < 0x80 { + break + } + } + if byteLen < 0 { + return ErrInvalidLengthPb + } + postIndex := iNdEx + byteLen + if postIndex < 0 { + return ErrInvalidLengthPb + } + if postIndex > l { + return io.ErrUnexpectedEOF + } + m.Value = append(m.Value[:0], dAtA[iNdEx:postIndex]...) + if m.Value == nil { + m.Value = []byte{} + } + iNdEx = postIndex + case 3: + if wireType != 2 { + return fmt.Errorf("proto: wrong wireType = %d for field UserMeta", wireType) + } + var byteLen int + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowPb + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + byteLen |= int(b&0x7F) << shift + if b < 0x80 { + break + } + } + if byteLen < 0 { + return ErrInvalidLengthPb + } + postIndex := iNdEx + byteLen + if postIndex < 0 { + return ErrInvalidLengthPb + } + if postIndex > l { + return io.ErrUnexpectedEOF + } + m.UserMeta = append(m.UserMeta[:0], dAtA[iNdEx:postIndex]...) 
+ if m.UserMeta == nil { + m.UserMeta = []byte{} + } + iNdEx = postIndex + case 4: + if wireType != 0 { + return fmt.Errorf("proto: wrong wireType = %d for field Version", wireType) + } + m.Version = 0 + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowPb + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + m.Version |= uint64(b&0x7F) << shift + if b < 0x80 { + break + } + } + case 5: + if wireType != 0 { + return fmt.Errorf("proto: wrong wireType = %d for field ExpiresAt", wireType) + } + m.ExpiresAt = 0 + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowPb + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + m.ExpiresAt |= uint64(b&0x7F) << shift + if b < 0x80 { + break + } + } + case 6: + if wireType != 2 { + return fmt.Errorf("proto: wrong wireType = %d for field Meta", wireType) + } + var byteLen int + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowPb + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + byteLen |= int(b&0x7F) << shift + if b < 0x80 { + break + } + } + if byteLen < 0 { + return ErrInvalidLengthPb + } + postIndex := iNdEx + byteLen + if postIndex < 0 { + return ErrInvalidLengthPb + } + if postIndex > l { + return io.ErrUnexpectedEOF + } + m.Meta = append(m.Meta[:0], dAtA[iNdEx:postIndex]...) 
+ if m.Meta == nil { + m.Meta = []byte{} + } + iNdEx = postIndex + case 10: + if wireType != 0 { + return fmt.Errorf("proto: wrong wireType = %d for field StreamId", wireType) + } + m.StreamId = 0 + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowPb + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + m.StreamId |= uint32(b&0x7F) << shift + if b < 0x80 { + break + } + } + default: + iNdEx = preIndex + skippy, err := skipPb(dAtA[iNdEx:]) + if err != nil { + return err + } + if (skippy < 0) || (iNdEx+skippy) < 0 { + return ErrInvalidLengthPb + } + if (iNdEx + skippy) > l { + return io.ErrUnexpectedEOF + } + m.XXX_unrecognized = append(m.XXX_unrecognized, dAtA[iNdEx:iNdEx+skippy]...) + iNdEx += skippy + } + } + + if iNdEx > l { + return io.ErrUnexpectedEOF + } + return nil +} +func (m *KVList) Unmarshal(dAtA []byte) error { + l := len(dAtA) + iNdEx := 0 + for iNdEx < l { + preIndex := iNdEx + var wire uint64 + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowPb + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + wire |= uint64(b&0x7F) << shift + if b < 0x80 { + break + } + } + fieldNum := int32(wire >> 3) + wireType := int(wire & 0x7) + if wireType == 4 { + return fmt.Errorf("proto: KVList: wiretype end group for non-group") + } + if fieldNum <= 0 { + return fmt.Errorf("proto: KVList: illegal tag %d (wire type %d)", fieldNum, wire) + } + switch fieldNum { + case 1: + if wireType != 2 { + return fmt.Errorf("proto: wrong wireType = %d for field Kv", wireType) + } + var msglen int + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowPb + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + msglen |= int(b&0x7F) << shift + if b < 0x80 { + break + } + } + if msglen < 0 { + return ErrInvalidLengthPb + } + postIndex := iNdEx + msglen + if postIndex < 0 { + return ErrInvalidLengthPb + } + if 
postIndex > l { + return io.ErrUnexpectedEOF + } + m.Kv = append(m.Kv, &KV{}) + if err := m.Kv[len(m.Kv)-1].Unmarshal(dAtA[iNdEx:postIndex]); err != nil { + return err + } + iNdEx = postIndex + default: + iNdEx = preIndex + skippy, err := skipPb(dAtA[iNdEx:]) + if err != nil { + return err + } + if (skippy < 0) || (iNdEx+skippy) < 0 { + return ErrInvalidLengthPb + } + if (iNdEx + skippy) > l { + return io.ErrUnexpectedEOF + } + m.XXX_unrecognized = append(m.XXX_unrecognized, dAtA[iNdEx:iNdEx+skippy]...) + iNdEx += skippy + } + } + + if iNdEx > l { + return io.ErrUnexpectedEOF + } + return nil +} +func (m *ManifestChangeSet) Unmarshal(dAtA []byte) error { + l := len(dAtA) + iNdEx := 0 + for iNdEx < l { + preIndex := iNdEx + var wire uint64 + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowPb + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + wire |= uint64(b&0x7F) << shift + if b < 0x80 { + break + } + } + fieldNum := int32(wire >> 3) + wireType := int(wire & 0x7) + if wireType == 4 { + return fmt.Errorf("proto: ManifestChangeSet: wiretype end group for non-group") + } + if fieldNum <= 0 { + return fmt.Errorf("proto: ManifestChangeSet: illegal tag %d (wire type %d)", fieldNum, wire) + } + switch fieldNum { + case 1: + if wireType != 2 { + return fmt.Errorf("proto: wrong wireType = %d for field Changes", wireType) + } + var msglen int + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowPb + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + msglen |= int(b&0x7F) << shift + if b < 0x80 { + break + } + } + if msglen < 0 { + return ErrInvalidLengthPb + } + postIndex := iNdEx + msglen + if postIndex < 0 { + return ErrInvalidLengthPb + } + if postIndex > l { + return io.ErrUnexpectedEOF + } + m.Changes = append(m.Changes, &ManifestChange{}) + if err := m.Changes[len(m.Changes)-1].Unmarshal(dAtA[iNdEx:postIndex]); err != nil { + return err + } + 
iNdEx = postIndex + default: + iNdEx = preIndex + skippy, err := skipPb(dAtA[iNdEx:]) + if err != nil { + return err + } + if (skippy < 0) || (iNdEx+skippy) < 0 { + return ErrInvalidLengthPb + } + if (iNdEx + skippy) > l { + return io.ErrUnexpectedEOF + } + m.XXX_unrecognized = append(m.XXX_unrecognized, dAtA[iNdEx:iNdEx+skippy]...) + iNdEx += skippy + } + } + + if iNdEx > l { + return io.ErrUnexpectedEOF + } + return nil +} +func (m *ManifestChange) Unmarshal(dAtA []byte) error { + l := len(dAtA) + iNdEx := 0 + for iNdEx < l { + preIndex := iNdEx + var wire uint64 + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowPb + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + wire |= uint64(b&0x7F) << shift + if b < 0x80 { + break + } + } + fieldNum := int32(wire >> 3) + wireType := int(wire & 0x7) + if wireType == 4 { + return fmt.Errorf("proto: ManifestChange: wiretype end group for non-group") + } + if fieldNum <= 0 { + return fmt.Errorf("proto: ManifestChange: illegal tag %d (wire type %d)", fieldNum, wire) + } + switch fieldNum { + case 1: + if wireType != 0 { + return fmt.Errorf("proto: wrong wireType = %d for field Id", wireType) + } + m.Id = 0 + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowPb + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + m.Id |= uint64(b&0x7F) << shift + if b < 0x80 { + break + } + } + case 2: + if wireType != 0 { + return fmt.Errorf("proto: wrong wireType = %d for field Op", wireType) + } + m.Op = 0 + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowPb + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + m.Op |= ManifestChange_Operation(b&0x7F) << shift + if b < 0x80 { + break + } + } + case 3: + if wireType != 0 { + return fmt.Errorf("proto: wrong wireType = %d for field Level", wireType) + } + m.Level = 0 + for shift := uint(0); ; shift += 7 { + 
if shift >= 64 { + return ErrIntOverflowPb + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + m.Level |= uint32(b&0x7F) << shift + if b < 0x80 { + break + } + } + case 4: + if wireType != 2 { + return fmt.Errorf("proto: wrong wireType = %d for field Checksum", wireType) + } + var byteLen int + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowPb + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + byteLen |= int(b&0x7F) << shift + if b < 0x80 { + break + } + } + if byteLen < 0 { + return ErrInvalidLengthPb + } + postIndex := iNdEx + byteLen + if postIndex < 0 { + return ErrInvalidLengthPb + } + if postIndex > l { + return io.ErrUnexpectedEOF + } + m.Checksum = append(m.Checksum[:0], dAtA[iNdEx:postIndex]...) + if m.Checksum == nil { + m.Checksum = []byte{} + } + iNdEx = postIndex + default: + iNdEx = preIndex + skippy, err := skipPb(dAtA[iNdEx:]) + if err != nil { + return err + } + if (skippy < 0) || (iNdEx+skippy) < 0 { + return ErrInvalidLengthPb + } + if (iNdEx + skippy) > l { + return io.ErrUnexpectedEOF + } + m.XXX_unrecognized = append(m.XXX_unrecognized, dAtA[iNdEx:iNdEx+skippy]...) 
+ iNdEx += skippy + } + } + + if iNdEx > l { + return io.ErrUnexpectedEOF + } + return nil +} +func (m *TableIndex) Unmarshal(dAtA []byte) error { + l := len(dAtA) + iNdEx := 0 + for iNdEx < l { + preIndex := iNdEx + var wire uint64 + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowPb + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + wire |= uint64(b&0x7F) << shift + if b < 0x80 { + break + } + } + fieldNum := int32(wire >> 3) + wireType := int(wire & 0x7) + if wireType == 4 { + return fmt.Errorf("proto: TableIndex: wiretype end group for non-group") + } + if fieldNum <= 0 { + return fmt.Errorf("proto: TableIndex: illegal tag %d (wire type %d)", fieldNum, wire) + } + switch fieldNum { + case 1: + if wireType != 2 { + return fmt.Errorf("proto: wrong wireType = %d for field Offsets", wireType) + } + var msglen int + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowPb + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + msglen |= int(b&0x7F) << shift + if b < 0x80 { + break + } + } + if msglen < 0 { + return ErrInvalidLengthPb + } + postIndex := iNdEx + msglen + if postIndex < 0 { + return ErrInvalidLengthPb + } + if postIndex > l { + return io.ErrUnexpectedEOF + } + m.Offsets = append(m.Offsets, &BlockOffset{}) + if err := m.Offsets[len(m.Offsets)-1].Unmarshal(dAtA[iNdEx:postIndex]); err != nil { + return err + } + iNdEx = postIndex + case 2: + if wireType != 2 { + return fmt.Errorf("proto: wrong wireType = %d for field BloomFilter", wireType) + } + var byteLen int + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowPb + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + byteLen |= int(b&0x7F) << shift + if b < 0x80 { + break + } + } + if byteLen < 0 { + return ErrInvalidLengthPb + } + postIndex := iNdEx + byteLen + if postIndex < 0 { + return ErrInvalidLengthPb + } + if 
postIndex > l { + return io.ErrUnexpectedEOF + } + m.BloomFilter = append(m.BloomFilter[:0], dAtA[iNdEx:postIndex]...) + if m.BloomFilter == nil { + m.BloomFilter = []byte{} + } + iNdEx = postIndex + case 3: + if wireType != 0 { + return fmt.Errorf("proto: wrong wireType = %d for field MaxVersion", wireType) + } + m.MaxVersion = 0 + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowPb + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + m.MaxVersion |= uint64(b&0x7F) << shift + if b < 0x80 { + break + } + } + case 4: + if wireType != 0 { + return fmt.Errorf("proto: wrong wireType = %d for field KeyCount", wireType) + } + m.KeyCount = 0 + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowPb + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + m.KeyCount |= uint32(b&0x7F) << shift + if b < 0x80 { + break + } + } + case 5: + if wireType != 0 { + return fmt.Errorf("proto: wrong wireType = %d for field StaleDataSize", wireType) + } + m.StaleDataSize = 0 + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowPb + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + m.StaleDataSize |= uint32(b&0x7F) << shift + if b < 0x80 { + break + } + } + default: + iNdEx = preIndex + skippy, err := skipPb(dAtA[iNdEx:]) + if err != nil { + return err + } + if (skippy < 0) || (iNdEx+skippy) < 0 { + return ErrInvalidLengthPb + } + if (iNdEx + skippy) > l { + return io.ErrUnexpectedEOF + } + m.XXX_unrecognized = append(m.XXX_unrecognized, dAtA[iNdEx:iNdEx+skippy]...) 
+ iNdEx += skippy + } + } + + if iNdEx > l { + return io.ErrUnexpectedEOF + } + return nil +} +func (m *BlockOffset) Unmarshal(dAtA []byte) error { + l := len(dAtA) + iNdEx := 0 + for iNdEx < l { + preIndex := iNdEx + var wire uint64 + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowPb + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + wire |= uint64(b&0x7F) << shift + if b < 0x80 { + break + } + } + fieldNum := int32(wire >> 3) + wireType := int(wire & 0x7) + if wireType == 4 { + return fmt.Errorf("proto: BlockOffset: wiretype end group for non-group") + } + if fieldNum <= 0 { + return fmt.Errorf("proto: BlockOffset: illegal tag %d (wire type %d)", fieldNum, wire) + } + switch fieldNum { + case 1: + if wireType != 2 { + return fmt.Errorf("proto: wrong wireType = %d for field Key", wireType) + } + var byteLen int + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowPb + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + byteLen |= int(b&0x7F) << shift + if b < 0x80 { + break + } + } + if byteLen < 0 { + return ErrInvalidLengthPb + } + postIndex := iNdEx + byteLen + if postIndex < 0 { + return ErrInvalidLengthPb + } + if postIndex > l { + return io.ErrUnexpectedEOF + } + m.Key = append(m.Key[:0], dAtA[iNdEx:postIndex]...) 
+ if m.Key == nil { + m.Key = []byte{} + } + iNdEx = postIndex + case 2: + if wireType != 0 { + return fmt.Errorf("proto: wrong wireType = %d for field Offset", wireType) + } + m.Offset = 0 + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowPb + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + m.Offset |= uint32(b&0x7F) << shift + if b < 0x80 { + break + } + } + case 3: + if wireType != 0 { + return fmt.Errorf("proto: wrong wireType = %d for field Len", wireType) + } + m.Len = 0 + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowPb + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + m.Len |= uint32(b&0x7F) << shift + if b < 0x80 { + break + } + } + default: + iNdEx = preIndex + skippy, err := skipPb(dAtA[iNdEx:]) + if err != nil { + return err + } + if (skippy < 0) || (iNdEx+skippy) < 0 { + return ErrInvalidLengthPb + } + if (iNdEx + skippy) > l { + return io.ErrUnexpectedEOF + } + m.XXX_unrecognized = append(m.XXX_unrecognized, dAtA[iNdEx:iNdEx+skippy]...) 
+ iNdEx += skippy + } + } + + if iNdEx > l { + return io.ErrUnexpectedEOF + } + return nil +} +func skipPb(dAtA []byte) (n int, err error) { + l := len(dAtA) + iNdEx := 0 + depth := 0 + for iNdEx < l { + var wire uint64 + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return 0, ErrIntOverflowPb + } + if iNdEx >= l { + return 0, io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + wire |= (uint64(b) & 0x7F) << shift + if b < 0x80 { + break + } + } + wireType := int(wire & 0x7) + switch wireType { + case 0: + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return 0, ErrIntOverflowPb + } + if iNdEx >= l { + return 0, io.ErrUnexpectedEOF + } + iNdEx++ + if dAtA[iNdEx-1] < 0x80 { + break + } + } + case 1: + iNdEx += 8 + case 2: + var length int + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return 0, ErrIntOverflowPb + } + if iNdEx >= l { + return 0, io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + length |= (int(b) & 0x7F) << shift + if b < 0x80 { + break + } + } + if length < 0 { + return 0, ErrInvalidLengthPb + } + iNdEx += length + case 3: + depth++ + case 4: + if depth == 0 { + return 0, ErrUnexpectedEndOfGroupPb + } + depth-- + case 5: + iNdEx += 4 + default: + return 0, fmt.Errorf("proto: illegal wireType %d", wireType) + } + if iNdEx < 0 { + return 0, ErrInvalidLengthPb + } + if depth == 0 { + return iNdEx, nil + } + } + return 0, io.ErrUnexpectedEOF +} + +var ( + ErrInvalidLengthPb = fmt.Errorf("proto: negative length found during unmarshaling") + ErrIntOverflowPb = fmt.Errorf("proto: integer overflow") + ErrUnexpectedEndOfGroupPb = fmt.Errorf("proto: unexpected end of group") +) diff --git a/pb/pb.proto b/pb/pb.proto new file mode 100644 index 0000000..63c9408 --- /dev/null +++ b/pb/pb.proto @@ -0,0 +1,64 @@ +/* + * Copyright hardcore-os Project Authors + * + * Licensed under the Apache License, Version 2.0 (the "License") + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +// Use protos/gen.sh to generate .pb.go files. +syntax = "proto3"; + +package pb; + +message KV { + bytes key = 1; + bytes value = 2; + bytes user_meta = 3; + uint64 version = 4; + uint64 expires_at = 5; + bytes meta = 6; + + // Stream id is used to identify which stream the KV came from. + uint32 stream_id = 10; +} + +message KVList { + repeated KV kv = 1; +} + +message ManifestChangeSet { + // A set of changes that are applied atomically. + repeated ManifestChange changes = 1; +} + +message ManifestChange { + uint64 Id = 1; + enum Operation { + CREATE = 0; + DELETE = 1; + } + Operation Op = 2; + uint32 Level = 3; // Only used for CREATE + bytes Checksum = 4; // Only used for CREATE +} +message TableIndex{ + repeated BlockOffset offsets = 1; + bytes bloomFilter = 2; + uint64 maxVersion = 3; + uint32 keyCount = 4; + uint32 staleDataSize = 5; +} + +message BlockOffset{ + bytes key = 1; + uint32 offset = 2; + uint32 len = 3; +} \ No newline at end of file diff --git a/stats.go b/stats.go index 5c81362..2d935a2 100644 --- a/stats.go +++ b/stats.go @@ -1,3 +1,17 @@ +// Copyright 2021 logicrec Project Authors +// +// Licensed under the Apache License, Version 2.0 (the "License") +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + package corekv import "github.com/hardcore-os/corekv/utils" @@ -17,7 +31,8 @@ func (s *Stats) StartStats() { defer s.closer.Done() for { select { - case <-s.closer.Wait(): + case <-s.closer.CloseSignal: + return } // stats logic... } @@ -26,7 +41,7 @@ func (s *Stats) StartStats() { // NewStats func newStats(opt *Options) *Stats { s := &Stats{} - s.closer = utils.NewCloser(1) - s.EntryNum = 1 // 这里直接写1 + s.closer = utils.NewCloser() + s.EntryNum = 1 // 这里直接写 return s } diff --git a/utils/arena.go b/utils/arena.go new file mode 100644 index 0000000..48d357d --- /dev/null +++ b/utils/arena.go @@ -0,0 +1,158 @@ +/* + * Copyright 2017 Dgraph Labs, Inc. and Contributors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package utils + +import ( + "github.com/pkg/errors" + "log" + "sync/atomic" + "unsafe" +) + +const ( + offsetSize = int(unsafe.Sizeof(uint32(0))) + + // Always align nodes on 64-bit boundaries, even on 32-bit architectures, + // so that the node.value field is 64-bit aligned. 
This is necessary because + // node.getValueOffset uses atomic.LoadUint64, which expects its input + // pointer to be 64-bit aligned. + nodeAlign = int(unsafe.Sizeof(uint64(0))) - 1 + + MaxNodeSize = int(unsafe.Sizeof(node{})) +) + +// Arena should be lock-free. +type Arena struct { + n uint32 + shouldGrow bool + buf []byte +} + +// newArena returns a new arena. +func newArena(n int64) *Arena { + // Don't store data at position 0 in order to reserve offset=0 as a kind + // of nil pointer. + out := &Arena{ + n: 1, + buf: make([]byte, n), + } + return out +} + +func (s *Arena) allocate(sz uint32) uint32 { + offset := atomic.AddUint32(&s.n, sz) + if !s.shouldGrow { + AssertTrue(int(offset) <= len(s.buf)) + return offset - sz + } + + // We are keeping extra bytes in the end so that the checkptr doesn't fail. We apply some + // intelligence to reduce the size of the node by only keeping towers upto valid height and not + // maxHeight. This reduces the node's size, but checkptr doesn't know about its reduced size. + // checkptr tries to verify that the node of size MaxNodeSize resides on a single heap + // allocation which causes this error: checkptr:converted pointer straddles multiple allocations + if int(offset) > len(s.buf)-MaxNodeSize { + growBy := uint32(len(s.buf)) + if growBy > 1<<30 { + growBy = 1 << 30 + } + if growBy < sz { + growBy = sz + } + newBuf := make([]byte, len(s.buf)+int(growBy)) + AssertTrue(len(s.buf) == copy(newBuf, s.buf)) + s.buf = newBuf + } + return offset - sz +} + +func (s *Arena) size() int64 { + return int64(atomic.LoadUint32(&s.n)) +} + +// putNode allocates a node in the arena. The node is aligned on a pointer-sized +// boundary. The arena offset of the node is returned. +func (s *Arena) putNode(height int) uint32 { + // Compute the amount of the tower that will never be used, since the height + // is less than maxHeight. 
+ unusedSize := (maxHeight - height) * offsetSize + + // Pad the allocation with enough bytes to ensure pointer alignment. + l := uint32(MaxNodeSize - unusedSize + nodeAlign) + n := s.allocate(l) + + // Return the aligned offset. + m := (n + uint32(nodeAlign)) & ^uint32(nodeAlign) + return m +} + +// Put will *copy* val into arena. To make better use of this, reuse your input +// val buffer. Returns an offset into buf. User is responsible for remembering +// size of val. We could also store this size inside arena but the encoding and +// decoding will incur some overhead. +func (s *Arena) putVal(v ValueStruct) uint32 { + l := uint32(v.EncodedSize()) + offset := s.allocate(l) + v.EncodeValue(s.buf[offset:]) + return offset +} + +func (s *Arena) putKey(key []byte) uint32 { + keySz := uint32(len(key)) + offset := s.allocate(keySz) + buf := s.buf[offset : offset+keySz] + AssertTrue(len(key) == copy(buf, key)) + return offset +} + +// getNode returns a pointer to the node located at offset. If the offset is +// zero, then the nil node pointer is returned. +func (s *Arena) getNode(offset uint32) *node { + if offset == 0 { + return nil + } + return (*node)(unsafe.Pointer(&s.buf[offset])) +} + +// getKey returns byte slice at offset. +func (s *Arena) getKey(offset uint32, size uint16) []byte { + return s.buf[offset : offset+uint32(size)] +} + +// getVal returns byte slice at offset. The given size should be just the value +// size and should NOT include the meta bytes. +func (s *Arena) getVal(offset uint32, size uint32) (ret ValueStruct) { + ret.DecodeValue(s.buf[offset : offset+size]) + return +} + +// getNodeOffset returns the offset of node in the arena. If the node pointer is +// nil, then the zero offset is returned. +func (s *Arena) getNodeOffset(nd *node) uint32 { + if nd == nil { + return 0 + } + + return uint32(uintptr(unsafe.Pointer(nd)) - uintptr(unsafe.Pointer(&s.buf[0]))) +} + +// AssertTrue asserts that b is true. Otherwise, it would log fatal. 
+func AssertTrue(b bool) { + if !b { + log.Fatalf("%+v", errors.Errorf("Assert failed")) + } +} diff --git a/utils/bloom.go b/utils/bloom.go new file mode 100644 index 0000000..8ffb9c8 --- /dev/null +++ b/utils/bloom.go @@ -0,0 +1,131 @@ +// Copyright 2021 hardcore-os Project Authors +// +// Licensed under the Apache License, Version 2.0 (the "License") +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package utils + +import "math" + +// Filter is an encoded set of []byte keys. +type Filter []byte + +// MayContainKey _ +func (f Filter) MayContainKey(k []byte) bool { + return f.MayContain(Hash(k)) +} + +// MayContain returns whether the filter may contain given key. False positives +// are possible, where it returns true for keys not in the original set. +func (f Filter) MayContain(h uint32) bool { + if len(f) < 2 { + return false + } + k := f[len(f)-1] + if k > 30 { + // This is reserved for potentially new encodings for short Bloom filters. + // Consider it a match. + return true + } + nBits := uint32(8 * (len(f) - 1)) + delta := h>>17 | h<<15 + for j := uint8(0); j < k; j++ { + bitPos := h % nBits + if f[bitPos/8]&(1<<(bitPos%8)) == 0 { + return false + } + h += delta + } + return true +} + +// NewFilter returns a new Bloom filter that encodes a set of []byte keys with +// the given number of bits per key, approximately. +// +// A good bitsPerKey value is 10, which yields a filter with ~ 1% false +// positive rate. 
+func NewFilter(keys []uint32, bitsPerKey int) Filter { + return Filter(appendFilter(keys, bitsPerKey)) +} + +// BloomBitsPerKey returns the bits per key required by bloomfilter based on +// the false positive rate. +func BloomBitsPerKey(numEntries int, fp float64) int { + size := -1 * float64(numEntries) * math.Log(fp) / math.Pow(float64(0.69314718056), 2) + locs := math.Ceil(size / float64(numEntries)) + return int(locs) +} + +func appendFilter(keys []uint32, bitsPerKey int) []byte { + if bitsPerKey < 0 { + bitsPerKey = 0 + } + // 0.69 is approximately ln(2). + k := uint32(float64(bitsPerKey) * 0.69) + if k < 1 { + k = 1 + } + if k > 30 { + k = 30 + } + + nBits := len(keys) * int(bitsPerKey) + // For small len(keys), we can see a very high false positive rate. Fix it + // by enforcing a minimum bloom filter length. + if nBits < 64 { + nBits = 64 + } + nBytes := (nBits + 7) / 8 + nBits = nBytes * 8 + filter := make([]byte, nBytes+1) + + for _, h := range keys { + delta := h>>17 | h<<15 + for j := uint32(0); j < k; j++ { + bitPos := h % uint32(nBits) + filter[bitPos/8] |= 1 << (bitPos % 8) + h += delta + } + } + + //record the K value of this Bloom Filter + filter[nBytes] = uint8(k) + + return filter +} + +// Hash implements a hashing algorithm similar to the Murmur hash. 
+func Hash(b []byte) uint32 { + const ( + seed = 0xbc9f1d34 + m = 0xc6a4a793 + ) + h := uint32(seed) ^ uint32(len(b))*m + for ; len(b) >= 4; b = b[4:] { + h += uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24 + h *= m + h ^= h >> 16 + } + switch len(b) { + case 3: + h += uint32(b[2]) << 16 + fallthrough + case 2: + h += uint32(b[1]) << 8 + fallthrough + case 1: + h += uint32(b[0]) + h *= m + h ^= h >> 24 + } + return h +} diff --git a/utils/bloom_test.go b/utils/bloom_test.go new file mode 100644 index 0000000..339affe --- /dev/null +++ b/utils/bloom_test.go @@ -0,0 +1,156 @@ +// Copyright 2021 hardcore-os Project Authors +// +// Licensed under the Apache License, Version 2.0 (the "License") +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +package utils + +import ( + "testing" +) + +func (f Filter) String() string { + s := make([]byte, 8*len(f)) + for i, x := range f { + for j := 0; j < 8; j++ { + if x&(1<> 0) + b[1] = uint8(uint32(i) >> 8) + b[2] = uint8(uint32(i) >> 16) + b[3] = uint8(uint32(i) >> 24) + return b + } + + nMediocreFilters, nGoodFilters := 0, 0 +loop: + for length := 1; length <= 10000; length = nextLength(length) { + keys := make([][]byte, 0, length) + for i := 0; i < length; i++ { + keys = append(keys, le32(i)) + } + var hashes []uint32 + for _, key := range keys { + hashes = append(hashes, Hash(key)) + } + f := NewFilter(hashes, 10) + + if len(f) > (length*10/8)+40 { + t.Errorf("length=%d: len(f)=%d is too large", length, len(f)) + continue + } + + // All added keys must match. 
+ for _, key := range keys { + if !f.MayContainKey(key) { + t.Errorf("length=%d: did not contain key %q", length, key) + continue loop + } + } + + // Check false positive rate. + nFalsePositive := 0 + for i := 0; i < 10000; i++ { + if f.MayContainKey(le32(1e9 + i)) { + nFalsePositive++ + } + } + if nFalsePositive > 0.02*10000 { + t.Errorf("length=%d: %d false positives in 10000", length, nFalsePositive) + continue + } + if nFalsePositive > 0.0125*10000 { + nMediocreFilters++ + } else { + nGoodFilters++ + } + } + + if nMediocreFilters > nGoodFilters/5 { + t.Errorf("%d mediocre filters but only %d good filters", nMediocreFilters, nGoodFilters) + } +} + +func TestHash(t *testing.T) { + // The magic want numbers come from running the C++ leveldb code in hash.cc. + testCases := []struct { + s string + want uint32 + }{ + {"", 0xbc9f1d34}, + {"g", 0xd04a8bda}, + {"go", 0x3e0b0745}, + {"gop", 0x0c326610}, + {"goph", 0x8c9d6390}, + {"gophe", 0x9bfd4b0a}, + {"gopher", 0xa78edc7c}, + {"I had a dream it would end this way.", 0xe14a9db9}, + } + for _, tc := range testCases { + if got := Hash([]byte(tc.s)); got != tc.want { + t.Errorf("s=%q: got 0x%08x, want 0x%08x", tc.s, got, tc.want) + } + } +} diff --git a/utils/cache/bloom.go b/utils/cache/bloom.go new file mode 100644 index 0000000..2bacc2d --- /dev/null +++ b/utils/cache/bloom.go @@ -0,0 +1,187 @@ +// Copyright 2021 hardcore-os Project Authors +// +// Licensed under the Apache License, Version 2.0 (the "License") +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package cache + +import "math" + +// Filter is an encoded set of []byte keys. +type Filter []byte + +type BloomFilter struct { + bitmap Filter + k uint8 +} + +// MayContainKey _ +func (f *BloomFilter) MayContainKey(k []byte) bool { + return f.MayContain(Hash(k)) +} + +// MayContain returns whether the filter may contain given key. False positives +// are possible, where it returns true for keys not in the original set. +func (f *BloomFilter) MayContain(h uint32) bool { + if f.Len() < 2 { + return false + } + k := f.k + if k > 30 { + // This is reserved for potentially new encodings for short Bloom filters. + // Consider it a match. + return true + } + nBits := uint32(8 * (f.Len() - 1)) + delta := h>>17 | h<<15 + for j := uint8(0); j < k; j++ { + bitPos := h % nBits + if f.bitmap[bitPos/8]&(1<<(bitPos%8)) == 0 { + return false + } + h += delta + } + return true +} + +func (f *BloomFilter) Len() int32 { + return int32(len(f.bitmap)) +} + +func (f *BloomFilter) InsertKey(k []byte) bool { + return f.Insert(Hash(k)) +} + +func (f *BloomFilter) Insert(h uint32) bool { + k := f.k + if k > 30 { + // This is reserved for potentially new encodings for short Bloom filters. + // Consider it a match. 
+ return true + } + nBits := uint32(8 * (f.Len() - 1)) + delta := h>>17 | h<<15 + for j := uint8(0); j < k; j++ { + bitPos := h % uint32(nBits) + f.bitmap[bitPos/8] |= 1 << (bitPos % 8) + h += delta + } + return true +} + +func (f *BloomFilter) AllowKey(k []byte) bool { + if f == nil { + return true + } + already := f.MayContainKey(k) + if !already { + f.InsertKey(k) + } + return already +} + +func (f *BloomFilter) Allow(h uint32) bool { + if f == nil { + return true + } + already := f.MayContain(h) + if !already { + f.Insert(h) + } + return already +} + +func (f *BloomFilter) reset() { + if f == nil { + return + } + for i := range f.bitmap { + f.bitmap[i] = 0 + } +} + +// NewFilter returns a new Bloom filter that encodes a set of []byte keys with +// the given number of bits per key, approximately. +// +// A good bitsPerKey value is 10, which yields a filter with ~ 1% false +// positive rate. +func newFilter(numEntries int, falsePositive float64) *BloomFilter { + bitsPerKey := bloomBitsPerKey(numEntries, falsePositive) + return initFilter(numEntries, bitsPerKey) +} + +// BloomBitsPerKey returns the bits per key required by bloomfilter based on +// the false positive rate. +func bloomBitsPerKey(numEntries int, fp float64) int { + size := -1 * float64(numEntries) * math.Log(fp) / math.Pow(float64(0.69314718056), 2) + locs := math.Ceil(size / float64(numEntries)) + return int(locs) +} + +func initFilter(numEntries int, bitsPerKey int) *BloomFilter { + bf := &BloomFilter{} + if bitsPerKey < 0 { + bitsPerKey = 0 + } + // 0.69 is approximately ln(2). + k := uint32(float64(bitsPerKey) * 0.69) + if k < 1 { + k = 1 + } + if k > 30 { + k = 30 + } + bf.k = uint8(k) + + nBits := numEntries * int(bitsPerKey) + // For small len(keys), we can see a very high false positive rate. Fix it + // by enforcing a minimum bloom filter length. 
+ if nBits < 64 { + nBits = 64 + } + nBytes := (nBits + 7) / 8 + nBits = nBytes * 8 + filter := make([]byte, nBytes+1) + + //record the K value of this Bloom Filter + filter[nBytes] = uint8(k) + + bf.bitmap = filter + return bf +} + +// Hash implements a hashing algorithm similar to the Murmur hash. +func Hash(b []byte) uint32 { + const ( + seed = 0xbc9f1d34 + m = 0xc6a4a793 + ) + h := uint32(seed) ^ uint32(len(b))*m + for ; len(b) >= 4; b = b[4:] { + h += uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24 + h *= m + h ^= h >> 16 + } + switch len(b) { + case 3: + h += uint32(b[2]) << 16 + fallthrough + case 2: + h += uint32(b[1]) << 8 + fallthrough + case 1: + h += uint32(b[0]) + h *= m + h ^= h >> 24 + } + return h +} diff --git a/utils/cache/cache.go b/utils/cache/cache.go new file mode 100644 index 0000000..1ffd56a --- /dev/null +++ b/utils/cache/cache.go @@ -0,0 +1,214 @@ +package cache + +import ( + "container/list" + xxhash "github.com/cespare/xxhash/v2" + "sync" + "unsafe" +) + +type Cache struct { + m sync.RWMutex + lru *windowLRU + slru *segmentedLRU + door *BloomFilter + c *cmSketch + t int32 + threshold int32 + data map[uint64]*list.Element +} + +type Options struct { + lruPct uint8 +} + +func NewCache(size int) *Cache { + const lruPct = 1 + lruSz := (lruPct * size) / 100 + + if lruSz < 1 { + lruSz = 1 + } + + slruSz := int(float64(size) * ((100 - lruPct) / 100.0)) + + if slruSz < 1 { + slruSz = 1 + } + + slruO := int(0.2 * float64(slruSz)) + + if slruO < 1 { + slruO = 1 + } + + data := make(map[uint64]*list.Element, size) + + return &Cache{ + lru: newWindowLRU(lruSz, data), + slru: newSLRU(data, slruO, slruSz-slruO), + door: newFilter(size, 0.01), + c: newCmSketch(int64(size)), + data: data, + } + +} + +func (c *Cache) Set(key interface{}, value interface{}) bool { + c.m.Lock() + defer c.m.Unlock() + return c.set(key, value) +} + +func (c *Cache) set(key, value interface{}) bool { + keyHash, conflictHash := c.keyToHash(key) + + i := 
storeItem{ + stage: 0, + key: keyHash, + conflict: conflictHash, + value: value, + } + + eitem, evicted := c.lru.add(i) + + if !evicted { + return true + } + + victim := c.slru.victim() + + if victim == nil { + c.slru.add(eitem) + return true + } + + if !c.door.Allow(uint32(keyHash)) { + return true + } + + vcount := c.c.Estimate(victim.key) + ocount := c.c.Estimate(eitem.key) + + if ocount < vcount { + return true + } + + c.slru.add(eitem) + return true +} + +func (c *Cache) Get(key interface{}) (interface{}, bool) { + c.m.RLock() + defer c.m.RUnlock() + return c.get(key) +} + +func (c *Cache) get(key interface{}) (interface{}, bool) { + c.t++ + if c.t == c.threshold { + c.c.Reset() + c.door.reset() + c.t = 0 + } + + keyHash, conflictHash := c.keyToHash(key) + + val, ok := c.data[keyHash] + if !ok { + c.c.Increment(keyHash) + return nil, false + } + + item := val.Value.(*storeItem) + + if item.conflict != conflictHash { + c.c.Increment(keyHash) + return nil, false + } + + c.c.Increment(item.key) + + v := item.value + + if item.stage == 0 { + c.lru.get(val) + } else { + c.slru.get(val) + } + + return v, true + +} + +func (c *Cache) Del(key interface{}) (interface{}, bool) { + c.m.Lock() + defer c.m.Unlock() + return c.del(key) +} + +func (c *Cache) del(key interface{}) (interface{}, bool) { + keyHash, conflictHash := c.keyToHash(key) + + val, ok := c.data[keyHash] + if !ok { + return 0, false + } + + item := val.Value.(*storeItem) + + if conflictHash != 0 && (conflictHash != item.conflict) { + return 0, false + } + + delete(c.data, keyHash) + return item.conflict, true +} + +func (c *Cache) keyToHash(key interface{}) (uint64, uint64) { + if key == nil { + return 0, 0 + } + switch k := key.(type) { + case uint64: + return k, 0 + case string: + return MemHashString(k), xxhash.Sum64String(k) + case []byte: + return MemHash(k), xxhash.Sum64(k) + case byte: + return uint64(k), 0 + case int: + return uint64(k), 0 + case int32: + return uint64(k), 0 + case uint32: + 
return uint64(k), 0 + case int64: + return uint64(k), 0 + default: + panic("Key type not supported") + } +} + +type stringStruct struct { + str unsafe.Pointer + len int +} + +//go:noescape +//go:linkname memhash runtime.memhash +func memhash(p unsafe.Pointer, h, s uintptr) uintptr + +// MemHashString is the hash function used by go map, it utilizes available hardware instructions +// (behaves as aeshash if aes instruction is available). +// NOTE: The hash seed changes for every process. So, this cannot be used as a persistent hash. +func MemHashString(str string) uint64 { + ss := (*stringStruct)(unsafe.Pointer(&str)) + return uint64(memhash(ss.str, 0, uintptr(ss.len))) +} + +func MemHash(data []byte) uint64 { + ss := (*stringStruct)(unsafe.Pointer(&data)) + return uint64(memhash(ss.str, 0, uintptr(ss.len))) +} diff --git a/utils/cache/cache.s b/utils/cache/cache.s new file mode 100644 index 0000000..e69de29 diff --git a/utils/cache/cache_test.go b/utils/cache/cache_test.go new file mode 100644 index 0000000..68e54fe --- /dev/null +++ b/utils/cache/cache_test.go @@ -0,0 +1,28 @@ +package cache + +import ( + "fmt" + "github.com/stretchr/testify/assert" + "testing" +) + +func TestCacheBasicCRUD(t *testing.T) { + cache := NewCache(5) + for i := 0; i < 10; i++ { + key := fmt.Sprintf("key%d", i) + val := fmt.Sprintf("val%d", i) + cache.Set(key, val) + } + + for i := 0; i < 1000; i++ { + key := fmt.Sprintf("key%d", i) + val := fmt.Sprintf("val%d", i) + res, ok := cache.Get(key) + if ok { + assert.Equal(t, val, res) + continue + } + assert.Equal(t, res, nil) + + } +} diff --git a/utils/cache/cmSketch.go b/utils/cache/cmSketch.go new file mode 100644 index 0000000..e812139 --- /dev/null +++ b/utils/cache/cmSketch.go @@ -0,0 +1,118 @@ +package cache + +import ( + "fmt" + "math/rand" + "time" +) + +const ( + cmDepth = 4 +) + +type cmSketch struct { + rows [cmDepth]cmRow + seed [cmDepth]uint64 + mask uint64 +} + +func newCmSketch(numCounters int64) *cmSketch { + if numCounters 
== 0 { + panic("cmSketch: invalid numCounters") + } + + numCounters = next2Power(numCounters) + sketch := &cmSketch{mask: uint64(numCounters - 1)} + source := rand.New(rand.NewSource(time.Now().UnixNano())) + + for i := 0; i < cmDepth; i++ { + sketch.seed[i] = source.Uint64() + sketch.rows[i] = newCmRow(numCounters) + } + + return sketch +} + +func (s *cmSketch) Increment(hashed uint64) { + for i := range s.rows { + s.rows[i].increment((hashed ^ s.seed[i]) & s.mask) + } +} + +func (s *cmSketch) Estimate(hashed uint64) int64 { + min := byte(255) + for i := range s.rows { + val := s.rows[i].get((hashed ^ s.seed[i]) & s.mask) + if val < min { + min = val + } + } + + return int64(min) +} + +// Reset halves all counter values. +func (s *cmSketch) Reset() { + for _, r := range s.rows { + r.reset() + } +} + +// Clear zeroes all counters. +func (s *cmSketch) Clear() { + for _, r := range s.rows { + r.clear() + } +} + +func next2Power(x int64) int64 { + x-- + x |= x >> 1 + x |= x >> 2 + x |= x >> 4 + x |= x >> 8 + x |= x >> 16 + x |= x >> 32 + x++ + return x +} + +type cmRow []byte + +func newCmRow(numCounters int64) cmRow { + return make(cmRow, numCounters/2) +} + +func (r cmRow) get(n uint64) byte { + return r[n/2] >> ((n & 1) * 4) & 0x0f +} + +func (r cmRow) increment(n uint64) { + i := n / 2 + s := (n & 1) * 4 + v := (r[i] >> s) & 0x0f + if v < 15 { + r[i] += 1 << s + } +} + +func (r cmRow) reset() { + for i := range r { + r[i] = (r[i] >> 1) & 0x77 + } +} + +func (r cmRow) clear() { + for i := range r { + r[i] = 0 + } +} + +func (r cmRow) string() string { + s := "" + for i := uint64(0); i < uint64(len(r)*2); i++ { + s += fmt.Sprintf("%02d ", (r[(i/2)]>>((i&1)*4))&0x0f) + } + s = s[:len(s)-1] + return s +} diff --git a/utils/cache/lru.go b/utils/cache/lru.go new file mode 100644 index 0000000..21d0e94 --- /dev/null +++ b/utils/cache/lru.go @@ -0,0 +1,46 @@ +package cache + +import "container/list" + +type windowLRU struct { + data map[uint64]*list.Element + cap int + 
list *list.List +} + +type storeItem struct { + stage int + key uint64 + conflict uint64 + value interface{} +} + +func newWindowLRU(size int, data map[uint64]*list.Element) *windowLRU { + return &windowLRU{ + data: data, + cap: size, + list: list.New(), + } +} + +func (lru *windowLRU) add(newitem storeItem) (eitem storeItem, evicted bool) { + if lru.list.Len() < lru.cap { + lru.data[newitem.key] = lru.list.PushFront(&newitem) + return storeItem{}, false + } + + evictItem := lru.list.Back() + item := evictItem.Value.(*storeItem) + + delete(lru.data, item.key) + + eitem, *item = *item, newitem + + lru.data[item.key] = evictItem + lru.list.MoveToFront(evictItem) + return eitem, true +} + +func (lru *windowLRU) get(v *list.Element) { + lru.list.MoveToFront(v) +} diff --git a/utils/cache/s2lru.go b/utils/cache/s2lru.go new file mode 100644 index 0000000..bd5e798 --- /dev/null +++ b/utils/cache/s2lru.go @@ -0,0 +1,86 @@ +package cache + +import "container/list" + +type segmentedLRU struct { + data map[uint64]*list.Element + stageOneCap, stageTwoCap int + stageOne, stageTwo *list.List +} + +const ( + STAGE_ONE = iota + STAGE_TWO +) + +func newSLRU(data map[uint64]*list.Element, stageOneCap, stageTwoCap int) *segmentedLRU { + return &segmentedLRU{ + data: data, + stageOneCap: stageOneCap, + stageTwoCap: stageTwoCap, + stageOne: list.New(), + stageTwo: list.New(), + } +} + +func (slru *segmentedLRU) add(newitem storeItem) { + newitem.stage = 1 + + if slru.stageOne.Len() < slru.stageOneCap || slru.Len() < slru.stageOneCap+slru.stageTwoCap { + slru.data[newitem.key] = slru.stageOne.PushFront(&newitem) + return + } + + e := slru.stageOne.Back() + item := e.Value.(*storeItem) + + delete(slru.data, item.key) + + *item = newitem + + slru.data[item.key] = e + slru.stageOne.MoveToFront(e) +} + +func (slru *segmentedLRU) get(v *list.Element) { + item := v.Value.(*storeItem) + + if item.stage == STAGE_TWO { + slru.stageTwo.MoveToFront(v) + return + } + + if slru.stageTwo.Len() < 
slru.stageTwoCap { + slru.stageOne.Remove(v) + item.stage = STAGE_TWO + slru.data[item.key] = slru.stageTwo.PushFront(item) + return + } + + back := slru.stageTwo.Back() + bitem := back.Value.(*storeItem) + + *bitem, *item = *item, *bitem + + bitem.stage = STAGE_TWO + item.stage = STAGE_ONE + + slru.data[item.key] = v + slru.data[bitem.key] = back + + slru.stageOne.MoveToFront(v) + slru.stageTwo.MoveToFront(back) +} + +func (slru *segmentedLRU) Len() int { + return slru.stageTwo.Len() + slru.stageOne.Len() +} + +func (slru *segmentedLRU) victim() *storeItem { + if slru.Len() < slru.stageOneCap+slru.stageTwoCap { + return nil + } + + v := slru.stageOne.Back() + return v.Value.(*storeItem) +} diff --git a/utils/closer.go b/utils/closer.go index fe18fb9..0b9b708 100644 --- a/utils/closer.go +++ b/utils/closer.go @@ -1,24 +1,37 @@ +// Copyright 2021 hardcore-os Project Authors +// +// Licensed under the Apache License, Version 2.0 (the "License") +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ package utils import "sync" -// 用于资源回收的信号控制 +// Closer _用于资源回收的信号控制 type Closer struct { waiting sync.WaitGroup - closeSignal chan struct{} + CloseSignal chan struct{} } -// NewCloser -func NewCloser(i int) *Closer { +// NewCloser _ +func NewCloser() *Closer { closer := &Closer{waiting: sync.WaitGroup{}} - closer.waiting.Add(i) - closer.closeSignal = make(chan struct{}) + closer.CloseSignal = make(chan struct{}) return closer } // Close 上游通知下游协程进行资源回收,并等待协程通知回收完毕 func (c *Closer) Close() { - close(c.closeSignal) + close(c.CloseSignal) c.waiting.Wait() } @@ -27,7 +40,7 @@ func (c *Closer) Done() { c.waiting.Done() } -// Wait 返回关闭信号 -func (c *Closer) Wait() chan struct{} { - return c.closeSignal +// Add 添加wait 计数 +func (c *Closer) Add(n int) { + c.waiting.Add(n) } diff --git a/utils/codec/codec.go b/utils/codec/codec.go deleted file mode 100644 index 9df1fc8..0000000 --- a/utils/codec/codec.go +++ /dev/null @@ -1,10 +0,0 @@ -package codec - -// WalCodec 写入wal文件的编码 -func WalCodec(entry *Entry) []byte { - return []byte{} -} - -func ValuePtrCodec(ptr *ValuePtr) []byte { - return []byte{} -} diff --git a/utils/codec/entry.go b/utils/codec/entry.go deleted file mode 100644 index 24d28f1..0000000 --- a/utils/codec/entry.go +++ /dev/null @@ -1,27 +0,0 @@ -package codec - -import ( - "time" -) - -type Entry struct { - Key []byte - Value []byte - ExpiresAt uint64 -} - -func NewEntry(key, value []byte) *Entry { - return &Entry{ - Key: key, - Value: value, - } -} - -func (e *Entry) WithTTL(dur time.Duration) *Entry { - e.ExpiresAt = uint64(time.Now().Add(dur).Unix()) - return e -} - -func (e *Entry) Size() int64 { - return int64(len(e.Key) + len(e.Value)) -} diff --git a/utils/codec/value.go b/utils/codec/value.go deleted file mode 100644 index 8dadd57..0000000 --- a/utils/codec/value.go +++ /dev/null @@ -1,19 +0,0 @@ -package codec - -type ValuePtr struct { -} - -// NewValuePtr -func NewValuePtr(entry *Entry) *ValuePtr { - return &ValuePtr{} -} - -// IsValuePtr -func 
IsValuePtr(entry *Entry) bool { - return false -} - -// ValuePtrDecode -func ValuePtrDecode(data []byte) *ValuePtr { - return nil -} diff --git a/utils/const.go b/utils/const.go index 67e50b1..dcbbb59 100644 --- a/utils/const.go +++ b/utils/const.go @@ -1,6 +1,61 @@ +// Copyright 2021 hardcore-os Project Authors +// +// Licensed under the Apache License, Version 2.0 (the "License") +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + package utils +import ( + "hash/crc32" + "math" + "os" +) + const ( - MaxLevelNum = 7 + // MaxLevelNum _ + MaxLevelNum = 7 + // DefaultValueThreshold _ DefaultValueThreshold = 1024 ) + +// file +const ( + ManifestFilename = "MANIFEST" + ManifestRewriteFilename = "REWRITEMANIFEST" + ManifestDeletionsRewriteThreshold = 10000 + ManifestDeletionsRatio = 10 + DefaultFileFlag = os.O_RDWR | os.O_CREATE | os.O_APPEND + DefaultFileMode = 0666 + MaxValueLogSize = 10 << 20 + // This is O_DSYNC (datasync) on platforms that support it -- see file_unix.go + datasyncFileFlag = 0x0 + // 基于可变长编码,其最可能的编码 + MaxHeaderSize = 21 + VlogHeaderSize = 0 + MaxVlogFileSize uint32 = math.MaxUint32 + Mi int64 = 1 << 20 + KVWriteChCapacity = 1000 +) + +// meta +const ( + BitDelete byte = 1 << 0 // Set if the key has been deleted. + BitValuePointer byte = 1 << 1 // Set if the value is NOT stored directly next to key. 
+) + +// codec +var ( + MagicText = [4]byte{'H', 'A', 'R', 'D'} + MagicVersion = uint32(1) + // CastagnoliCrcTable is a CRC32 polynomial table + CastagnoliCrcTable = crc32.MakeTable(crc32.Castagnoli) +) diff --git a/utils/entry.go b/utils/entry.go new file mode 100644 index 0000000..a3d5929 --- /dev/null +++ b/utils/entry.go @@ -0,0 +1,185 @@ +// Copyright 2021 hardcore-os Project Authors +// +// Licensed under the Apache License, Version 2.0 (the "License") +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package utils + +import ( + "encoding/binary" + "time" +) + +type ValueStruct struct { + Meta byte + Value []byte + ExpiresAt uint64 + + Version uint64 // This field is not serialized. Only for internal usage. 
+} + +// value只持久化具体的value值和过期时间 +func (vs *ValueStruct) EncodedSize() uint32 { + sz := len(vs.Value) + 1 // meta + enc := sizeVarint(vs.ExpiresAt) + return uint32(sz + enc) +} + +// DecodeValue +func (vs *ValueStruct) DecodeValue(buf []byte) { + vs.Meta = buf[0] + var sz int + vs.ExpiresAt, sz = binary.Uvarint(buf[1:]) + vs.Value = buf[1+sz:] +} + +//对value进行编码,并将编码后的字节写入byte +//这里将过期时间和value的值一起编码 +func (vs *ValueStruct) EncodeValue(b []byte) uint32 { + b[0] = vs.Meta + sz := binary.PutUvarint(b[1:], vs.ExpiresAt) + n := copy(b[1+sz:], vs.Value) + return uint32(1 + sz + n) +} + +func sizeVarint(x uint64) (n int) { + for { + n++ + x >>= 7 + if x == 0 { + break + } + } + return n +} + +//Entry _ 最外层写入的结构体 +type Entry struct { + Key []byte + Value []byte + ExpiresAt uint64 + + Meta byte + Version uint64 + Offset uint32 + Hlen int // Length of the header. + ValThreshold int64 +} + +// NewEntry_ +func NewEntry(key, value []byte) *Entry { + return &Entry{ + Key: key, + Value: value, + } +} + +// Entry_ +func (e *Entry) Entry() *Entry { + return e +} + +func (e *Entry) IsDeletedOrExpired() bool { + if e.Value == nil { + return true + } + + if e.ExpiresAt == 0 { + return false + } + + return e.ExpiresAt <= uint64(time.Now().Unix()) +} + +// WithTTL _ +func (e *Entry) WithTTL(dur time.Duration) *Entry { + e.ExpiresAt = uint64(time.Now().Add(dur).Unix()) + return e +} + +// EncodedSize is the size of the ValueStruct when encoded +func (e *Entry) EncodedSize() uint32 { + sz := len(e.Value) + enc := sizeVarint(uint64(e.Meta)) + enc += sizeVarint(e.ExpiresAt) + return uint32(sz + enc) +} + +// EstimateSize +func (e *Entry) EstimateSize(threshold int) int { + // TODO: 是否考虑 user meta? + if len(e.Value) < threshold { + return len(e.Key) + len(e.Value) + 1 // Meta + } + return len(e.Key) + 12 + 1 // 12 for ValuePointer, 2 for meta. +} + +// header 对象 +// header is used in value log as a header before Entry. 
+type Header struct { + KLen uint32 + VLen uint32 + ExpiresAt uint64 + Meta byte +} + +// +------+----------+------------+--------------+-----------+ +// | Meta | UserMeta | Key Length | Value Length | ExpiresAt | +// +------+----------+------------+--------------+-----------+ +func (h Header) Encode(out []byte) int { + out[0] = h.Meta + index := 1 + index += binary.PutUvarint(out[index:], uint64(h.KLen)) + index += binary.PutUvarint(out[index:], uint64(h.VLen)) + index += binary.PutUvarint(out[index:], h.ExpiresAt) + return index +} + +// Decode decodes the given header from the provided byte slice. +// Returns the number of bytes read. +func (h *Header) Decode(buf []byte) int { + h.Meta = buf[0] + index := 1 + klen, count := binary.Uvarint(buf[index:]) + h.KLen = uint32(klen) + index += count + vlen, count := binary.Uvarint(buf[index:]) + h.VLen = uint32(vlen) + index += count + h.ExpiresAt, count = binary.Uvarint(buf[index:]) + return index + count +} + +// DecodeFrom reads the header from the hashReader. +// Returns the number of bytes read. +func (h *Header) DecodeFrom(reader *HashReader) (int, error) { + var err error + h.Meta, err = reader.ReadByte() + if err != nil { + return 0, err + } + klen, err := binary.ReadUvarint(reader) + if err != nil { + return 0, err + } + h.KLen = uint32(klen) + vlen, err := binary.ReadUvarint(reader) + if err != nil { + return 0, err + } + h.VLen = uint32(vlen) + h.ExpiresAt, err = binary.ReadUvarint(reader) + if err != nil { + return 0, err + } + return reader.BytesRead, nil +} diff --git a/utils/entry_test.go b/utils/entry_test.go new file mode 100644 index 0000000..5362a4e --- /dev/null +++ b/utils/entry_test.go @@ -0,0 +1,34 @@ +// Copyright 2021 hardcore-os Project Authors +// +// Licensed under the Apache License, Version 2.0 (the "License") +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package utils + +import ( + "testing" + + "github.com/stretchr/testify/assert" +) + +func TestValueStruct(t *testing.T) { + v := ValueStruct{ + Value: []byte("硬核课堂"), + Meta: 2, + ExpiresAt: 213123123123, + } + data := make([]byte, v.EncodedSize()) + v.EncodeValue(data) + var vv ValueStruct + vv.DecodeValue(data) + assert.Equal(t, vv, v) +} diff --git a/utils/error.go b/utils/error.go index 8c1ef19..279a50d 100644 --- a/utils/error.go +++ b/utils/error.go @@ -1,8 +1,131 @@ +// Copyright 2021 logicrec Project Authors +// +// Licensed under the Apache License, Version 2.0 (the "License") +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Copyright 2021 hardcore-os Project Authors +// +// Licensed under the Apache License, Version 2.0 (the "License") +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + package utils -// Panic 如果err 不为nil 则panic +import ( + "errors" + "fmt" + "os" + "path" + "path/filepath" + "runtime" + "strconv" + "strings" +) + +var ( + gopath = path.Join(os.Getenv("GOPATH"), "src") + "/" +) + +// NotFoundKey 找不到key +var ( + // ErrKeyNotFound is returned when key isn't found on a txn.Get. + ErrKeyNotFound = errors.New("Key not found") + // ErrEmptyKey is returned if an empty key is passed on an update function. + ErrEmptyKey = errors.New("Key cannot be empty") + // ErrReWriteFailure reWrite failure + ErrReWriteFailure = errors.New("reWrite failure") + // ErrBadMagic bad magic + ErrBadMagic = errors.New("bad magic") + // ErrBadChecksum bad check sum + ErrBadChecksum = errors.New("bad check sum") + // ErrChecksumMismatch is returned at checksum mismatch. + ErrChecksumMismatch = errors.New("checksum mismatch") + + ErrTruncate = errors.New("Do truncate") + ErrStop = errors.New("Stop") + + // compact + ErrFillTables = errors.New("Unable to fill tables") + + ErrBlockedWrites = errors.New("Writes are blocked, possibly due to DropAll or Close") + ErrTxnTooBig = errors.New("Txn is too big to fit into one request") + ErrDeleteVlogFile = errors.New("Delete vlog file") + ErrNoRoom = errors.New("No room for write") + + // ErrInvalidRequest is returned if the user request is invalid. + ErrInvalidRequest = errors.New("Invalid request") + // ErrNoRewrite is returned if a call for value log GC doesn't result in a log file rewrite. 
+ ErrNoRewrite = errors.New("Value log GC attempt didn't result in any cleanup") + + // ErrRejected is returned if a value log GC is called either while another GC is running, or + // after DB::Close has been called. + ErrRejected = errors.New("Value log GC request rejected") +) + +// Panic 如果err 不为nil 则panicc func Panic(err error) { if err != nil { panic(err) } } + +// Panic2 _ +func Panic2(_ interface{}, err error) { + Panic(err) +} + +// Err err +func Err(err error) error { + if err != nil { + fmt.Printf("%s %s\n", location(2, true), err) + } + return err +} + +// WarpErr err +func WarpErr(format string, err error) error { + if err != nil { + fmt.Printf("%s %s %s", format, location(2, true), err) + } + return err +} +func location(deep int, fullPath bool) string { + _, file, line, ok := runtime.Caller(deep) + if !ok { + file = "???" + line = 0 + } + + if fullPath { + if strings.HasPrefix(file, gopath) { + file = file[len(gopath):] + } + } else { + file = filepath.Base(file) + } + return file + ":" + strconv.Itoa(line) +} + +// CondPanic e +func CondPanic(condition bool, err error) { + if condition { + Panic(err) + } +} diff --git a/utils/file.go b/utils/file.go index 6b8a622..7a5fd8a 100644 --- a/utils/file.go +++ b/utils/file.go @@ -1,8 +1,127 @@ +// Copyright 2021 hardcore-os Project Authors +// +// Licensed under the Apache License, Version 2.0 (the "License") +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ package utils -import "strings" +import ( + "bytes" + "fmt" + "hash/crc32" + "io/ioutil" + "os" + "path" + "path/filepath" + "strconv" + "strings" + + "github.com/pkg/errors" +) // FID 根据file name 获取其fid -func FID(name string) string { - return strings.Split(name, ".")[0] +func FID(name string) uint64 { + name = path.Base(name) + if !strings.HasSuffix(name, ".sst") { + return 0 + } + // suffix := name[len(fileSuffix):] + name = strings.TrimSuffix(name, ".sst") + id, err := strconv.Atoi(name) + if err != nil { + Err(err) + return 0 + } + return uint64(id) +} + +func VlogFilePath(dirPath string, fid uint32) string { + return fmt.Sprintf("%s%s%05d.vlog", dirPath, string(os.PathSeparator), fid) +} + +// CreateSyncedFile creates a new file (using O_EXCL), errors if it already existed. +func CreateSyncedFile(filename string, sync bool) (*os.File, error) { + flags := os.O_RDWR | os.O_CREATE | os.O_EXCL + if sync { + flags |= datasyncFileFlag + } + return os.OpenFile(filename, flags, 0600) +} + +// FileNameSSTable sst 文件名 +func FileNameSSTable(dir string, id uint64) string { + return filepath.Join(dir, fmt.Sprintf("%05d.sst", id)) } + +// openDir opens a directory for syncing. +func openDir(path string) (*os.File, error) { return os.Open(path) } + +// SyncDir When you create or delete a file, you have to ensure the directory entry for the file is synced +// in order to guarantee the file is visible (if the system crashes). (See the man page for fsync, +// or see https://github.com/coreos/etcd/issues/6368 for an example.) 
+func SyncDir(dir string) error { + f, err := openDir(dir) + if err != nil { + return errors.Wrapf(err, "While opening directory: %s.", dir) + } + err = f.Sync() + closeErr := f.Close() + if err != nil { + return errors.Wrapf(err, "While syncing directory: %s.", dir) + } + return errors.Wrapf(closeErr, "While closing directory: %s.", dir) +} + +// LoadIDMap Get the id of all sst files in the current folder +func LoadIDMap(dir string) map[uint64]struct{} { + fileInfos, err := ioutil.ReadDir(dir) + Err(err) + idMap := make(map[uint64]struct{}) + for _, info := range fileInfos { + if info.IsDir() { + continue + } + fileID := FID(info.Name()) + if fileID != 0 { + idMap[fileID] = struct{}{} + } + } + return idMap +} + +// CompareKeys checks the key without timestamp and checks the timestamp if keyNoTs +// is same. +// a would be sorted higher than aa if we use bytes.compare +// All keys should have timestamp. +func CompareKeys(key1, key2 []byte) int { + CondPanic((len(key1) <= 8 || len(key2) <= 8), fmt.Errorf("%s,%s < 8", string(key1), string(key2))) + if cmp := bytes.Compare(key1[:len(key1)-8], key2[:len(key2)-8]); cmp != 0 { + return cmp + } + return bytes.Compare(key1[len(key1)-8:], key2[len(key2)-8:]) +} + +// VerifyChecksum crc32 +func VerifyChecksum(data []byte, expected []byte) error { + actual := uint64(crc32.Checksum(data, CastagnoliCrcTable)) + expectedU64 := BytesToU64(expected) + if actual != expectedU64 { + return errors.Wrapf(ErrChecksumMismatch, "actual: %d, expected: %d", actual, expectedU64) + } + + return nil +} + +// CalculateChecksum _ +func CalculateChecksum(data []byte) uint64 { + return uint64(crc32.Checksum(data, CastagnoliCrcTable)) +} \ No newline at end of file diff --git a/utils/iterator.go b/utils/iterator.go new file mode 100644 index 0000000..377ee09 --- /dev/null +++ b/utils/iterator.go @@ -0,0 +1,37 @@ +// Copyright 2021 hardcore-os Project Authors +// +// Licensed under the Apache License, Version 2.0 (the "License") +// you may not use 
this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package utils + +// Iterator 迭代器 +type Iterator interface { + Next() + Valid() bool + Rewind() + Item() Item + Close() error + Seek(key []byte) +} + +// Item _ +type Item interface { + Entry() *Entry +} + +// Options _ +// TODO 可能被重构 +type Options struct { + Prefix []byte + IsAsc bool +} diff --git a/utils/key.go b/utils/key.go new file mode 100644 index 0000000..c92141a --- /dev/null +++ b/utils/key.go @@ -0,0 +1,90 @@ +// Copyright 2021 hardcore-os Project Authors +// +// Licensed under the Apache License, Version 2.0 (the "License") +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package utils + +import ( + "bytes" + "encoding/binary" + "math" + "time" + "unsafe" +) + +type stringStruct struct { + str unsafe.Pointer + len int +} + +//go:noescape +//go:linkname memhash runtime.memhash +func memhash(p unsafe.Pointer, h, s uintptr) uintptr + +// ParseKey parses the actual key from the key bytes. +func ParseKey(key []byte) []byte { + if len(key) < 8 { + return key + } + + return key[:len(key)-8] +} + +// ParseTs parses the timestamp from the key bytes. 
+func ParseTs(key []byte) uint64 { + if len(key) <= 8 { + return 0 + } + return math.MaxUint64 - binary.BigEndian.Uint64(key[len(key)-8:]) +} + +// SameKey checks for key equality ignoring the version timestamp suffix. +func SameKey(src, dst []byte) bool { + if len(src) != len(dst) { + return false + } + return bytes.Equal(ParseKey(src), ParseKey(dst)) +} + +// KeyWithTs generates a new key by appending ts to key. +func KeyWithTs(key []byte, ts uint64) []byte { + out := make([]byte, len(key)+8) + copy(out, key) + binary.BigEndian.PutUint64(out[len(key):], math.MaxUint64-ts) + return out +} + +// MemHash is the hash function used by go map, it utilizes available hardware instructions(behaves +// as aeshash if aes instruction is available). +// NOTE: The hash seed changes for every process. So, this cannot be used as a persistent hash. +func MemHash(data []byte) uint64 { + ss := (*stringStruct)(unsafe.Pointer(&data)) + return uint64(memhash(ss.str, 0, uintptr(ss.len))) +} + +// MemHashString is the hash function used by go map, it utilizes available hardware instructions +// (behaves as aeshash if aes instruction is available). +// NOTE: The hash seed changes for every process. So, this cannot be used as a persistent hash. +func MemHashString(str string) uint64 { + ss := (*stringStruct)(unsafe.Pointer(&str)) + return uint64(memhash(ss.str, 0, uintptr(ss.len))) +} + +// SafeCopy does append(a[:0], src...). +func SafeCopy(a, src []byte) []byte { + return append(a[:0], src...) +} + +func NewCurVersion() uint64 { + return uint64(time.Now().UnixNano() / 1e9) +} diff --git a/utils/map.go b/utils/map.go index 5f9b841..92ea95e 100644 --- a/utils/map.go +++ b/utils/map.go @@ -1,27 +1,83 @@ +// Copyright 2021 hardcore-os Project Authors +// +// Licensed under the Apache License, Version 2.0 (the "License") +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + package utils -import "sync" +import ( + "reflect" + "sync" + + "github.com/pkg/errors" +) +// CoreMap _ type CoreMap struct { m sync.Map } -// NewMap +// NewMap _ func NewMap() *CoreMap { return &CoreMap{m: sync.Map{}} } -// Get +// Get _ func (c *CoreMap) Get(key interface{}) (interface{}, bool) { - return c.m.Load(key) + hashKey := c.keyToHash(key) + return c.m.Load(hashKey) } -// Set +// Set _ func (c *CoreMap) Set(key, value interface{}) { - c.m.Store(key, value) + hashKey := c.keyToHash(key) + c.m.Store(hashKey, value) +} + +// Del _ +func (c *CoreMap) Del(key interface{}) { + hashKey := c.keyToHash(key) + c.m.Delete(hashKey) } -// Range +// Range _ func (c *CoreMap) Range(f func(key, value interface{}) bool) { c.m.Range(f) } + +func (c *CoreMap) keyToHash(key interface{}) uint64 { + if key == nil { + return 0 + } + switch k := key.(type) { + case []byte: + return MemHash(k) + case uint32: + return uint64(k) + case string: + return MemHashString(k) + case uint64: + return k + case byte: + return uint64(k) + case int: + return uint64(k) + case int32: + return uint64(k) + + case int64: + return uint64(k) + default: + CondPanic(true, errors.Errorf("Key:[%+v] type not supported", reflect.TypeOf(k))) + } + return 0 +} diff --git a/utils/mmap/darwin.go b/utils/mmap/darwin.go new file mode 100644 index 0000000..836a81e --- /dev/null +++ b/utils/mmap/darwin.go @@ -0,0 +1,61 @@ +// +build darwin + +/* + * Copyright 2019 Dgraph Labs, Inc. 
and Contributors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package mmap + +import ( + "os" + "syscall" + "unsafe" + + "golang.org/x/sys/unix" +) + +// Mmap uses the mmap system call to memory-map a file. If writable is true, +// memory protection of the pages is set so that they may be written to as well. +func mmap(fd *os.File, writable bool, size int64) ([]byte, error) { + mtype := unix.PROT_READ + if writable { + mtype |= unix.PROT_WRITE + } + return unix.Mmap(int(fd.Fd()), 0, int(size), mtype, unix.MAP_SHARED) +} + +// Munmap unmaps a previously mapped slice. +func munmap(b []byte) error { + return unix.Munmap(b) +} + +// This is required because the unix package does not support the madvise system call on OS X. +func madvise(b []byte, readahead bool) error { + advice := unix.MADV_NORMAL + if !readahead { + advice = unix.MADV_RANDOM + } + + _, _, e1 := syscall.Syscall(syscall.SYS_MADVISE, uintptr(unsafe.Pointer(&b[0])), + uintptr(len(b)), uintptr(advice)) + if e1 != 0 { + return e1 + } + return nil +} + +func msync(b []byte) error { + return unix.Msync(b, unix.MS_SYNC) +} diff --git a/utils/mmap/linux.go b/utils/mmap/linux.go new file mode 100644 index 0000000..73b3ac8 --- /dev/null +++ b/utils/mmap/linux.go @@ -0,0 +1,97 @@ +// +build linux + +// Copyright 2021 logicrec Project Authors +// +// Licensed under the Apache License, Version 2.0 (the "License") +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package mmap + +import ( + "os" + "reflect" + "unsafe" + + "golang.org/x/sys/unix" +) + +// mmap uses the mmap system call to memory-map a file. If writable is true, +// memory protection of the pages is set so that they may be written to as well. +func mmap(fd *os.File, writable bool, size int64) ([]byte, error) { + mtype := unix.PROT_READ + if writable { + mtype |= unix.PROT_WRITE + } + return unix.Mmap(int(fd.Fd()), 0, int(size), mtype, unix.MAP_SHARED) +} + +// mremap is a Linux-specific system call to remap pages in memory. This can be used in place of munmap + mmap. +func mremap(data []byte, size int) ([]byte, error) { + // taken from + const MREMAP_MAYMOVE = 0x1 + + header := (*reflect.SliceHeader)(unsafe.Pointer(&data)) + mmapAddr, _, errno := unix.Syscall6( + unix.SYS_MREMAP, + header.Data, + uintptr(header.Len), + uintptr(size), + uintptr(MREMAP_MAYMOVE), + 0, + 0, + ) + if errno != 0 { + return nil, errno + } + + header.Data = mmapAddr + header.Cap = size + header.Len = size + return data, nil +} + +// munmap unmaps a previously mapped slice. +// +// unix.Munmap maintains an internal list of mmapped addresses, and only calls munmap +// if the address is present in that list. If we use mremap, this list is not updated. +// To bypass this, we call munmap ourselves. 
+func munmap(data []byte) error { + if len(data) == 0 || len(data) != cap(data) { + return unix.EINVAL + } + _, _, errno := unix.Syscall( + unix.SYS_MUNMAP, + uintptr(unsafe.Pointer(&data[0])), + uintptr(len(data)), + 0, + ) + if errno != 0 { + return errno + } + return nil +} + +// madvise uses the madvise system call to give advise about the use of memory +// when using a slice that is memory-mapped to a file. Set the readahead flag to +// false if page references are expected in random order. +func madvise(b []byte, readahead bool) error { + flags := unix.MADV_NORMAL + if !readahead { + flags = unix.MADV_RANDOM + } + return unix.Madvise(b, flags) +} + +// msync writes any modified data to persistent storage. +func msync(b []byte) error { + return unix.Msync(b, unix.MS_SYNC) +} diff --git a/utils/mmap/mmap_darwin.go b/utils/mmap/mmap_darwin.go new file mode 100644 index 0000000..887db8d --- /dev/null +++ b/utils/mmap/mmap_darwin.go @@ -0,0 +1,45 @@ +// +build darwin + +// Copyright 2021 hardcore-os Project Authors +// +// Licensed under the Apache License, Version 2.0 (the "License") +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// mmap api + +// Mmap uses the mmap system call to memory-map a file. If writable is true, +// memory protection of the pages is set so that they may be written to as well. +package mmap + +import ( + "os" +) + +func Mmap(fd *os.File, writable bool, size int64) ([]byte, error) { + return mmap(fd, writable, size) +} + +// Munmap unmaps a previously mapped slice. 
+func Munmap(b []byte) error { + return munmap(b) +} + +// Madvise uses the madvise system call to give advise about the use of memory +// when using a slice that is memory-mapped to a file. Set the readahead flag to +// false if page references are expected in random order. +func Madvise(b []byte, readahead bool) error { + return madvise(b, readahead) +} + +// Msync would call sync on the mmapped data. +func Msync(b []byte) error { + return msync(b) +} diff --git a/utils/mmap/mmap_linux.go b/utils/mmap/mmap_linux.go new file mode 100644 index 0000000..6c0299a --- /dev/null +++ b/utils/mmap/mmap_linux.go @@ -0,0 +1,50 @@ +// +build linux + +// Copyright 2021 hardcore-os Project Authors +// +// Licensed under the Apache License, Version 2.0 (the "License") +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// mmap api + +// Mmap uses the mmap system call to memory-map a file. If writable is true, +// memory protection of the pages is set so that they may be written to as well. +package mmap + +import ( + "os" +) + +func Mmap(fd *os.File, writable bool, size int64) ([]byte, error) { + return mmap(fd, writable, size) +} + +// Munmap unmaps a previously mapped slice. +func Munmap(b []byte) error { + return munmap(b) +} + +// Madvise uses the madvise system call to give advise about the use of memory +// when using a slice that is memory-mapped to a file. Set the readahead flag to +// false if page references are expected in random order. 
+func Madvise(b []byte, readahead bool) error { + return madvise(b, readahead) +} + +// Msync would call sync on the mmapped data. +func Msync(b []byte) error { + return msync(b) +} + +// Mremap unmmap and mmap +func Mremap(data []byte, size int) ([]byte, error) { + return mremap(data, size) +} diff --git a/utils/rand.go b/utils/rand.go index 3f4ce13..c7229de 100644 --- a/utils/rand.go +++ b/utils/rand.go @@ -1,6 +1,7 @@ package utils import ( + "fmt" "math/rand" "sync" "time" @@ -31,3 +32,31 @@ func Float64() float64 { mu.Unlock() return res } + +// 生成随机字符串作为key和value +func randStr(length int) string { + // 包括特殊字符,进行测试 + str := "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ~=+%^*/()[]{}/!@#$?|©®😁😭🉑️🐂㎡硬核课堂" + bytes := []byte(str) + result := []byte{} + rand.Seed(time.Now().UnixNano() + int64(rand.Intn(100))) + for i := 0; i < length; i++ { + result = append(result, bytes[rand.Intn(len(bytes))]) + } + return string(result) +} + +// 构建entry对象 +func BuildEntry() *Entry { + rand.Seed(time.Now().Unix()) + key := []byte(fmt.Sprintf("%s%s", randStr(16), "12345678")) + value := []byte(randStr(128)) + // key := []byte(fmt.Sprintf("%s%s", "硬核课堂", "12345678")) + // value := []byte("硬核😁课堂") + expiresAt := uint64(time.Now().Add(12*time.Hour).UnixNano() / 1e6) + return &Entry{ + Key: key, + Value: value, + ExpiresAt: expiresAt, + } +} diff --git a/utils/skiplist.go b/utils/skiplist.go index 5a1153f..933cf66 100644 --- a/utils/skiplist.go +++ b/utils/skiplist.go @@ -1,274 +1,513 @@ +/* + * Copyright 2017 Dgraph Labs, Inc. and Contributors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* +Adapted from RocksDB inline skiplist. + +Key differences: +- No optimization for sequential inserts (no "prev"). +- No custom comparator. +- Support overwrites. This requires care when we see the same key when inserting. + For RocksDB or LevelDB, overwrites are implemented as a newer sequence number in the key, so + there is no need for values. We don't intend to support versioning. In-place updates of values + would be more efficient. +- We discard all non-concurrent code. +- We do not support Splices. This simplifies the code a lot. +- No AllocateNode or other pointer arithmetic. +- We combine the findLessThan, findGreaterOrEqual, etc into one function. +*/ + package utils import ( - "bytes" - "github.com/hardcore-os/corekv/iterator" - "math/rand" - "sync" - "time" - - "github.com/hardcore-os/corekv/utils/codec" + "github.com/pkg/errors" + "log" + "math" + "sync/atomic" + _ "unsafe" ) const ( - defaultMaxLevel = 48 + maxHeight = 20 + heightIncrease = math.MaxUint32 / 3 ) -type SkipList struct { - header *Element - - rand *rand.Rand - - maxLevel int - length int - lock sync.RWMutex - size int64 +type node struct { + // Multiple parts of the value are encoded as a single uint64 so that it + // can be atomically loaded and stored: + // value offset: uint32 (bits 0-31) + // value size : uint16 (bits 32-63) + value uint64 + + // A byte slice is 24 bytes. We are trying to save space here. + keyOffset uint32 // Immutable. No need to lock to access key. + keySize uint16 // Immutable. No need to lock to access key. + + // Height of the tower. 
+ height uint16 + + // Most nodes do not need to use the full height of the tower, since the + // probability of each successive level decreases exponentially. Because + // these elements are never accessed, they do not need to be allocated. + // Therefore, when a node is allocated in the arena, its memory footprint + // is deliberately truncated to not include unneeded tower elements. + // + // All accesses to elements should use CAS operations, with no need to lock. + tower [maxHeight]uint32 } -func NewSkipList() *SkipList { - source := rand.NewSource(time.Now().UnixNano()) - - return &SkipList{ - header: &Element{ - levels: make([]*Element, defaultMaxLevel), - entry: nil, - score: 0, - }, - rand: rand.New(source), - maxLevel: defaultMaxLevel, - length: 0, - } +type Skiplist struct { + height int32 // Current height. 1 <= height <= kMaxHeight. CAS. + headOffset uint32 + ref int32 + arena *Arena + OnClose func() } -type Element struct { - levels []*Element - entry *codec.Entry - score float64 +// IncrRef increases the refcount +func (s *Skiplist) IncrRef() { + atomic.AddInt32(&s.ref, 1) } -func newElement(score float64, entry *codec.Entry, level int) *Element { - return &Element{ - levels: make([]*Element, level), - entry: entry, - score: score, +// DecrRef decrements the refcount, deallocating the Skiplist when done using it +func (s *Skiplist) DecrRef() { + newRef := atomic.AddInt32(&s.ref, -1) + if newRef > 0 { + return + } + if s.OnClose != nil { + s.OnClose() } -} -func (elem *Element) Entry() *codec.Entry { - return elem.entry + // Indicate we are closed. Good for testing. Also, lets GC reclaim memory. Race condition + // here would suggest we are accessing skiplist when we are supposed to have no reference! 
+ s.arena = nil } -func (list *SkipList) Add(data *codec.Entry) error { - list.lock.Lock() - defer list.lock.Unlock() - score := list.calcScore(data.Key) - var elem *Element - - max := len(list.header.levels) - prevElem := list.header - - var prevElemHeaders [defaultMaxLevel]*Element - - for i := max - 1; i >= 0; { - //keep visit path here - prevElemHeaders[i] = prevElem +func newNode(arena *Arena, key []byte, v ValueStruct, height int) *node { + // The base level is already allocated in the node struct. + nodeOffset := arena.putNode(height) + keyOffset := arena.putKey(key) + val := encodeValue(arena.putVal(v), v.EncodedSize()) + + node := arena.getNode(nodeOffset) + node.keyOffset = keyOffset + node.keySize = uint16(len(key)) + node.height = uint16(height) + node.value = val + return node +} - for next := prevElem.levels[i]; next != nil; next = prevElem.levels[i] { - if comp := list.compare(score, data.Key, next); comp <= 0 { - if comp == 0 { - elem = next - elem.entry = data - list.size += elem.Entry().Size() - data.Size() - return nil - } +func encodeValue(valOffset uint32, valSize uint32) uint64 { + return uint64(valSize)<<32 | uint64(valOffset) +} - //find the insert position - break - } +func decodeValue(value uint64) (valOffset uint32, valSize uint32) { + valOffset = uint32(value) + valSize = uint32(value >> 32) + return +} - //just like linked-list next - prevElem = next - prevElemHeaders[i] = prevElem - } +// NewSkiplist makes a new empty skiplist, with a given arena size +func NewSkiplist(arenaSize int64) *Skiplist { + arena := newArena(arenaSize) + head := newNode(arena, nil, ValueStruct{}, maxHeight) + ho := arena.getNodeOffset(head) + return &Skiplist{ + height: 1, + headOffset: ho, + arena: arena, + ref: 1, + } +} - topLevel := prevElem.levels[i] +func (s *node) getValueOffset() (uint32, uint32) { + value := atomic.LoadUint64(&s.value) + return decodeValue(value) +} - //to skip same prevHeader's next and fill next elem into temp element - for i--; i 
>= 0 && prevElem.levels[i] == topLevel; i-- { - prevElemHeaders[i] = prevElem - } - } +func (s *node) key(arena *Arena) []byte { + return arena.getKey(s.keyOffset, s.keySize) +} - level := list.randLevel() +func (s *node) setValue(arena *Arena, vo uint64) { + atomic.StoreUint64(&s.value, vo) +} - elem = newElement(score, data, level) +func (s *node) getNextOffset(h int) uint32 { + return atomic.LoadUint32(&s.tower[h]) +} - //to add elem to the skiplist - for i := 0; i < level; i++ { - elem.levels[i] = prevElemHeaders[i].levels[i] - prevElemHeaders[i].levels[i] = elem - } - list.size += data.Size() - list.length++ - return nil +func (s *node) casNextOffset(h int, old, val uint32) bool { + return atomic.CompareAndSwapUint32(&s.tower[h], old, val) } -func (list *SkipList) Search(key []byte) (e *codec.Entry) { - list.lock.RLock() - defer list.lock.RUnlock() - if list.length == 0 { - return nil +// Returns true if key is strictly > n.key. +// If n is nil, this is an "end" marker and we return false. +//func (s *Skiplist) keyIsAfterNode(key []byte, n *node) bool { +// AssertTrue(n != s.head) +// return n != nil && CompareKeys(key, n.key) > 0 +//} + +func (s *Skiplist) randomHeight() int { + h := 1 + for h < maxHeight && FastRand() <= heightIncrease { + h++ } + return h +} - score := list.calcScore(key) +func (s *Skiplist) getNext(nd *node, height int) *node { + return s.arena.getNode(nd.getNextOffset(height)) +} - prevElem := list.header - i := len(list.header.levels) - 1 +func (s *Skiplist) getHead() *node { + return s.arena.getNode(s.headOffset) +} - for i >= 0 { - for next := prevElem.levels[i]; next != nil; next = prevElem.levels[i] { - if comp := list.compare(score, key, next); comp <= 0 { - if comp == 0 { - return next.Entry() - } - break +// findNear finds the node near to key. +// If less=true, it finds rightmost node such that node.key < key (if allowEqual=false) or +// node.key <= key (if allowEqual=true). 
+// If less=false, it finds leftmost node such that node.key > key (if allowEqual=false) or +// node.key >= key (if allowEqual=true). +// Returns the node found. The bool returned is true if the node has key equal to given key. +func (s *Skiplist) findNear(key []byte, less bool, allowEqual bool) (*node, bool) { + x := s.getHead() + level := int(s.getHeight() - 1) + for { + // Assume x.key < key. + next := s.getNext(x, level) + if next == nil { + // x.key < key < END OF LIST + if level > 0 { + // Can descend further to iterate closer to the end. + level-- + continue } - - prevElem = next + // Level=0. Cannot descend further. Let's return something that makes sense. + if !less { + return nil, false + } + // Try to return x. Make sure it is not a head node. + if x == s.getHead() { + return nil, false + } + return x, false } - topLevel := prevElem.levels[i] - - for i--; i >= 0 && prevElem.levels[i] == topLevel; i-- { + nextKey := next.key(s.arena) + cmp := CompareKeys(key, nextKey) + if cmp > 0 { + // x.key < next.key < key. We can continue to move right. + x = next + continue + } + if cmp == 0 { + // x.key < key == next.key. + if allowEqual { + return next, true + } + if !less { + // We want >, so go to base level to grab the next bigger note. + return s.getNext(next, 0), false + } + // We want <. If not base level, we should go closer in the next level. + if level > 0 { + level-- + continue + } + // On base level. Return x. + if x == s.getHead() { + return nil, false + } + return x, false + } + // cmp < 0. In other words, x.key < key < next. + if level > 0 { + level-- + continue + } + // At base level. Need to return something. + if !less { + return next, false + } + // Try to return x. Make sure it is not a head node. + if x == s.getHead() { + return nil, false + } + return x, false + } +} +// findSpliceForLevel returns (outBefore, outAfter) with outBefore.key <= key <= outAfter.key. +// The input "before" tells us where to start looking. 
+// If we found a node with the same key, then we return outBefore = outAfter. +// Otherwise, outBefore.key < key < outAfter.key. +func (s *Skiplist) findSpliceForLevel(key []byte, before uint32, level int) (uint32, uint32) { + for { + // Assume before.key < key. + beforeNode := s.arena.getNode(before) + next := beforeNode.getNextOffset(level) + nextNode := s.arena.getNode(next) + if nextNode == nil { + return before, next + } + nextKey := nextNode.key(s.arena) + cmp := CompareKeys(key, nextKey) + if cmp == 0 { + // Equality case. + return next, next + } + if cmp < 0 { + // before.key < key < next.key. We are done for this level. + return before, next } + before = next // Keep moving right on this level. } - return } -/*func (list *SkipList) Remove(key []byte) error { - score := list.calcScore(key) +func (s *Skiplist) getHeight() int32 { + return atomic.LoadInt32(&s.height) +} + +// Put inserts the key-value pair. +func (s *Skiplist) Add(e *Entry) { + // Since we allow overwrite, we may not need to create a new node. We might not even need to + // increase the height. Let's defer these actions. + key, v := e.Key, ValueStruct{ + Meta: e.Meta, + Value: e.Value, + ExpiresAt: e.ExpiresAt, + Version: e.Version, + } - max := len(list.header.levels) - prevElem := list.header + listHeight := s.getHeight() + var prev [maxHeight + 1]uint32 + var next [maxHeight + 1]uint32 + + prev[listHeight] = s.headOffset + for i := int(listHeight) - 1; i >= 0; i-- { + // Use higher level to speed up for current level. + prev[i], next[i] = s.findSpliceForLevel(key, prev[i+1], i) + if prev[i] == next[i] { + vo := s.arena.putVal(v) + encValue := encodeValue(vo, v.EncodedSize()) + prevNode := s.arena.getNode(prev[i]) + prevNode.setValue(s.arena, encValue) + return + } + } - var prevElemHeaders [defaultMaxLevel]*Element - var elem *Element + // We do need to create a new node. 
+ height := s.randomHeight() + x := newNode(s.arena, key, v, height) - for i := max - 1; i >= 0; { - //keep visit path here - prevElemHeaders[i] = prevElem + // Try to increase s.height via CAS. + listHeight = s.getHeight() + for height > int(listHeight) { + if atomic.CompareAndSwapInt32(&s.height, listHeight, int32(height)) { + // Successfully increased skiplist.height. + break + } + listHeight = s.getHeight() + } - for next := prevElem.levels[i]; next != nil; next = prevElem.levels[i] { - if comp := list.compare(score, key, next); comp <= 0 { - if comp == 0 { - elem = next - } + // We always insert from the base level and up. After you add a node in base level, we cannot + // create a node in the level above because it would have discovered the node in the base level. + for i := 0; i < height; i++ { + for { + if s.arena.getNode(prev[i]) == nil { + AssertTrue(i > 1) // This cannot happen in base level. + // We haven't computed prev, next for this level because height exceeds old listHeight. + // For these levels, we expect the lists to be sparse, so we can just search from head. + prev[i], next[i] = s.findSpliceForLevel(key, s.headOffset, i) + // Someone adds the exact same key before we are able to do so. This can only happen on + // the base level. But we know we are not on the base level. + AssertTrue(prev[i] != next[i]) + } + x.tower[i] = next[i] + pnode := s.arena.getNode(prev[i]) + if pnode.casNextOffset(i, next[i], s.arena.getNodeOffset(x)) { + // Managed to insert x between prev[i] and next[i]. Go to the next level. break } - - //just like linked-list next - prevElem = next - prevElemHeaders[i] = prevElem + // CAS failed. We need to recompute prev and next. + // It is unlikely to be helpful to try to use a different level as we redo the search, + // because it is unlikely that lots of nodes are inserted between prev[i] and next[i]. 
+ prev[i], next[i] = s.findSpliceForLevel(key, prev[i], i) + if prev[i] == next[i] { + AssertTruef(i == 0, "Equality can happen only on base level: %d", i) + vo := s.arena.putVal(v) + encValue := encodeValue(vo, v.EncodedSize()) + prevNode := s.arena.getNode(prev[i]) + prevNode.setValue(s.arena, encValue) + return + } } + } +} - topLevel := prevElem.levels[i] +// Empty returns if the Skiplist is empty. +func (s *Skiplist) Empty() bool { + return s.findLast() == nil +} - //to skip same prevHeader's next and fill next elem into temp element - for i--; i >= 0 && prevElem.levels[i] == topLevel; i-- { - prevElemHeaders[i] = prevElem +// findLast returns the last element. If head (empty list), we return nil. All the find functions +// will NEVER return the head nodes. +func (s *Skiplist) findLast() *node { + n := s.getHead() + level := int(s.getHeight()) - 1 + for { + next := s.getNext(n, level) + if next != nil { + n = next + continue } + if level == 0 { + if n == s.getHead() { + return nil + } + return n + } + level-- } +} - if elem == nil { - return nil +// Get gets the value associated with the key. It returns a valid value if it finds equal or earlier +// version of the same key. +func (s *Skiplist) Search(key []byte) ValueStruct { + n, _ := s.findNear(key, false, true) // findGreaterOrEqual. + if n == nil { + return ValueStruct{} } - prevTopLevel := len(elem.levels) - for i := 0; i < prevTopLevel; i++ { - prevElemHeaders[i].levels[i] = elem.levels[i] + nextKey := s.arena.getKey(n.keyOffset, n.keySize) + if !SameKey(key, nextKey) { + return ValueStruct{} } - list.length-- - return nil -}*/ + valOffset, valSize := n.getValueOffset() + vs := s.arena.getVal(valOffset, valSize) + vs.ExpiresAt = ParseTs(nextKey) + return vs +} -func (list *SkipList) Close() error { - return nil +// NewIterator returns a skiplist iterator. You have to Close() the iterator. 
+func (s *Skiplist) NewSkipListIterator() Iterator { + s.IncrRef() + return &SkipListIterator{list: s} } -func (list *SkipList) calcScore(key []byte) (score float64) { - var hash uint64 - l := len(key) +// MemSize returns the size of the Skiplist in terms of how much memory is used within its internal +// arena. +func (s *Skiplist) MemSize() int64 { return s.arena.size() } - if l > 8 { - l = 8 - } +// Iterator is an iterator over skiplist object. For new objects, you just +// need to initialize Iterator.list. +type SkipListIterator struct { + list *Skiplist + n *node +} + +func (s *SkipListIterator) Rewind() { + s.SeekToFirst() +} - for i := 0; i < l; i++ { - shift := uint(64 - 8 - i*8) - hash |= uint64(key[i]) << shift +func (s *SkipListIterator) Item() Item { + return &Entry{ + Key: s.Key(), + Value: s.Value().Value, + ExpiresAt: s.Value().ExpiresAt, + Meta: s.Value().Meta, + Version: s.Value().Version, } +} - score = float64(hash) - return +// Close frees the resources held by the iterator +func (s *SkipListIterator) Close() error { + s.list.DecrRef() + return nil } -func (list *SkipList) compare(score float64, key []byte, next *Element) int { - if score == next.score { - return bytes.Compare(key, next.entry.Key) - } +// Valid returns true iff the iterator is positioned at a valid node. +func (s *SkipListIterator) Valid() bool { return s.n != nil } - if score < next.score { - return -1 - } else { - return 1 - } +// Key returns the key at the current position. +func (s *SkipListIterator) Key() []byte { + //implement me here } -func (list *SkipList) randLevel() int { - if list.maxLevel <= 1 { - return 1 - } - i := 1 - for ; i < list.maxLevel; i++ { - if RandN(1000)%2 == 0 { - return i - } - } - return i +// Value returns value. +func (s *SkipListIterator) Value() ValueStruct { + //implement me here } -func (list *SkipList) Size() int64 { - return list.size +// ValueUint64 returns the uint64 value of the current node. 
+func (s *SkipListIterator) ValueUint64() uint64 { + return s.n.value } -type SkipListIter struct { - header *Element - elem *Element - lock sync.RWMutex +// Next advances to the next position. +func (s *SkipListIterator) Next() { + AssertTrue(s.Valid()) + s.n = s.list.getNext(s.n, 0) } -func (list *SkipList) NewSkipListIterator() iterator.Iterator { - return &SkipListIter{elem: list.header.levels[0], header: list.header} +// Prev advances to the previous position. +func (s *SkipListIterator) Prev() { + AssertTrue(s.Valid()) + s.n, _ = s.list.findNear(s.Key(), true, false) // find <. No equality allowed. } -func (iter *SkipListIter) Next() { - iter.lock.RLock() - defer iter.lock.RUnlock() - if iter.elem != nil { - iter.elem = iter.elem.levels[0] - } +// 找到 >= target 的第一个节点 +func (s *SkipListIterator) Seek(target []byte) { + //implement me here } -func (iter *SkipListIter) Valid() bool { - return iter.elem != nil + +// 找到 <= target 的第一个节点 +func (s *SkipListIterator) SeekForPrev(target []byte) { + //implement me here } -func (iter *SkipListIter) Rewind() { - iter.elem = iter.header + +//定位到链表的第一个节点 +func (s *SkipListIterator) SeekToFirst() { + //implement me here } -func (iter *SkipListIter) Item() iterator.Item { - return iter.elem + +// SeekToLast seeks position at the last entry in list. +// Final state of iterator is Valid() iff list is not empty. +func (s *SkipListIterator) SeekToLast() { + s.n = s.list.findLast() } -func (iter *SkipListIter) Close() error { - return nil + +// UniIterator is a unidirectional memtable iterator. It is a thin wrapper around +// Iterator. We like to keep Iterator as before, because it is more powerful and +// we might support bidirectional iterators in the future. +type UniIterator struct { + iter *Iterator + reversed bool +} + +// FastRand is a fast thread local random function. +//go:linkname FastRand runtime.fastrand +func FastRand() uint32 + +// AssertTruef is AssertTrue with extra info. 
+func AssertTruef(b bool, format string, args ...interface{}) { + if !b { + log.Fatalf("%+v", errors.Errorf(format, args...)) + } } diff --git a/utils/skiplist_test.go b/utils/skiplist_test.go index 89014c8..481fd1a 100644 --- a/utils/skiplist_test.go +++ b/utils/skiplist_test.go @@ -1,12 +1,26 @@ +// Copyright 2021 hardcore-os Project Authors +// +// Licensed under the Apache License, Version 2.0 (the "License") +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + package utils import ( "fmt" - "github.com/hardcore-os/corekv/utils/codec" - "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/require" "sync" "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" ) func RandString(len int) string { @@ -18,79 +32,55 @@ func RandString(len int) string { return string(bytes) } -func TestSkipList_compare(t *testing.T) { - list := SkipList{ - header: nil, - rand: nil, - maxLevel: 0, - length: 0, - } - - byte1 := []byte("1") - byte2 := []byte("2") - entry1 := codec.NewEntry(byte1, byte1) - - byte1score := list.calcScore(byte1) - byte2score := list.calcScore(byte2) - - elem := &Element{ - levels: nil, - entry: entry1, - score: byte2score, - } - - assert.Equal(t, list.compare(byte1score, byte1, elem), -1) -} - func TestSkipListBasicCRUD(t *testing.T) { - list := NewSkipList() + list := NewSkiplist(1000) //Put & Get - entry1 := codec.NewEntry([]byte("Key1"), []byte("Val1")) - assert.Nil(t, list.Add(entry1)) - assert.Equal(t, entry1.Value, list.Search(entry1.Key).Value) + entry1 := 
NewEntry([]byte(RandString(10)), []byte("Val1")) + list.Add(entry1) + vs := list.Search(entry1.Key) + assert.Equal(t, entry1.Value, vs.Value) - entry2 := codec.NewEntry([]byte("Key2"), []byte("Val2")) - assert.Nil(t, list.Add(entry2)) - assert.Equal(t, entry2.Value, list.Search(entry2.Key).Value) + entry2 := NewEntry([]byte(RandString(10)), []byte("Val2")) + list.Add(entry2) + vs = list.Search(entry2.Key) + assert.Equal(t, entry2.Value, vs.Value) //Get a not exist entry - assert.Nil(t, list.Search([]byte("noexist"))) + assert.Nil(t, list.Search([]byte(RandString(10))).Value) //Update a entry - entry2_new := codec.NewEntry([]byte("Key1"), []byte("Val1+1")) - assert.Nil(t, list.Add(entry2_new)) + entry2_new := NewEntry([]byte(RandString(10)), []byte("Val1+1")) + list.Add(entry2_new) assert.Equal(t, entry2_new.Value, list.Search(entry2_new.Key).Value) } func Benchmark_SkipListBasicCRUD(b *testing.B) { - list := NewSkipList() + list := NewSkiplist(100000000) key, val := "", "" - maxTime := 1000000 + maxTime := 1000 for i := 0; i < maxTime; i++ { //number := rand.Intn(10000) - key, val = fmt.Sprintf("Key%d", i), fmt.Sprintf("Val%d", i) - entry := codec.NewEntry([]byte(key), []byte(val)) - res := list.Add(entry) - assert.Equal(b, res, nil) + key, val = RandString(10), fmt.Sprintf("Val%d", i) + entry := NewEntry([]byte(key), []byte(val)) + list.Add(entry) searchVal := list.Search([]byte(key)) assert.Equal(b, searchVal.Value, []byte(val)) - } } func TestConcurrentBasic(t *testing.T) { const n = 1000 - l := NewSkipList() + l := NewSkiplist(100000000) var wg sync.WaitGroup key := func(i int) []byte { - return []byte(fmt.Sprintf("%05d", i)) + return []byte(fmt.Sprintf("Keykeykey%05d", i)) } for i := 0; i < n; i++ { wg.Add(1) go func(i int) { defer wg.Done() - assert.Nil(t, l.Add(codec.NewEntry(key(i), key(i)))) + l.Add(NewEntry(key(i), key(i))) }(i) } wg.Wait() @@ -101,10 +91,9 @@ func TestConcurrentBasic(t *testing.T) { go func(i int) { defer wg.Done() v := l.Search(key(i)) 
- if v != nil { - require.EqualValues(t, key(i), v.Value) - return - } + require.EqualValues(t, key(i), v.Value) + return + require.Nil(t, v) }(i) } @@ -113,16 +102,16 @@ func TestConcurrentBasic(t *testing.T) { func Benchmark_ConcurrentBasic(b *testing.B) { const n = 1000 - l := NewSkipList() + l := NewSkiplist(100000000) var wg sync.WaitGroup key := func(i int) []byte { - return []byte(fmt.Sprintf("%05d", i)) + return []byte(fmt.Sprintf("keykeykey%05d", i)) } for i := 0; i < n; i++ { wg.Add(1) go func(i int) { defer wg.Done() - assert.Nil(b, l.Add(codec.NewEntry(key(i), key(i)))) + l.Add(NewEntry(key(i), key(i))) }(i) } wg.Wait() @@ -133,12 +122,32 @@ func Benchmark_ConcurrentBasic(b *testing.B) { go func(i int) { defer wg.Done() v := l.Search(key(i)) - if v != nil { - require.EqualValues(b, key(i), v.Value) - return - } + require.EqualValues(b, key(i), v.Value) require.Nil(b, v) }(i) } wg.Wait() } + +func TestSkipListIterator(t *testing.T) { + list := NewSkiplist(100000) + + //Put & Get + entry1 := NewEntry([]byte(RandString(10)), []byte(RandString(10))) + list.Add(entry1) + assert.Equal(t, entry1.Value, list.Search(entry1.Key).Value) + + entry2 := NewEntry([]byte(RandString(10)), []byte(RandString(10))) + list.Add(entry2) + assert.Equal(t, entry2.Value, list.Search(entry2.Key).Value) + + //Update a entry + entry2_new := NewEntry([]byte(RandString(10)), []byte(RandString(10))) + list.Add(entry2_new) + assert.Equal(t, entry2_new.Value, list.Search(entry2_new.Key).Value) + + iter := list.NewSkipListIterator() + for iter.Rewind(); iter.Valid(); iter.Next() { + fmt.Printf("iter key %s, value %s", iter.Item().Entry().Key, iter.Item().Entry().Value) + } +} diff --git a/utils/slice.go b/utils/slice.go new file mode 100644 index 0000000..c453ae6 --- /dev/null +++ b/utils/slice.go @@ -0,0 +1,7 @@ +package utils + +// Slice holds a reusable buf, will reallocate if you request a larger size than ever before. 
+// One problem is with n distinct sizes in random order it'll reallocate log(n) times. +type Slice struct { + buf []byte +} diff --git a/utils/throttle.go b/utils/throttle.go new file mode 100644 index 0000000..c311408 --- /dev/null +++ b/utils/throttle.go @@ -0,0 +1,85 @@ +// Copyright 2021 hardcore-os Project Authors +// +// Licensed under the Apache License, Version 2.0 (the "License") +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +package utils + +import "sync" + +// Throttle allows a limited number of workers to run at a time. It also +// provides a mechanism to check for errors encountered by workers and wait for +// them to finish. +type Throttle struct { + once sync.Once + wg sync.WaitGroup + ch chan struct{} + errCh chan error + finishErr error +} + +// NewThrottle creates a new throttle with a max number of workers. +func NewThrottle(max int) *Throttle { + return &Throttle{ + ch: make(chan struct{}, max), + errCh: make(chan error, max), + } +} + +// Do should be called by workers before they start working. It blocks if there +// are already maximum number of workers working. If it detects an error from +// previously Done workers, it would return it. +func (t *Throttle) Do() error { + for { + select { + case t.ch <- struct{}{}: + t.wg.Add(1) + return nil + case err := <-t.errCh: + if err != nil { + return err + } + } + } +} + +// Done should be called by workers when they finish working. They can also +// pass the error status of work done. 
+func (t *Throttle) Done(err error) { + if err != nil { + t.errCh <- err + } + select { + case <-t.ch: + default: + panic("Throttle Do Done mismatch") + } + t.wg.Done() +} + +// Finish waits until all workers have finished working. It would return any error passed by Done. +// If Finish is called multiple time, it will wait for workers to finish only once(first time). +// From next calls, it will return same error as found on first call. +func (t *Throttle) Finish() error { + t.once.Do(func() { + t.wg.Wait() + close(t.ch) + close(t.errCh) + for err := range t.errCh { + if err != nil { + t.finishErr = err + return + } + } + }) + + return t.finishErr +} diff --git a/utils/tools.go b/utils/tools.go index 8efd417..8d68d6f 100644 --- a/utils/tools.go +++ b/utils/tools.go @@ -1,5 +1,26 @@ +// Copyright 2021 bardcckre-os Project Authors +// +// Licensed under the Apache License, Version 2.0 (the "License") +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + package utils func ValueSize(value []byte) int64 { return 0 } + +// Copy copies a byte slice and returns the copied slice. +func Copy(a []byte) []byte { + b := make([]byte, len(a)) + copy(b, a) + return b +} diff --git a/utils/value.go b/utils/value.go new file mode 100644 index 0000000..38ab0f9 --- /dev/null +++ b/utils/value.go @@ -0,0 +1,159 @@ +// Copyright 2021 hardcore-os Project Authors +// +// Licensed under the Apache License, Version 2.0 (the "License") +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package utils + +import ( + "encoding/binary" + "reflect" + "time" + "unsafe" +) + +const ( + // size of vlog header. + // +----------------+------------------+ + // | keyID(8 bytes) | baseIV(12 bytes)| + // +----------------+------------------+ + ValueLogHeaderSize = 20 + vptrSize = unsafe.Sizeof(ValuePtr{}) +) + +type ValuePtr struct { + Len uint32 + Offset uint32 + Fid uint32 +} + +func (p ValuePtr) Less(o *ValuePtr) bool { + if o == nil { + return false + } + if p.Fid != o.Fid { + return p.Fid < o.Fid + } + if p.Offset != o.Offset { + return p.Offset < o.Offset + } + return p.Len < o.Len +} + +func (p ValuePtr) IsZero() bool { + return p.Fid == 0 && p.Offset == 0 && p.Len == 0 +} + +// Encode encodes Pointer into byte buffer. +func (p ValuePtr) Encode() []byte { + b := make([]byte, vptrSize) + // Copy over the content from p to b. + *(*ValuePtr)(unsafe.Pointer(&b[0])) = p + return b +} + +// Decode decodes the value pointer into the provided byte buffer. +func (p *ValuePtr) Decode(b []byte) { + // Copy over data from b into p. Using *p=unsafe.pointer(...) 
leads to + copy(((*[vptrSize]byte)(unsafe.Pointer(p))[:]), b[:vptrSize]) +} +func IsValuePtr(e *Entry) bool { + return e.Meta&BitValuePointer > 0 +} + +// BytesToU32 converts the given byte slice to uint32 +func BytesToU32(b []byte) uint32 { + return binary.BigEndian.Uint32(b) +} + +// BytesToU64 _ +func BytesToU64(b []byte) uint64 { + return binary.BigEndian.Uint64(b) +} + +// U32SliceToBytes converts the given Uint32 slice to byte slice +func U32SliceToBytes(u32s []uint32) []byte { + if len(u32s) == 0 { + return nil + } + var b []byte + hdr := (*reflect.SliceHeader)(unsafe.Pointer(&b)) + hdr.Len = len(u32s) * 4 + hdr.Cap = hdr.Len + hdr.Data = uintptr(unsafe.Pointer(&u32s[0])) + return b +} + +// U32ToBytes converts the given Uint32 to bytes +func U32ToBytes(v uint32) []byte { + var uBuf [4]byte + binary.BigEndian.PutUint32(uBuf[:], v) + return uBuf[:] +} + +// U64ToBytes converts the given Uint64 to bytes +func U64ToBytes(v uint64) []byte { + var uBuf [8]byte + binary.BigEndian.PutUint64(uBuf[:], v) + return uBuf[:] +} + +// BytesToU32Slice converts the given byte slice to uint32 slice +func BytesToU32Slice(b []byte) []uint32 { + if len(b) == 0 { + return nil + } + var u32s []uint32 + hdr := (*reflect.SliceHeader)(unsafe.Pointer(&u32s)) + hdr.Len = len(b) / 4 + hdr.Cap = hdr.Len + hdr.Data = uintptr(unsafe.Pointer(&b[0])) + return u32s +} + +// ValuePtrCodec _ +func ValuePtrCodec(vp *ValuePtr) []byte { + return []byte{} +} + +// RunCallback _ +func RunCallback(cb func()) { + if cb != nil { + cb() + } +} + +func IsDeletedOrExpired(meta byte, expiresAt uint64) bool { + if meta&BitDelete > 0 { + return true + } + if expiresAt == 0 { + return false + } + return expiresAt <= uint64(time.Now().Unix()) +} + +func DiscardEntry(e, vs *Entry) bool { + // TODO 版本这个信息应该被弱化掉 在后面上MVCC或者多版本查询的时候再考虑 + // if vs.Version != ParseTs(e.Key) { + // // Version not found. Discard. 
+ // return true + // } + if IsDeletedOrExpired(vs.Meta, vs.ExpiresAt) { + return true + } + if (vs.Meta & BitValuePointer) == 0 { + // Key also stores the value in LSM. Discard. + return true + } + return false +} diff --git a/utils/wal.go b/utils/wal.go new file mode 100644 index 0000000..a31daff --- /dev/null +++ b/utils/wal.go @@ -0,0 +1,155 @@ +// Copyright 2021 logicrec Project Authors +// +// Licensed under the Apache License, Version 2.0 (the "License") +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package utils + +import ( + "bytes" + "encoding/binary" + "hash" + "hash/crc32" + "io" +) + +// LogEntry +type LogEntry func(e *Entry, vp *ValuePtr) error + +type WalHeader struct { + KeyLen uint32 + ValueLen uint32 + Meta byte + ExpiresAt uint64 +} + +const maxHeaderSize int = 21 + +func (h WalHeader) Encode(out []byte) int { + index := 0 + index = binary.PutUvarint(out[index:], uint64(h.KeyLen)) + index += binary.PutUvarint(out[index:], uint64(h.ValueLen)) + index += binary.PutUvarint(out[index:], uint64(h.Meta)) + index += binary.PutUvarint(out[index:], h.ExpiresAt) + return index +} + +func (h *WalHeader) Decode(reader *HashReader) (int, error) { + var err error + + klen, err := binary.ReadUvarint(reader) + if err != nil { + return 0, err + } + h.KeyLen = uint32(klen) + + vlen, err := binary.ReadUvarint(reader) + if err != nil { + return 0, err + } + h.ValueLen = uint32(vlen) + + meta, err := binary.ReadUvarint(reader) + if err != nil { + return 0, err + } + h.Meta = byte(meta) + h.ExpiresAt, err = 
binary.ReadUvarint(reader) + if err != nil { + return 0, err + } + return reader.BytesRead, nil +} + +// WalCodec 写入wal文件的编码 +// | header | key | value | crc32 | +func WalCodec(buf *bytes.Buffer, e *Entry) int { + buf.Reset() + h := WalHeader{ + KeyLen: uint32(len(e.Key)), + ValueLen: uint32(len(e.Value)), + ExpiresAt: e.ExpiresAt, + } + + hash := crc32.New(CastagnoliCrcTable) + writer := io.MultiWriter(buf, hash) + + // encode header. + var headerEnc [maxHeaderSize]byte + sz := h.Encode(headerEnc[:]) + Panic2(writer.Write(headerEnc[:sz])) + Panic2(writer.Write(e.Key)) + Panic2(writer.Write(e.Value)) + // write crc32 hash. + var crcBuf [crc32.Size]byte + binary.BigEndian.PutUint32(crcBuf[:], hash.Sum32()) + Panic2(buf.Write(crcBuf[:])) + // return encoded length. + return len(headerEnc[:sz]) + len(e.Key) + len(e.Value) + len(crcBuf) +} + +// EstimateWalCodecSize 预估当前kv 写入wal文件占用的空间大小 +func EstimateWalCodecSize(e *Entry) int { + return len(e.Key) + len(e.Value) + 8 /* ExpiresAt uint64 */ + + crc32.Size + maxHeaderSize +} + +type HashReader struct { + R io.Reader + H hash.Hash32 + BytesRead int // Number of bytes read. +} + +func NewHashReader(r io.Reader) *HashReader { + hash := crc32.New(CastagnoliCrcTable) + return &HashReader{ + R: r, + H: hash, + } +} + +// Read reads len(p) bytes from the reader. Returns the number of bytes read, error on failure. +func (t *HashReader) Read(p []byte) (int, error) { + n, err := t.R.Read(p) + if err != nil { + return n, err + } + t.BytesRead += n + return t.H.Write(p[:n]) +} + +// ReadByte reads exactly one byte from the reader. Returns error on failure. +func (t *HashReader) ReadByte() (byte, error) { + b := make([]byte, 1) + _, err := t.Read(b) + return b[0], err +} + +// Sum32 returns the sum32 of the underlying hash. 
+func (t *HashReader) Sum32() uint32 { + return t.H.Sum32() +} + +// IsZero _ +func (e *Entry) IsZero() bool { + return len(e.Key) == 0 +} + +// LogHeaderLen _ +func (e *Entry) LogHeaderLen() int { + return e.Hlen +} + +// LogOffset _ +func (e *Entry) LogOffset() uint32 { + return e.Offset +} diff --git a/vlog.go b/vlog.go new file mode 100644 index 0000000..4a0a2a7 --- /dev/null +++ b/vlog.go @@ -0,0 +1,1270 @@ +// Copyright 2021 hardcore-os Project Authors +// +// Licensed under the Apache License, Version 2.0 (the "License") +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package corekv + +import ( + "bufio" + "bytes" + "encoding/json" + "fmt" + "hash/crc32" + "io" + "io/ioutil" + "math" + "math/rand" + "os" + "sort" + "strconv" + "strings" + "sync" + "sync/atomic" + "time" + + "github.com/hardcore-os/corekv/file" + "github.com/hardcore-os/corekv/utils" + "github.com/pkg/errors" +) + +const discardStatsFlushThreshold = 100 + +var lfDiscardStatsKey = []byte("!corekv!discard") // For storing lfDiscardStats + +// valueLog +type valueLog struct { + dirPath string + + // guards our view of which files exist, which to be deleted, how many active iterators + filesLock sync.RWMutex + filesMap map[uint32]*file.LogFile + maxFid uint32 + filesToBeDeleted []uint32 + // A refcount of iterators -- when this hits zero, we can delete the filesToBeDeleted. + numActiveIterators int32 + + db *DB + writableLogOffset uint32 // read by read, written by write. Must access via atomics. 
+ numEntriesWritten uint32 + opt Options + + garbageCh chan struct{} + lfDiscardStats *lfDiscardStats +} + +func (vlog *valueLog) newValuePtr(e *utils.Entry) (*utils.ValuePtr, error) { + // TODO 尝试使用对象复用,后面entry对象也应该使用 + req := requestPool.Get().(*request) + req.reset() + req.Entries = []*utils.Entry{e} + req.Wg.Add(1) + req.IncrRef() // for db write + defer req.DecrRef() + err := vlog.write([]*request{req}) + return req.Ptrs[0], err +} +func (vlog *valueLog) open(db *DB, ptr *utils.ValuePtr, replayFn utils.LogEntry) error { + vlog.lfDiscardStats.closer.Add(1) + go vlog.flushDiscardStats() + if err := vlog.populateFilesMap(); err != nil { + return err + } + // If no files are found, then create a new file. + if len(vlog.filesMap) == 0 { + _, err := vlog.createVlogFile(0) + return utils.WarpErr("Error while creating log file in valueLog.open", err) + } + fids := vlog.sortedFids() + for _, fid := range fids { + lf, ok := vlog.filesMap[fid] + utils.CondPanic(!ok, fmt.Errorf("vlog.filesMap[fid] fid not found")) + var err error + if err = lf.Open( + &file.Options{ + FID: uint64(fid), + FileName: vlog.fpath(fid), + Dir: vlog.dirPath, + Path: vlog.dirPath, + MaxSz: 2 * vlog.db.opt.ValueLogFileSize, + }); err != nil { + return errors.Wrapf(err, "Open existing file: %q", lf.FileName()) + } + var offset uint32 + // 从head处开始重放vlog日志,而不是从第一条日志 + // head 相当于一个快照 + if fid == ptr.Fid { + offset = ptr.Offset + ptr.Len + } + fmt.Printf("Replaying file id: %d at offset: %d\n", fid, offset) + now := time.Now() + // 重放日志 + if err := vlog.replayLog(lf, offset, replayFn); err != nil { + // Log file is corrupted. Delete it. + if err == utils.ErrDeleteVlogFile { + delete(vlog.filesMap, fid) + // Close the fd of the file before deleting the file otherwise windows complaints. 
+ if err := lf.Close(); err != nil { + return errors.Wrapf(err, "failed to close vlog file %s", lf.FileName()) + } + path := vlog.fpath(lf.FID) + if err := os.Remove(path); err != nil { + return errors.Wrapf(err, "failed to delete empty value log file: %q", path) + } + continue + } + return err + } + fmt.Printf("Replay took: %s\n", time.Since(now)) + + if fid < vlog.maxFid { + // This file has been replayed. It can now be mmapped. + // For maxFid, the mmap would be done by the specially written code below. + if err := lf.Init(); err != nil { + return err + } + } + } + // Seek to the end to start writing. + last, ok := vlog.filesMap[vlog.maxFid] + utils.CondPanic(!ok, errors.New("vlog.filesMap[vlog.maxFid] not found")) + lastOffset, err := last.Seek(0, io.SeekEnd) + if err != nil { + return errors.Wrapf(err, fmt.Sprintf("file.Seek to end path:[%s]", last.FileName())) + } + vlog.writableLogOffset = uint32(lastOffset) + + // head的设计起到check point的作用 + vlog.db.vhead = &utils.ValuePtr{Fid: vlog.maxFid, Offset: uint32(lastOffset)} + if err := vlog.populateDiscardStats(); err != nil { + fmt.Errorf("Failed to populate discard stats: %s\n", err) + } + return nil +} + +// Read reads the value log at a given location. +// TODO: Make this read private. +func (vlog *valueLog) read(vp *utils.ValuePtr) ([]byte, func(), error) { + buf, lf, err := vlog.readValueBytes(vp) + // log file is locked so, decide whether to lock immediately or let the caller to + // unlock it, after caller uses it. + cb := vlog.getUnlockCallback(lf) + if err != nil { + return nil, cb, err + } + + if vlog.opt.VerifyValueChecksum { + hash := crc32.New(utils.CastagnoliCrcTable) + if _, err := hash.Write(buf[:len(buf)-crc32.Size]); err != nil { + utils.RunCallback(cb) + return nil, nil, errors.Wrapf(err, "failed to write hash for vp %+v", vp) + } + // Fetch checksum from the end of the buffer. 
+ checksum := buf[len(buf)-crc32.Size:] + if hash.Sum32() != utils.BytesToU32(checksum) { + utils.RunCallback(cb) + return nil, nil, errors.Wrapf(utils.ErrChecksumMismatch, "value corrupted for vp: %+v", vp) + } + } + var h utils.Header + headerLen := h.Decode(buf) + kv := buf[headerLen:] + if uint32(len(kv)) < h.KLen+h.VLen { + fmt.Errorf("Invalid read: vp: %+v\n", vp) + return nil, nil, errors.Errorf("Invalid read: Len: %d read at:[%d:%d]", + len(kv), h.KLen, h.KLen+h.VLen) + } + return kv[h.KLen : h.KLen+h.VLen], cb, nil +} + +// write 并不是并发安全的 +func (vlog *valueLog) write(reqs []*request) error { + // 需要检查是否能够正确写入 + if err := vlog.validateWrites(reqs); err != nil { + return err + } + + vlog.filesLock.RLock() + maxFid := vlog.maxFid + curlf := vlog.filesMap[maxFid] + vlog.filesLock.RUnlock() + + var buf bytes.Buffer + flushWrites := func() error { + if buf.Len() == 0 { + return nil + } + data := buf.Bytes() + offset := vlog.woffset() + if err := curlf.Write(offset, data); err != nil { + return errors.Wrapf(err, "Unable to write to value log file: %q", curlf.FileName()) + } + buf.Reset() + atomic.AddUint32(&vlog.writableLogOffset, uint32(len(data))) + curlf.AddSize(vlog.writableLogOffset) + return nil + } + toDisk := func() error { + if err := flushWrites(); err != nil { + return err + } + // 切分vlog文件 + if vlog.woffset() > uint32(vlog.opt.ValueLogFileSize) || + vlog.numEntriesWritten > vlog.opt.ValueLogMaxEntries { + if err := curlf.DoneWriting(vlog.woffset()); err != nil { + return err + } + + newid := atomic.AddUint32(&vlog.maxFid, 1) + utils.CondPanic(newid <= 0, fmt.Errorf("newid has overflown uint32: %v", newid)) + newlf, err := vlog.createVlogFile(newid) + if err != nil { + return err + } + curlf = newlf + atomic.AddInt32(&vlog.db.logRotates, 1) + } + return nil + } + for i := range reqs { + b := reqs[i] + b.Ptrs = b.Ptrs[:0] + var written int + for j := range b.Entries { + e := b.Entries[j] + if vlog.db.shouldWriteValueToLSM(e) { + b.Ptrs = append(b.Ptrs, 
&utils.ValuePtr{}) + continue + } + var p utils.ValuePtr + + p.Fid = curlf.FID + // Use the offset including buffer length so far. + p.Offset = vlog.woffset() + uint32(buf.Len()) + plen, err := curlf.EncodeEntry(e, &buf, p.Offset) // Now encode the entry into buffer. + if err != nil { + return err + } + p.Len = uint32(plen) + b.Ptrs = append(b.Ptrs, &p) + written++ + + if buf.Len() > vlog.db.opt.ValueLogFileSize { + if err := flushWrites(); err != nil { + return err + } + } + } + vlog.numEntriesWritten += uint32(written) + // We write to disk here so that all entries that are part of the same transaction are + // written to the same vlog file. + writeNow := + vlog.woffset()+uint32(buf.Len()) > uint32(vlog.opt.ValueLogFileSize) || + vlog.numEntriesWritten > uint32(vlog.opt.ValueLogMaxEntries) + if writeNow { + if err := toDisk(); err != nil { + return err + } + } + } + return toDisk() +} + +func (vlog *valueLog) close() error { + if vlog == nil || vlog.db == nil { + return nil + } + // close flushDiscardStats. + <-vlog.lfDiscardStats.closer.CloseSignal + var err error + for id, f := range vlog.filesMap { + f.Lock.Lock() // We won’t release the lock. + maxFid := vlog.maxFid + // TODO(ibrahim) - Do we need the following truncations on non-windows + // platforms? We expand the file only on windows and the vlog.woffset() + // should point to end of file on all other platforms. + if id == maxFid { + // truncate writable log file to correct offset. + if truncErr := f.Truncate(int64(vlog.woffset())); truncErr != nil && err == nil { + err = truncErr + } + } + if closeErr := f.Close(); closeErr != nil && err == nil { + err = closeErr + } + f.Lock.Unlock() + } + return err +} + +func (vlog *valueLog) runGC(discardRatio float64, head *utils.ValuePtr) error { + select { + case vlog.garbageCh <- struct{}{}: + // Pick a log file for GC. 
+ defer func() { + // 通过一个channel来控制一次仅运行一个GC任务 + <-vlog.garbageCh + }() + + var err error + files := vlog.pickLog(head) + if len(files) == 0 { + return utils.ErrNoRewrite + } + tried := make(map[uint32]bool) + for _, lf := range files { + //消重一下,防止随机策略和统计策略返回同一个fid + if _, done := tried[lf.FID]; done { + continue + } + tried[lf.FID] = true + if err = vlog.doRunGC(lf, discardRatio); err == nil { + return nil + } + } + return err + default: + return utils.ErrRejected + } +} + +func (vlog *valueLog) doRunGC(lf *file.LogFile, discardRatio float64) (err error) { + // 退出的时候把统计的discard清空 + defer func() { + if err == nil { + vlog.lfDiscardStats.Lock() + delete(vlog.lfDiscardStats.m, lf.FID) + vlog.lfDiscardStats.Unlock() + } + }() + s := &sampler{ + lf: lf, + countRatio: 0.01, // 1% of num entries. + sizeRatio: 0.1, // 10% of the file as window. + fromBeginning: false, + } + + if _, err = vlog.sample(s, discardRatio); err != nil { + return err + } + + if err = vlog.rewrite(lf); err != nil { + return err + } + return nil +} + +//重写 +func (vlog *valueLog) rewrite(f *file.LogFile) error { + vlog.filesLock.RLock() + maxFid := vlog.maxFid + vlog.filesLock.RUnlock() + utils.CondPanic(uint32(f.FID) >= maxFid, fmt.Errorf("fid to move: %d. Current max fid: %d", f.FID, maxFid)) + + wb := make([]*utils.Entry, 0, 1000) + var size int64 + + var count, moved int + fe := func(e *utils.Entry) error { + count++ + if count%100000 == 0 { + fmt.Printf("Processing entry %d\n", count) + } + + vs, err := vlog.db.lsm.Get(e.Key) + if err != nil { + return err + } + if utils.DiscardEntry(e, vs) { + return nil + } + + if len(vs.Value) == 0 { + return errors.Errorf("Empty value: %+v", vs) + } + var vp utils.ValuePtr + vp.Decode(vs.Value) + + if vp.Fid > f.FID { + return nil + } + if vp.Offset > e.Offset { + return nil + } + // 如果从lsm和vlog的同一个位置读取带entry则重新写回,也有可能读取到旧的 + if vp.Fid == f.FID && vp.Offset == e.Offset { + moved++ + // This new entry only contains the key, and a pointer to the value. 
+ ne := new(utils.Entry) + ne.Meta = 0 // Remove all bits. Different keyspace doesn't need these bits. + ne.ExpiresAt = e.ExpiresAt + ne.Key = append([]byte{}, e.Key...) + ne.Value = append([]byte{}, e.Value...) + es := int64(ne.EstimateSize(vlog.db.opt.ValueLogFileSize)) + // Consider size of value as well while considering the total size + // of the batch. There have been reports of high memory usage in + // rewrite because we don't consider the value size. See #1292. + es += int64(len(e.Value)) + + // Ensure length and size of wb is within transaction limits. + if int64(len(wb)+1) >= vlog.opt.MaxBatchCount || + size+es >= vlog.opt.MaxBatchSize { + if err := vlog.db.batchSet(wb); err != nil { + return err + } + size = 0 + wb = wb[:0] + } + wb = append(wb, ne) + size += es + } + return nil + } + + _, err := vlog.iterate(f, 0, func(e *utils.Entry, vp *utils.ValuePtr) error { + return fe(e) + }) + if err != nil { + return err + } + + batchSize := 1024 + var loops int + for i := 0; i < len(wb); { + loops++ + if batchSize == 0 { + return utils.ErrNoRewrite + } + end := i + batchSize + if end > len(wb) { + end = len(wb) + } + if err := vlog.db.batchSet(wb[i:end]); err != nil { + if err == utils.ErrTxnTooBig { + // Decrease the batch size to half. + batchSize = batchSize / 2 + continue + } + return err + } + i += batchSize + } + var deleteFileNow bool + // Entries written to LSM. Remove the older file now. + { + vlog.filesLock.Lock() + // Just a sanity-check. 
+ if _, ok := vlog.filesMap[f.FID]; !ok { + vlog.filesLock.Unlock() + return errors.Errorf("Unable to find fid: %d", f.FID) + } + if vlog.iteratorCount() == 0 { + delete(vlog.filesMap, f.FID) + //deleteFileNow = true + } else { + vlog.filesToBeDeleted = append(vlog.filesToBeDeleted, f.FID) + } + vlog.filesLock.Unlock() + } + + if deleteFileNow { + if err := vlog.deleteLogFile(f); err != nil { + return err + } + } + + return nil +} + +func (vlog *valueLog) iteratorCount() int { + return int(atomic.LoadInt32(&vlog.numActiveIterators)) +} + +// TODO 在迭代器close时,需要调用此函数,关闭已经被判定需要移除的logfile +func (vlog *valueLog) decrIteratorCount() error { + num := atomic.AddInt32(&vlog.numActiveIterators, -1) + if num != 0 { + return nil + } + + vlog.filesLock.Lock() + lfs := make([]*file.LogFile, 0, len(vlog.filesToBeDeleted)) + for _, id := range vlog.filesToBeDeleted { + lfs = append(lfs, vlog.filesMap[id]) + delete(vlog.filesMap, id) + } + vlog.filesToBeDeleted = nil + vlog.filesLock.Unlock() + + for _, lf := range lfs { + if err := vlog.deleteLogFile(lf); err != nil { + return err + } + } + return nil +} + +func (vlog *valueLog) deleteLogFile(lf *file.LogFile) error { + if lf == nil { + return nil + } + lf.Lock.Lock() + defer lf.Lock.Unlock() + utils.Err(lf.Close()) + return os.Remove(lf.FileName()) +} + +// validateWrites 可以检查当前的req是否能写入vlog日志,一个vlog日志最大4GB +func (vlog *valueLog) validateWrites(reqs []*request) error { + vlogOffset := uint64(vlog.woffset()) + for _, req := range reqs { + // calculate size of the request. + size := estimateRequestSize(req) + estimatedVlogOffset := vlogOffset + size + if estimatedVlogOffset > uint64(utils.MaxVlogFileSize) { + return errors.Errorf("Request size offset %d is bigger than maximum offset %d", + estimatedVlogOffset, utils.MaxVlogFileSize) + } + + if estimatedVlogOffset >= uint64(vlog.opt.ValueLogFileSize) { + // We'll create a new vlog file if the estimated offset is greater or equal to + // max vlog size. So, resetting the vlogOffset. 
+ vlogOffset = 0 + continue + } + // Estimated vlog offset will become current vlog offset if the vlog is not rotated. + vlogOffset = estimatedVlogOffset + } + return nil +} + +// estimateRequestSize returns the size that needed to be written for the given request. +func estimateRequestSize(req *request) uint64 { + size := uint64(0) + for _, e := range req.Entries { + size += uint64(utils.MaxHeaderSize + len(e.Key) + len(e.Value) + crc32.Size) + } + return size +} + +// getUnlockCallback will returns a function which unlock the logfile if the logfile is mmaped. +// otherwise, it unlock the logfile and return nil. +func (vlog *valueLog) getUnlockCallback(lf *file.LogFile) func() { + if lf == nil { + return nil + } + return lf.Lock.RUnlock +} + +// readValueBytes return vlog entry slice and read locked log file. Caller should take care of +// logFile unlocking. +func (vlog *valueLog) readValueBytes(vp *utils.ValuePtr) ([]byte, *file.LogFile, error) { + lf, err := vlog.getFileRLocked(vp) + if err != nil { + return nil, nil, err + } + + buf, err := lf.Read(vp) + return buf, lf, err +} + +// Gets the logFile and acquires and RLock() for the mmap. You must call RUnlock on the file +// (if non-nil) +func (vlog *valueLog) getFileRLocked(vp *utils.ValuePtr) (*file.LogFile, error) { + vlog.filesLock.RLock() + defer vlog.filesLock.RUnlock() + ret, ok := vlog.filesMap[vp.Fid] + if !ok { + // log file has gone away, we can't do anything. Return. + return nil, errors.Errorf("file with ID: %d not found", vp.Fid) + } + + // Check for valid offset if we are reading from writable log. 
+ maxFid := vlog.maxFid + if vp.Fid == maxFid { + currentOffset := vlog.woffset() + if vp.Offset >= currentOffset { + return nil, errors.Errorf( + "Invalid value pointer offset: %d greater than current offset: %d", + vp.Offset, currentOffset) + } + } + + ret.Lock.RLock() + return ret, nil +} + +func (vlog *valueLog) woffset() uint32 { + return atomic.LoadUint32(&vlog.writableLogOffset) +} + +func (vlog *valueLog) populateFilesMap() error { + vlog.filesMap = make(map[uint32]*file.LogFile) + + files, err := ioutil.ReadDir(vlog.dirPath) + if err != nil { + return utils.WarpErr(fmt.Sprintf("Unable to open log dir. path[%s]", vlog.dirPath), err) + } + + found := make(map[uint64]struct{}) + for _, f := range files { + if !strings.HasSuffix(f.Name(), ".vlog") { + continue + } + fsz := len(f.Name()) + fid, err := strconv.ParseUint(f.Name()[:fsz-5], 10, 32) + if err != nil { + return utils.WarpErr(fmt.Sprintf("Unable to parse log id. name:[%s]", f.Name()), err) + } + if _, ok := found[fid]; ok { + return utils.WarpErr(fmt.Sprintf("Duplicate file found. Please delete one. 
name:[%s]", f.Name()), errors.New("duplicate fid")) // err is nil here; supply a real error so the duplicate actually aborts the scan
+		}
+		found[fid] = struct{}{}
+
+		lf := &file.LogFile{
+			FID:  uint32(fid),
+			Lock: sync.RWMutex{},
+		}
+		vlog.filesMap[uint32(fid)] = lf
+		if vlog.maxFid < uint32(fid) {
+			vlog.maxFid = uint32(fid)
+		}
+	}
+	return nil
+}
+
+func (vlog *valueLog) createVlogFile(fid uint32) (*file.LogFile, error) {
+	path := vlog.fpath(fid)
+
+	lf := &file.LogFile{
+		FID:  fid,
+		Lock: sync.RWMutex{},
+	}
+
+	var err error
+	utils.Panic2(nil, lf.Open(&file.Options{
+		FID:      uint64(fid),
+		FileName: path,
+		Dir:      vlog.dirPath,
+		Path:     vlog.dirPath,
+		MaxSz:    2 * vlog.db.opt.ValueLogFileSize,
+	}))
+
+	removeFile := func() {
+		// 如果处理出错 则直接删除文件
+		utils.Err(os.Remove(lf.FileName()))
+	}
+
+	if err = lf.Bootstrap(); err != nil {
+		removeFile()
+		return nil, err
+	}
+
+	if err = utils.SyncDir(vlog.dirPath); err != nil {
+		removeFile()
+		return nil, utils.WarpErr(fmt.Sprintf("Sync value log dir[%s]", vlog.dirPath), err)
+	}
+	vlog.filesLock.Lock()
+	vlog.filesMap[fid] = lf
+	vlog.maxFid = fid
+	// 现在header才是0
+	atomic.StoreUint32(&vlog.writableLogOffset, utils.VlogHeaderSize)
+	vlog.numEntriesWritten = 0
+	vlog.filesLock.Unlock()
+	return lf, nil
+}
+
+// sortedFids returns the file id's not pending deletion, sorted. Assumes we have shared access to
+// filesMap.
+func (vlog *valueLog) sortedFids() []uint32 {
+	toBeDeleted := make(map[uint32]struct{})
+	for _, fid := range vlog.filesToBeDeleted {
+		toBeDeleted[fid] = struct{}{}
+	}
+	ret := make([]uint32, 0, len(vlog.filesMap))
+	for fid := range vlog.filesMap {
+		if _, ok := toBeDeleted[fid]; !ok {
+			ret = append(ret, fid)
+		}
+	}
+	sort.Slice(ret, func(i, j int) bool {
+		return ret[i] < ret[j]
+	})
+	return ret
+}
+
+func (vlog *valueLog) replayLog(lf *file.LogFile, offset uint32, replayFn utils.LogEntry) error {
+	// Alright, let's iterate now.
+ endOffset, err := vlog.iterate(lf, offset, replayFn) + if err != nil { + return errors.Wrapf(err, "Unable to replay logfile:[%s]", lf.FileName()) + } + if int64(endOffset) == int64(lf.Size()) { + return nil + } + + // TODO: 如果vlog日志损坏怎么办? 当前默认是截断损坏的数据 + + // The entire file should be truncated (i.e. it should be deleted). + // If fid == maxFid then it's okay to truncate the entire file since it will be + // used for future additions. Also, it's okay if the last file has size zero. + // We mmap 2*opt.ValueLogSize for the last file. See vlog.Open() function + // if endOffset <= vlogHeaderSize && lf.fid != vlog.maxFid { + + if endOffset <= utils.VlogHeaderSize { + if lf.FID != vlog.maxFid { + return utils.ErrDeleteVlogFile + } + return lf.Bootstrap() + } + + fmt.Printf("Truncating vlog file %s to offset: %d\n", lf.FileName(), endOffset) + if err := lf.Truncate(int64(endOffset)); err != nil { + return utils.WarpErr( + fmt.Sprintf("Truncation needed at offset %d. Can be done manually as well.", endOffset), err) + } + return nil +} + +// iterate iterates over log file. It doesn't not allocate new memory for every kv pair. +// Therefore, the kv pair is only valid for the duration of fn call. +func (vlog *valueLog) iterate(lf *file.LogFile, offset uint32, fn utils.LogEntry) (uint32, error) { + if offset == 0 { + offset = utils.VlogHeaderSize + } + if int64(offset) == int64(lf.Size()) { + // We're at the end of the file already. No need to do anything. + return offset, nil + } + + // We're not at the end of the file. Let's Seek to the offset and start reading. 
+ if _, err := lf.Seek(int64(offset), io.SeekStart); err != nil { + return 0, errors.Wrapf(err, "Unable to seek, name:%s", lf.FileName()) + } + + reader := bufio.NewReader(lf.FD()) + read := &safeRead{ + k: make([]byte, 10), + v: make([]byte, 10), + recordOffset: offset, + lf: lf, + } + + var validEndOffset uint32 = offset + +loop: + for { + e, err := read.Entry(reader) + switch { + case err == io.EOF: + break loop + case err == io.ErrUnexpectedEOF || err == utils.ErrTruncate: + break loop + case err != nil: + return 0, err + case e == nil: + continue + } + + var vp utils.ValuePtr + vp.Len = uint32(int(e.Hlen) + len(e.Key) + len(e.Value) + crc32.Size) + read.recordOffset += vp.Len + + vp.Offset = e.Offset + vp.Fid = lf.FID + validEndOffset = read.recordOffset + if err := fn(e, &vp); err != nil { + if err == utils.ErrStop { + break + } + return 0, utils.WarpErr(fmt.Sprintf("Iteration function %s", lf.FileName()), err) + } + } + return validEndOffset, nil +} + +// 这个对象用来重放日志 +type safeRead struct { + k []byte + v []byte + recordOffset uint32 + lf *file.LogFile +} + +// Entry reads an entry from the provided reader. It also validates the checksum for every entry +// read. Returns error on failure. +func (r *safeRead) Entry(reader io.Reader) (*utils.Entry, error) { + tee := utils.NewHashReader(reader) + var h utils.Header + hlen, err := h.DecodeFrom(tee) + if err != nil { + return nil, err + } + if h.KLen > uint32(1<<16) { // Key length must be below uint16. 
+ return nil, utils.ErrTruncate + } + kl := int(h.KLen) + if cap(r.k) < kl { + r.k = make([]byte, 2*kl) + } + vl := int(h.VLen) + if cap(r.v) < vl { + r.v = make([]byte, 2*vl) + } + + e := &utils.Entry{} + e.Offset = r.recordOffset + e.Hlen = hlen + buf := make([]byte, h.KLen+h.VLen) + if _, err := io.ReadFull(tee, buf[:]); err != nil { + if err == io.EOF { + err = utils.ErrTruncate + } + return nil, err + } + + e.Key = buf[:h.KLen] + e.Value = buf[h.KLen:] + var crcBuf [crc32.Size]byte + if _, err := io.ReadFull(reader, crcBuf[:]); err != nil { + if err == io.EOF { + err = utils.ErrTruncate + } + return nil, err + } + crc := utils.BytesToU32(crcBuf[:]) + if crc != tee.Sum32() { + return nil, utils.ErrTruncate + } + e.Meta = h.Meta + e.ExpiresAt = h.ExpiresAt + return e, nil +} + +// 统计脏数据 +func (vlog *valueLog) populateDiscardStats() error { + key := utils.KeyWithTs(lfDiscardStatsKey, math.MaxUint64) + var statsMap map[uint32]int64 + vs, err := vlog.db.Get(key) + if err != nil { + return err + } + // Value doesn't exist. + if vs.Meta == 0 && len(vs.Value) == 0 { + return nil + } + val := vs.Value + // Entry is not stored in the LSM tree. + if utils.IsValuePtr(vs) { + var vp utils.ValuePtr + vp.Decode(val) + // Read entry from the value log. + result, cb, err := vlog.read(&vp) + // Copy it before we release the read lock. 
+ val = utils.SafeCopy(nil, result) + utils.RunCallback(cb) + if err != nil { + return err + } + } + if len(val) == 0 { + return nil + } + if err := json.Unmarshal(val, &statsMap); err != nil { + return errors.Wrapf(err, "failed to unmarshal discard stats") + } + fmt.Printf("Value Log Discard stats: %v\n", statsMap) + vlog.lfDiscardStats.flushChan <- statsMap + return nil +} + +func (vlog *valueLog) fpath(fid uint32) string { + return utils.VlogFilePath(vlog.dirPath, fid) +} + +// initVLog +func (db *DB) initVLog() { + vp, _ := db.getHead() + vlog := &valueLog{ + dirPath: db.opt.WorkDir, + filesToBeDeleted: make([]uint32, 0), + lfDiscardStats: &lfDiscardStats{ + m: make(map[uint32]int64), + closer: utils.NewCloser(), + flushChan: make(chan map[uint32]int64, 16), + }, + } + vlog.db = db + vlog.opt = *db.opt + vlog.garbageCh = make(chan struct{}, 1) + if err := vlog.open(db, vp, db.replayFunction()); err != nil { + utils.Panic(err) + } + db.vlog = vlog +} + +// getHead prints all the head pointer in the DB and return the max value. +func (db *DB) getHead() (*utils.ValuePtr, uint64) { + var vptr utils.ValuePtr + return &vptr, 0 +} +func (db *DB) replayFunction() func(*utils.Entry, *utils.ValuePtr) error { + toLSM := func(k []byte, vs utils.ValueStruct) { + db.lsm.Set(&utils.Entry{ + Key: k, + Value: vs.Value, + ExpiresAt: vs.ExpiresAt, + Meta: vs.Meta, + }) + } + + return func(e *utils.Entry, vp *utils.ValuePtr) error { // Function for replaying. + nk := make([]byte, len(e.Key)) + copy(nk, e.Key) + var nv []byte + meta := e.Meta + if db.shouldWriteValueToLSM(e) { + nv = make([]byte, len(e.Value)) + copy(nv, e.Value) + } else { + nv = vp.Encode() + meta = meta | utils.BitValuePointer + } + // Update vhead. If the crash happens while replay was in progess + // and the head is not updated, we will end up replaying all the + // files starting from file zero, again. 
+		db.updateHead([]*utils.ValuePtr{vp})
+
+		v := utils.ValueStruct{
+			Value:     nv,
+			Meta:      meta,
+			ExpiresAt: e.ExpiresAt,
+		}
+		// This entry is from a rewrite or via SetEntryAt(..).
+		toLSM(nk, v)
+		return nil
+	}
+}
+
+// updateHead should not be called without the db.Lock() since db.vhead is used
+// by the writer go routines and memtable flushing goroutine.
+func (db *DB) updateHead(ptrs []*utils.ValuePtr) {
+	var ptr *utils.ValuePtr
+	for i := len(ptrs) - 1; i >= 0; i-- {
+		p := ptrs[i]
+		if !p.IsZero() {
+			ptr = p
+			break
+		}
+	}
+	if ptr == nil || ptr.IsZero() { // ptr stays nil when every candidate is zero; guard the nil deref
+		return
+	}
+
+	utils.CondPanic(ptr.Less(db.vhead), fmt.Errorf("ptr.Less(db.vhead) is true"))
+	db.vhead = ptr
+}
+
+// sync 同步一下,刷盘
+func (vlog *valueLog) sync(fid uint32) error {
+
+	vlog.filesLock.RLock()
+	maxFid := vlog.maxFid
+	// During replay it is possible to get sync call with fid less than maxFid.
+	// Because older file has already been synced, we can return from here.
+	if fid < maxFid || len(vlog.filesMap) == 0 {
+		vlog.filesLock.RUnlock()
+		return nil
+	}
+	curlf := vlog.filesMap[maxFid]
+	// Sometimes it is possible that vlog.maxFid has been increased but file creation
+	// with same id is still in progress and this function is called. In those cases
+	// entry for the file might not be present in vlog.filesMap.
+	if curlf == nil {
+		vlog.filesLock.RUnlock()
+		return nil
+	}
+	curlf.Lock.RLock()
+	vlog.filesLock.RUnlock()
+
+	err := curlf.Sync()
+	curlf.Lock.RUnlock()
+	return err
+}
+
+// Set
+func (v *valueLog) set(entry *utils.Entry) error {
+	return nil
+}
+
+func (v *valueLog) get(entry *utils.Entry) (*utils.Entry, error) {
+	// valuePtr := utils.ValuePtrDecode(entry.Value)
+	return nil, nil
+}
+
+// lfDiscardStats 记录丢弃key的数据
+// lfDiscardStats keeps track of the amount of data that could be discarded for
+// a given logfile.
+type lfDiscardStats struct { + sync.RWMutex + m map[uint32]int64 + flushChan chan map[uint32]int64 + closer *utils.Closer + updatesSinceFlush int +} + +func (vlog *valueLog) flushDiscardStats() { + defer vlog.lfDiscardStats.closer.Done() + + mergeStats := func(stats map[uint32]int64) ([]byte, error) { + vlog.lfDiscardStats.Lock() + defer vlog.lfDiscardStats.Unlock() + for fid, count := range stats { + vlog.lfDiscardStats.m[fid] += count + vlog.lfDiscardStats.updatesSinceFlush++ + } + + if vlog.lfDiscardStats.updatesSinceFlush > discardStatsFlushThreshold { + encodedDS, err := json.Marshal(vlog.lfDiscardStats.m) + if err != nil { + return nil, err + } + vlog.lfDiscardStats.updatesSinceFlush = 0 + return encodedDS, nil + } + return nil, nil + } + + process := func(stats map[uint32]int64) error { + encodedDS, err := mergeStats(stats) + if err != nil || encodedDS == nil { + return err + } + + entries := []*utils.Entry{{ + Key: utils.KeyWithTs(lfDiscardStatsKey, 1), + Value: encodedDS, + }} + req, err := vlog.db.sendToWriteCh(entries) + // No special handling of ErrBlockedWrites is required as err is just logged in + // for loop below. + if err != nil { + return errors.Wrapf(err, "failed to push discard stats to write channel") + } + return req.Wait() + } + + closer := vlog.lfDiscardStats.closer + for { + select { + case <-closer.CloseSignal: + // For simplicity just return without processing already present in stats in flushChan. 
+ return + case stats := <-vlog.lfDiscardStats.flushChan: + if err := process(stats); err != nil { + utils.Err(fmt.Errorf("unable to process discardstats with error: %s", err)) + } + } + } +} + +// 请求池 +var requestPool = sync.Pool{ + New: func() interface{} { + return new(request) + }, +} + +// request +type request struct { + // Input values + Entries []*utils.Entry + // Output values and wait group stuff below + Ptrs []*utils.ValuePtr + Wg sync.WaitGroup + Err error + ref int32 +} + +func (req *request) reset() { + req.Entries = req.Entries[:0] + req.Ptrs = req.Ptrs[:0] + req.Wg = sync.WaitGroup{} + req.Err = nil + req.ref = 0 +} + +// GC 部分 +// 选择需要gc的log文件 +func (vlog *valueLog) pickLog(head *utils.ValuePtr) (files []*file.LogFile) { + vlog.filesLock.RLock() + defer vlog.filesLock.RUnlock() + fids := vlog.sortedFids() + switch { + // 只有一个log文件那不需要进行GC了 + case len(fids) <= 1: + return nil + // fid 是0说明是初次启动,更不需要gc了 + // TODO 先不处理head + // case head.Fid == 0: + // return nil + } + + // 创建一个候选对象 + candidate := struct { + fid uint32 + discard int64 + }{math.MaxUint32, 0} + // 加锁遍历fids,选择小于等于head fid的列表中discard统计最大的那个log文件 + // discard 就是在compact过程中统计的可丢弃key的数量 + vlog.lfDiscardStats.RLock() + for _, fid := range fids { + if fid >= head.Fid { + break + } + if vlog.lfDiscardStats.m[fid] > candidate.discard { + candidate.fid = fid + candidate.discard = vlog.lfDiscardStats.m[fid] + } + } + vlog.lfDiscardStats.RUnlock() + + // 说明这是一个有效候选 + if candidate.fid != math.MaxUint32 { // Found a candidate + files = append(files, vlog.filesMap[candidate.fid]) + } + + // 再补充一种随机选择的fid,比如应对初次执行时discard的统计不充分的情况 + var idxHead int + for i, fid := range fids { + if fid == head.Fid { + idxHead = i + break + } + } + if idxHead == 0 { // Not found or first file + idxHead = 1 // 开始对 + } + idx := rand.Intn(idxHead) // Don’t include head.Fid. We pick a random file before it. + if idx > 0 { + idx = rand.Intn(idx + 1) // Another level of rand to favor smaller fids. 
+ } + files = append(files, vlog.filesMap[fids[idx]]) + return files +} + +//sampler 采样器 +type sampler struct { + lf *file.LogFile + sizeRatio float64 + countRatio float64 + fromBeginning bool +} + +func (vlog *valueLog) sample(samp *sampler, discardRatio float64) (*reason, error) { + sizePercent := samp.sizeRatio + countPercent := samp.countRatio + fileSize := samp.lf.Size() + // Set up the sampling winxdow sizes. + sizeWindow := float64(fileSize) * sizePercent + sizeWindowM := sizeWindow / (1 << 20) // in MBs. + countWindow := int(float64(vlog.opt.ValueLogMaxEntries) * countPercent) + + var skipFirstM float64 + var err error + // Skip data only if fromBeginning is set to false. Pick a random start point. + if !samp.fromBeginning { + // Pick a random start point for the log. + skipFirstM = float64(rand.Int63n(fileSize)) // Pick a random starting location. + skipFirstM -= sizeWindow // Avoid hitting EOF by moving back by window. + skipFirstM /= float64(utils.Mi) // Convert to MBs. + } + var skipped float64 + + var r reason + start := time.Now() + var numIterations int + // 重放遍历vlog文件 + _, err = vlog.iterate(samp.lf, 0, func(e *utils.Entry, vp *utils.ValuePtr) error { + numIterations++ + esz := float64(vp.Len) / (1 << 20) // in MBs. + if skipped < skipFirstM { + skipped += esz + return nil + } + // Sample until we reach the window sizes or exceed 10 seconds. + if r.count > countWindow { + return utils.ErrStop + } + if r.total > sizeWindowM { + return utils.ErrStop + } + if time.Since(start) > 10*time.Second { + return utils.ErrStop + } + r.total += esz + r.count++ + + entry, err := vlog.db.Get(e.Key) + if err != nil { + return err + } + if utils.DiscardEntry(e, entry) { + r.discard += esz + return nil + } + + // Value is still present in value log. + utils.CondPanic(len(entry.Value) <= 0, fmt.Errorf("len(entry.Value) <= 0")) + vp.Decode(entry.Value) + + if vp.Fid > samp.lf.FID { + // Value is present in a later log. Discard. 
+ r.discard += esz + return nil + } + if vp.Offset > e.Offset { + // Value is present in a later offset, but in the same log. + r.discard += esz + return nil + } + return nil + }) + + if err != nil { + return nil, err + } + fmt.Printf("Fid: %d. Skipped: %5.2fMB Num iterations: %d. Data status=%+v\n", + samp.lf.FID, skipped, numIterations, r) + // If we couldn't sample at least a 1000 KV pairs or at least 75% of the window size, + // and what we can discard is below the threshold, we should skip the rewrite. + if (r.count < countWindow && r.total < sizeWindowM*0.75) || r.discard < discardRatio*r.total { + fmt.Printf("Skipping GC on fid: %d", samp.lf.FID) + return nil, utils.ErrNoRewrite + } + return &r, nil +} +func (vlog *valueLog) waitOnGC(lc *utils.Closer) { + defer lc.Done() + + <-lc.CloseSignal // Wait for lc to be closed. + + // Block any GC in progress to finish, and don't allow any more writes to runGC by filling up + // the channel of size 1. + vlog.garbageCh <- struct{}{} +} + +type reason struct { + total float64 + discard float64 + count int +} diff --git a/vlog/gc.go b/vlog/gc.go deleted file mode 100644 index 96b50b6..0000000 --- a/vlog/gc.go +++ /dev/null @@ -1 +0,0 @@ -package vlog diff --git a/vlog/vlog.go b/vlog/vlog.go deleted file mode 100644 index 96c9da8..0000000 --- a/vlog/vlog.go +++ /dev/null @@ -1,47 +0,0 @@ -package vlog - -import ( - "github.com/hardcore-os/corekv/utils" - "github.com/hardcore-os/corekv/utils/codec" -) - -type Options struct { -} - -// VLog -type VLog struct { - closer *utils.Closer -} - -// Close 关闭资源 -func (v *VLog) Close() error { - return nil -} - -// NewVLog -func NewVLog(opt *Options) *VLog { - v := &VLog{} - v.closer = utils.NewCloser(1) - return v -} - -// StartGC -func (v *VLog) StartGC() { - defer v.closer.Done() - for { - select { - case <-v.closer.Wait(): - } - // gc logic... 
- } -} - -// Set -func (v *VLog) Set(entry *codec.Entry) error { - return nil -} - -func (v *VLog) Get(entry *codec.Entry) (*codec.Entry, error) { - // valuePtr := codec.ValuePtrDecode(entry.Value) - return nil, nil -} diff --git a/vlog_test.go b/vlog_test.go new file mode 100644 index 0000000..8dea322 --- /dev/null +++ b/vlog_test.go @@ -0,0 +1,158 @@ +// Copyright 2021 hardcore-os Project Authors +// +// Licensed under the Apache License, Version 2.0 (the "License") +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +package corekv + +import ( + "bytes" + "math/rand" + "os" + "testing" + + "github.com/hardcore-os/corekv/utils" + "github.com/stretchr/testify/require" +) + +var ( + // 初始化opt + opt = &Options{ + WorkDir: "./work_test", + SSTableMaxSz: 1 << 10, + MemTableSize: 1 << 10, + ValueLogFileSize: 1 << 20, + ValueThreshold: 0, + MaxBatchCount: 10, + MaxBatchSize: 1 << 20, + } +) + +func TestVlogBase(t *testing.T) { + // 清理目录 + clearDir() + // 打开DB + db := Open(opt) + defer db.Close() + log := db.vlog + var err error + // 创建一个简单的kv entry对象 + const val1 = "sampleval012345678901234567890123" + const val2 = "samplevalb012345678901234567890123" + require.True(t, int64(len(val1)) >= db.opt.ValueThreshold) + + e1 := &utils.Entry{ + Key: []byte("samplekey"), + Value: []byte(val1), + Meta: utils.BitValuePointer, + } + e2 := &utils.Entry{ + Key: []byte("samplekeyb"), + Value: []byte(val2), + Meta: utils.BitValuePointer, + } + + // 构建一个批量请求的request + b := new(request) + b.Entries = []*utils.Entry{e1, e2} + + // 直接写入vlog中 + 
log.write([]*request{b}) + require.Len(t, b.Ptrs, 2) + t.Logf("Pointer written: %+v %+v\n", b.Ptrs[0], b.Ptrs[1]) + + // 从vlog中使用 value ptr指针中查询写入的分段vlog文件 + buf1, lf1, err1 := log.readValueBytes(b.Ptrs[0]) + buf2, lf2, err2 := log.readValueBytes(b.Ptrs[1]) + require.NoError(t, err1) + require.NoError(t, err2) + // 关闭会调的锁 + defer utils.RunCallback(log.getUnlockCallback(lf1)) + defer utils.RunCallback((log.getUnlockCallback(lf2))) + e1, err = lf1.DecodeEntry(buf1, b.Ptrs[0].Offset) + require.NoError(t, err) + // 从vlog文件中通过指指针反序列化回 entry对象 + e2, err = lf1.DecodeEntry(buf2, b.Ptrs[1].Offset) + require.NoError(t, err) + + // 比较entry对象是否相等 + readEntries := []utils.Entry{*e1, *e2} + require.EqualValues(t, []utils.Entry{ + { + Key: []byte("samplekey"), + Value: []byte(val1), + Meta: utils.BitValuePointer, + Offset: b.Ptrs[0].Offset, + }, + { + Key: []byte("samplekeyb"), + Value: []byte(val2), + Meta: utils.BitValuePointer, + Offset: b.Ptrs[1].Offset, + }, + }, readEntries) +} + +func clearDir() { + _, err := os.Stat(opt.WorkDir) + if err == nil { + os.RemoveAll(opt.WorkDir) + } + os.Mkdir(opt.WorkDir, os.ModePerm) +} + +func TestValueGC(t *testing.T) { + clearDir() + opt.ValueLogFileSize = 1 << 20 + kv := Open(opt) + defer kv.Close() + sz := 32 << 10 + kvList := []*utils.Entry{} + for i := 0; i < 100; i++ { + e := newRandEntry(sz) + kvList = append(kvList, &utils.Entry{ + Key: e.Key, + Value: e.Value, + Meta: e.Meta, + ExpiresAt: e.ExpiresAt, + }) + require.NoError(t, kv.Set(e)) + } + kv.RunValueLogGC(0.9) + for _, e := range kvList { + item, err := kv.Get(e.Key) + require.NoError(t, err) + val := getItemValue(t, item) + require.NotNil(t, val) + require.True(t, bytes.Equal(item.Key, e.Key), "key not equal: e:%s, v:%s", e.Key, e.Key) + require.True(t, bytes.Equal(item.Value, e.Value), "value not equal: e:%s, v:%s", e.Value, e.Value) + } +} + +func newRandEntry(sz int) *utils.Entry { + v := make([]byte, sz) + rand.Read(v[:rand.Intn(sz)]) + e := utils.BuildEntry() + e.Value 
= v + return e +} +func getItemValue(t *testing.T, item *utils.Entry) (val []byte) { + t.Helper() + if item == nil { + return nil + } + var v []byte + v = append(v, item.Value...) + if v == nil { + return nil + } + return v +} diff --git a/work_test/00001.sst b/work_test/00001.sst deleted file mode 100644 index ba17448..0000000 --- a/work_test/00001.sst +++ /dev/null @@ -1,4 +0,0 @@ -{ - "idx": "hello,0", - "data": "world" -} \ No newline at end of file diff --git a/work_test/manifest b/work_test/manifest deleted file mode 100644 index f48d061..0000000 --- a/work_test/manifest +++ /dev/null @@ -1 +0,0 @@ -00001.sst \ No newline at end of file