Nova maintains per-CPU inode tables, and inode numbers are striped across the
tables (i.e., inos 0, n, 2n,... on cpu 0; inos 1, n + 1, 2n + 1, ... on cpu 1).

The inodes themselves live in a set of linked lists (one per CPU) of 2MB
blocks.  The last 8 bytes of each block points to the next block.  Pointers to
heads of these list live in PMEM block INODE_TABLE0_START and are replicated in
PMEM block INODE_TABLE1_START.  Additional space for inodes is allocated on
demand.

To allocate inodes, Nova maintains a per-cpu inuse_list in DRAM holds a RB
tree that holds ranges of unallocated inode numbers.

Signed-off-by: Steven Swanson <swan...@cs.ucsd.edu>
---
 fs/nova/inode.c | 1467 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/nova/inode.h |  389 +++++++++++++++
 2 files changed, 1856 insertions(+)
 create mode 100644 fs/nova/inode.c
 create mode 100644 fs/nova/inode.h

diff --git a/fs/nova/inode.c b/fs/nova/inode.c
new file mode 100644
index 000000000000..db001b7b5d4f
--- /dev/null
+++ b/fs/nova/inode.c
@@ -0,0 +1,1467 @@
+/*
+ * BRIEF DESCRIPTION
+ *
+ * Inode methods (allocate/free/read/write).
+ *
+ * Copyright 2015-2016 Regents of the University of California,
+ * UCSD Non-Volatile Systems Lab, Andiry Xu <jix...@cs.ucsd.edu>
+ * Copyright 2012-2013 Intel Corporation
+ * Copyright 2009-2011 Marco Stornelli <marco.storne...@gmail.com>
+ * Copyright 2003 Sony Corporation
+ * Copyright 2003 Matsushita Electric Industrial Co., Ltd.
+ * 2003-2004 (c) MontaVista Software, Inc. , Steve Longerbeam
+ * This file is licensed under the terms of the GNU General Public
+ * License version 2. This program is licensed "as is" without any
+ * warranty of any kind, whether express or implied.
+ */
+
+#include <linux/fs.h>
+#include <linux/aio.h>
+#include <linux/highuid.h>
+#include <linux/module.h>
+#include <linux/mpage.h>
+#include <linux/backing-dev.h>
+#include <linux/types.h>
+#include <linux/ratelimit.h>
+#include "nova.h"
+#include "inode.h"
+
+unsigned int blk_type_to_shift[NOVA_BLOCK_TYPE_MAX] = {12, 21, 30};
+uint32_t blk_type_to_size[NOVA_BLOCK_TYPE_MAX] = {0x1000, 0x200000, 
0x40000000};
+
+int nova_init_inode_inuse_list(struct super_block *sb)
+{
+       struct nova_sb_info *sbi = NOVA_SB(sb);
+       struct nova_range_node *range_node;
+       struct inode_map *inode_map;
+       unsigned long range_high;
+       int i;
+       int ret;
+
+       sbi->s_inodes_used_count = NOVA_NORMAL_INODE_START;
+
+       range_high = NOVA_NORMAL_INODE_START / sbi->cpus;
+       if (NOVA_NORMAL_INODE_START % sbi->cpus)
+               range_high++;
+
+       for (i = 0; i < sbi->cpus; i++) {
+               inode_map = &sbi->inode_maps[i];
+               range_node = nova_alloc_inode_node(sb);
+               if (range_node == NULL)
+                       /* FIXME: free allocated memories */
+                       return -ENOMEM;
+
+               range_node->range_low = 0;
+               range_node->range_high = range_high;
+               nova_update_range_node_checksum(range_node);
+               ret = nova_insert_inodetree(sbi, range_node, i);
+               if (ret) {
+                       nova_err(sb, "%s failed\n", __func__);
+                       nova_free_inode_node(sb, range_node);
+                       return ret;
+               }
+               inode_map->num_range_node_inode = 1;
+               inode_map->first_inode_range = range_node;
+       }
+
+       return 0;
+}
+
+static int nova_alloc_inode_table(struct super_block *sb,
+       struct nova_inode_info_header *sih, int version)
+{
+       struct nova_sb_info *sbi = NOVA_SB(sb);
+       struct inode_table *inode_table;
+       unsigned long blocknr;
+       u64 block;
+       int allocated;
+       int i;
+
+       for (i = 0; i < sbi->cpus; i++) {
+               inode_table = nova_get_inode_table(sb, version, i);
+               if (!inode_table)
+                       return -EINVAL;
+
+               /* Allocate replicate inodes from tail */
+               allocated = nova_new_log_blocks(sb, sih, &blocknr, 1,
+                               ALLOC_INIT_ZERO, i,
+                               version ? ALLOC_FROM_TAIL : ALLOC_FROM_HEAD);
+
+               nova_dbgv("%s: allocate log @ 0x%lx\n", __func__,
+                                                       blocknr);
+               if (allocated != 1 || blocknr == 0)
+                       return -ENOSPC;
+
+               block = nova_get_block_off(sb, blocknr, NOVA_BLOCK_TYPE_2M);
+               nova_memunlock_range(sb, inode_table, CACHELINE_SIZE);
+               inode_table->log_head = block;
+               nova_memlock_range(sb, inode_table, CACHELINE_SIZE);
+               nova_flush_buffer(inode_table, CACHELINE_SIZE, 0);
+       }
+
+       return 0;
+}
+
+int nova_init_inode_table(struct super_block *sb)
+{
+       struct nova_inode *pi = nova_get_inode_by_ino(sb, NOVA_INODETABLE_INO);
+       struct nova_inode_info_header sih;
+       int num_tables;
+       int ret = 0;
+       int i;
+
+       nova_memunlock_inode(sb, pi);
+       pi->i_mode = 0;
+       pi->i_uid = 0;
+       pi->i_gid = 0;
+       pi->i_links_count = cpu_to_le16(1);
+       pi->i_flags = 0;
+       pi->nova_ino = NOVA_INODETABLE_INO;
+
+       pi->i_blk_type = NOVA_BLOCK_TYPE_2M;
+       nova_memlock_inode(sb, pi);
+
+       sih.ino = NOVA_INODETABLE_INO;
+       sih.i_blk_type = NOVA_BLOCK_TYPE_2M;
+
+       num_tables = 1;
+       if (metadata_csum)
+               num_tables = 2;
+
+       for (i = 0; i < num_tables; i++) {
+               ret = nova_alloc_inode_table(sb, &sih, i);
+               if (ret)
+                       return ret;
+       }
+
+       PERSISTENT_BARRIER();
+       return ret;
+}
+
+inline int nova_insert_inodetree(struct nova_sb_info *sbi,
+       struct nova_range_node *new_node, int cpu)
+{
+       struct rb_root *tree;
+       int ret;
+
+       tree = &sbi->inode_maps[cpu].inode_inuse_tree;
+       ret = nova_insert_range_node(tree, new_node);
+       if (ret)
+               nova_dbg("ERROR: %s failed %d\n", __func__, ret);
+
+       return ret;
+}
+
+inline int nova_search_inodetree(struct nova_sb_info *sbi,
+       unsigned long ino, struct nova_range_node **ret_node)
+{
+       struct rb_root *tree;
+       unsigned long internal_ino;
+       int cpu;
+
+       cpu = ino % sbi->cpus;
+       tree = &sbi->inode_maps[cpu].inode_inuse_tree;
+       internal_ino = ino / sbi->cpus;
+       return nova_find_range_node(sbi, tree, internal_ino, ret_node);
+}
+
+/* Get the address in PMEM of an inode by inode number.  Allocate additional
+ * block to store additional inodes if necessary.
+ */
+int nova_get_inode_address(struct super_block *sb, u64 ino, int version,
+       u64 *pi_addr, int extendable, int extend_alternate)
+{
+       struct nova_sb_info *sbi = NOVA_SB(sb);
+       struct nova_inode_info_header sih;
+       struct inode_table *inode_table;
+       unsigned int data_bits;
+       unsigned int num_inodes_bits;
+       u64 curr;
+       unsigned int superpage_count;
+       u64 alternate_pi_addr = 0;
+       u64 internal_ino;
+       int cpuid;
+       int extended = 0;
+       unsigned int index;
+       unsigned int i = 0;
+       unsigned long blocknr;
+       unsigned long curr_addr;
+       int allocated;
+
+       if (ino < NOVA_NORMAL_INODE_START) {
+               *pi_addr = nova_get_reserved_inode_addr(sb, ino);
+               return 0;
+       }
+
+       sih.ino = NOVA_INODETABLE_INO;
+       sih.i_blk_type = NOVA_BLOCK_TYPE_2M;
+       data_bits = blk_type_to_shift[sih.i_blk_type];
+       num_inodes_bits = data_bits - NOVA_INODE_BITS;
+
+       cpuid = ino % sbi->cpus;
+       internal_ino = ino / sbi->cpus;
+
+       inode_table = nova_get_inode_table(sb, version, cpuid);
+       superpage_count = internal_ino >> num_inodes_bits;
+       index = internal_ino & ((1 << num_inodes_bits) - 1);
+
+       curr = inode_table->log_head;
+       if (curr == 0)
+               return -EINVAL;
+
+       for (i = 0; i < superpage_count; i++) {
+               if (curr == 0)
+                       return -EINVAL;
+
+               curr_addr = (unsigned long)nova_get_block(sb, curr);
+               /* Next page pointer in the last 8 bytes of the superpage */
+               curr_addr += nova_inode_blk_size(&sih) - 8;
+               curr = *(u64 *)(curr_addr);
+
+               if (curr == 0) {
+                       if (extendable == 0)
+                               return -EINVAL;
+
+                       extended = 1;
+
+                       allocated = nova_new_log_blocks(sb, &sih, &blocknr,
+                               1, ALLOC_INIT_ZERO, cpuid,
+                               version ? ALLOC_FROM_TAIL : ALLOC_FROM_HEAD);
+
+                       if (allocated != 1)
+                               return allocated;
+
+                       curr = nova_get_block_off(sb, blocknr,
+                                               NOVA_BLOCK_TYPE_2M);
+                       nova_memunlock_range(sb, (void *)curr_addr,
+                                               CACHELINE_SIZE);
+                       *(u64 *)(curr_addr) = curr;
+                       nova_memlock_range(sb, (void *)curr_addr,
+                                               CACHELINE_SIZE);
+                       nova_flush_buffer((void *)curr_addr,
+                                               NOVA_INODE_SIZE, 1);
+               }
+       }
+
+       /* Extend alternate inode table */
+       if (extended && extend_alternate && metadata_csum)
+               nova_get_inode_address(sb, ino, version + 1,
+                                       &alternate_pi_addr, extendable, 0);
+
+       *pi_addr = curr + index * NOVA_INODE_SIZE;
+
+       return 0;
+}
+
+int nova_get_alter_inode_address(struct super_block *sb, u64 ino,
+       u64 *alter_pi_addr)
+{
+       int ret;
+
+       if (metadata_csum == 0) {
+               nova_err(sb, "Access alter inode when replica inode 
disabled\n");
+               return 0;
+       }
+
+       if (ino < NOVA_NORMAL_INODE_START) {
+               *alter_pi_addr = nova_get_alter_reserved_inode_addr(sb, ino);
+       } else {
+               ret = nova_get_inode_address(sb, ino, 1, alter_pi_addr, 0, 0);
+               if (ret)
+                       return ret;
+       }
+
+       return 0;
+}
+
+int nova_delete_file_tree(struct super_block *sb,
+       struct nova_inode_info_header *sih, unsigned long start_blocknr,
+       unsigned long last_blocknr, bool delete_nvmm, bool delete_dead,
+       u64 epoch_id)
+{
+       struct nova_file_write_entry *entry;
+       struct nova_file_write_entry *entryc, entry_copy;
+       struct nova_file_write_entry *old_entry = NULL;
+       unsigned long pgoff = start_blocknr;
+       unsigned long old_pgoff = 0;
+       unsigned int num_free = 0;
+       int freed = 0;
+       void *ret;
+       timing_t delete_time;
+
+       NOVA_START_TIMING(delete_file_tree_t, delete_time);
+
+       entryc = (metadata_csum == 0) ? entry : &entry_copy;
+
+       /* Handle EOF blocks */
+       do {
+               entry = radix_tree_lookup(&sih->tree, pgoff);
+               if (entry) {
+                       ret = radix_tree_delete(&sih->tree, pgoff);
+                       BUG_ON(!ret || ret != entry);
+                       if (entry != old_entry) {
+                               if (old_entry && delete_nvmm) {
+                                       nova_free_old_entry(sb, sih,
+                                                       old_entry, old_pgoff,
+                                                       num_free, delete_dead,
+                                                       epoch_id);
+                                       freed += num_free;
+                               }
+
+                               old_entry = entry;
+                               old_pgoff = pgoff;
+                               num_free = 1;
+                       } else {
+                               num_free++;
+                       }
+                       pgoff++;
+               } else {
+                       /* We are finding a hole. Jump to the next entry. */
+                       entry = nova_find_next_entry(sb, sih, pgoff);
+                       if (!entry)
+                               break;
+
+                       if (metadata_csum == 0)
+                               entryc = entry;
+                       else if (!nova_verify_entry_csum(sb, entry, entryc))
+                               break;
+
+                       pgoff++;
+                       pgoff = pgoff > entryc->pgoff ? pgoff : entryc->pgoff;
+               }
+       } while (1);
+
+       if (old_entry && delete_nvmm) {
+               nova_free_old_entry(sb, sih, old_entry, old_pgoff,
+                                       num_free, delete_dead, epoch_id);
+               freed += num_free;
+       }
+
+       nova_dbgv("Inode %lu: delete file tree from pgoff %lu to %lu, %d blocks 
freed\n",
+                       sih->ino, start_blocknr, last_blocknr, freed);
+
+       NOVA_END_TIMING(delete_file_tree_t, delete_time);
+       return freed;
+}
+
+static int nova_free_dram_resource(struct super_block *sb,
+       struct nova_inode_info_header *sih)
+{
+       unsigned long last_blocknr;
+       int freed = 0;
+
+       if (!(S_ISREG(sih->i_mode)) && !(S_ISDIR(sih->i_mode)))
+               return 0;
+
+       if (S_ISREG(sih->i_mode)) {
+               last_blocknr = nova_get_last_blocknr(sb, sih);
+               freed = nova_delete_file_tree(sb, sih, 0,
+                                       last_blocknr, false, false, 0);
+       } else {
+               nova_delete_dir_tree(sb, sih);
+               freed = 1;
+       }
+
+       return freed;
+}
+
+static inline void check_eof_blocks(struct super_block *sb,
+       struct nova_inode *pi, struct inode *inode,
+       struct nova_inode_info_header *sih)
+{
+       if ((pi->i_flags & cpu_to_le32(NOVA_EOFBLOCKS_FL)) &&
+               (inode->i_size + sb->s_blocksize) > (sih->i_blocks
+                       << sb->s_blocksize_bits)) {
+               nova_memunlock_inode(sb, pi);
+               pi->i_flags &= cpu_to_le32(~NOVA_EOFBLOCKS_FL);
+               nova_update_inode_checksum(pi);
+               nova_update_alter_inode(sb, inode, pi);
+               nova_memlock_inode(sb, pi);
+       }
+}
+
+/*
+ * Free data blocks from inode in the range start <=> end
+ */
+static void nova_truncate_file_blocks(struct inode *inode, loff_t start,
+                                   loff_t end, u64 epoch_id)
+{
+       struct super_block *sb = inode->i_sb;
+       struct nova_inode *pi = nova_get_inode(sb, inode);
+       struct nova_inode_info *si = NOVA_I(inode);
+       struct nova_inode_info_header *sih = &si->header;
+       unsigned int data_bits = blk_type_to_shift[sih->i_blk_type];
+       unsigned long first_blocknr, last_blocknr;
+       int freed = 0;
+
+       inode->i_mtime = inode->i_ctime = current_time(inode);
+
+       nova_dbg_verbose("truncate: pi %p iblocks %lx %llx %llx %llx\n", pi,
+                        sih->i_blocks, start, end, pi->i_size);
+
+       first_blocknr = (start + (1UL << data_bits) - 1) >> data_bits;
+
+       if (end == 0)
+               return;
+       last_blocknr = (end - 1) >> data_bits;
+
+       if (first_blocknr > last_blocknr)
+               return;
+
+       freed = nova_delete_file_tree(sb, sih, first_blocknr,
+                               last_blocknr, true, false, epoch_id);
+
+       inode->i_blocks -= (freed * (1 << (data_bits -
+                               sb->s_blocksize_bits)));
+
+       sih->i_blocks = inode->i_blocks;
+       /* Check for the flag EOFBLOCKS is still valid after the set size */
+       check_eof_blocks(sb, pi, inode, sih);
+
+}
+
+/* search the radix tree to find hole or data
+ * in the specified range
+ * Input:
+ * first_blocknr: first block in the specified range
+ * last_blocknr: last_blocknr in the specified range
+ * @data_found: indicates whether data blocks were found
+ * @hole_found: indicates whether a hole was found
+ * hole: whether we are looking for a hole or data
+ */
+static int nova_lookup_hole_in_range(struct super_block *sb,
+       struct nova_inode_info_header *sih,
+       unsigned long first_blocknr, unsigned long last_blocknr,
+       int *data_found, int *hole_found, int hole)
+{
+       struct nova_file_write_entry *entry;
+       struct nova_file_write_entry *entryc, entry_copy;
+       unsigned long blocks = 0;
+       unsigned long pgoff, old_pgoff;
+
+       entryc = (metadata_csum == 0) ? entry : &entry_copy;
+
+       pgoff = first_blocknr;
+       while (pgoff <= last_blocknr) {
+               old_pgoff = pgoff;
+               entry = radix_tree_lookup(&sih->tree, pgoff);
+               if (entry) {
+                       *data_found = 1;
+                       if (!hole)
+                               goto done;
+                       pgoff++;
+               } else {
+                       *hole_found = 1;
+                       entry = nova_find_next_entry(sb, sih, pgoff);
+                       pgoff++;
+                       if (entry) {
+                               if (metadata_csum == 0)
+                                       entryc = entry;
+                               else if (!nova_verify_entry_csum(sb, entry,
+                                                               entryc))
+                                       goto done;
+
+                               pgoff = pgoff > entryc->pgoff ?
+                                       pgoff : entryc->pgoff;
+                               if (pgoff > last_blocknr)
+                                       pgoff = last_blocknr + 1;
+                       }
+               }
+
+               if (!*hole_found || !hole)
+                       blocks += pgoff - old_pgoff;
+       }
+done:
+       return blocks;
+}
+
+/* copy persistent state to struct inode */
+static int nova_read_inode(struct super_block *sb, struct inode *inode,
+       u64 pi_addr)
+{
+       struct nova_inode_info *si = NOVA_I(inode);
+       struct nova_inode *pi, fake_pi;
+       struct nova_inode_info_header *sih = &si->header;
+       int ret = -EIO;
+       unsigned long ino;
+
+       ret = nova_get_reference(sb, pi_addr, &fake_pi,
+                       (void **)&pi, sizeof(struct nova_inode));
+       if (ret) {
+               nova_dbg("%s: read pi @ 0x%llx failed\n",
+                               __func__, pi_addr);
+               goto bad_inode;
+       }
+
+       inode->i_mode = sih->i_mode;
+       i_uid_write(inode, le32_to_cpu(pi->i_uid));
+       i_gid_write(inode, le32_to_cpu(pi->i_gid));
+//     set_nlink(inode, le16_to_cpu(pi->i_links_count));
+       inode->i_generation = le32_to_cpu(pi->i_generation);
+       nova_set_inode_flags(inode, pi, le32_to_cpu(pi->i_flags));
+       ino = inode->i_ino;
+
+       /* check if the inode is active. */
+       if (inode->i_mode == 0 || pi->deleted == 1) {
+               /* this inode is deleted */
+               ret = -ESTALE;
+               goto bad_inode;
+       }
+
+       inode->i_blocks = sih->i_blocks;
+       inode->i_mapping->a_ops = &nova_aops_dax;
+
+       switch (inode->i_mode & S_IFMT) {
+       case S_IFREG:
+               inode->i_op = &nova_file_inode_operations;
+               if (inplace_data_updates && wprotect == 0)
+                       inode->i_fop = &nova_dax_file_operations;
+               else
+                       inode->i_fop = &nova_wrap_file_operations;
+               break;
+       case S_IFDIR:
+               inode->i_op = &nova_dir_inode_operations;
+               inode->i_fop = &nova_dir_operations;
+               break;
+       case S_IFLNK:
+               inode->i_op = &nova_symlink_inode_operations;
+               break;
+       default:
+               inode->i_op = &nova_special_inode_operations;
+               init_special_inode(inode, inode->i_mode,
+                                  le32_to_cpu(pi->dev.rdev));
+               break;
+       }
+
+       /* Update size and time after rebuild the tree */
+       inode->i_size = le64_to_cpu(sih->i_size);
+       inode->i_atime.tv_sec = (__s32)le32_to_cpu(pi->i_atime);
+       inode->i_ctime.tv_sec = (__s32)le32_to_cpu(pi->i_ctime);
+       inode->i_mtime.tv_sec = (__s32)le32_to_cpu(pi->i_mtime);
+       inode->i_atime.tv_nsec = inode->i_mtime.tv_nsec =
+                                        inode->i_ctime.tv_nsec = 0;
+       set_nlink(inode, le16_to_cpu(pi->i_links_count));
+       return 0;
+
+bad_inode:
+       make_bad_inode(inode);
+       return ret;
+}
+
+static void nova_get_inode_flags(struct inode *inode, struct nova_inode *pi)
+{
+       unsigned int flags = inode->i_flags;
+       unsigned int nova_flags = le32_to_cpu(pi->i_flags);
+
+       nova_flags &= ~(FS_SYNC_FL | FS_APPEND_FL | FS_IMMUTABLE_FL |
+                        FS_NOATIME_FL | FS_DIRSYNC_FL);
+       if (flags & S_SYNC)
+               nova_flags |= FS_SYNC_FL;
+       if (flags & S_APPEND)
+               nova_flags |= FS_APPEND_FL;
+       if (flags & S_IMMUTABLE)
+               nova_flags |= FS_IMMUTABLE_FL;
+       if (flags & S_NOATIME)
+               nova_flags |= FS_NOATIME_FL;
+       if (flags & S_DIRSYNC)
+               nova_flags |= FS_DIRSYNC_FL;
+
+       pi->i_flags = cpu_to_le32(nova_flags);
+}
+
+static void nova_init_inode(struct inode *inode, struct nova_inode *pi)
+{
+       pi->i_mode = cpu_to_le16(inode->i_mode);
+       pi->i_uid = cpu_to_le32(i_uid_read(inode));
+       pi->i_gid = cpu_to_le32(i_gid_read(inode));
+       pi->i_links_count = cpu_to_le16(inode->i_nlink);
+       pi->i_size = cpu_to_le64(inode->i_size);
+       pi->i_atime = cpu_to_le32(inode->i_atime.tv_sec);
+       pi->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec);
+       pi->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec);
+       pi->i_generation = cpu_to_le32(inode->i_generation);
+       pi->log_head = 0;
+       pi->log_tail = 0;
+       pi->alter_log_head = 0;
+       pi->alter_log_tail = 0;
+       pi->deleted = 0;
+       pi->delete_epoch_id = 0;
+       nova_get_inode_flags(inode, pi);
+
+       if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
+               pi->dev.rdev = cpu_to_le32(inode->i_rdev);
+}
+
+static int nova_alloc_unused_inode(struct super_block *sb, int cpuid,
+       unsigned long *ino)
+{
+       struct nova_sb_info *sbi = NOVA_SB(sb);
+       struct inode_map *inode_map;
+       struct nova_range_node *i, *next_i;
+       struct rb_node *temp, *next;
+       unsigned long next_range_low;
+       unsigned long new_ino;
+       unsigned long MAX_INODE = 1UL << 31;
+
+       inode_map = &sbi->inode_maps[cpuid];
+       i = inode_map->first_inode_range;
+       NOVA_ASSERT(i);
+       if (!nova_range_node_checksum_ok(i)) {
+               nova_dbg("%s: first node failed\n", __func__);
+               return -EIO;
+       }
+
+       temp = &i->node;
+       next = rb_next(temp);
+
+       if (!next) {
+               next_i = NULL;
+               next_range_low = MAX_INODE;
+       } else {
+               next_i = container_of(next, struct nova_range_node, node);
+               if (!nova_range_node_checksum_ok(next_i)) {
+                       nova_dbg("%s: second node failed\n", __func__);
+                       return -EIO;
+               }
+               next_range_low = next_i->range_low;
+       }
+
+       new_ino = i->range_high + 1;
+
+       if (next_i && new_ino == (next_range_low - 1)) {
+               /* Fill the gap completely */
+               i->range_high = next_i->range_high;
+               nova_update_range_node_checksum(i);
+               rb_erase(&next_i->node, &inode_map->inode_inuse_tree);
+               nova_free_inode_node(sb, next_i);
+               inode_map->num_range_node_inode--;
+       } else if (new_ino < (next_range_low - 1)) {
+               /* Aligns to left */
+               i->range_high = new_ino;
+               nova_update_range_node_checksum(i);
+       } else {
+               nova_dbg("%s: ERROR: new ino %lu, next low %lu\n", __func__,
+                       new_ino, next_range_low);
+               return -ENOSPC;
+       }
+
+       *ino = new_ino * sbi->cpus + cpuid;
+       sbi->s_inodes_used_count++;
+       inode_map->allocated++;
+
+       nova_dbg_verbose("Alloc ino %lu\n", *ino);
+       return 0;
+}
+
+static int nova_free_inuse_inode(struct super_block *sb, unsigned long ino)
+{
+       struct nova_sb_info *sbi = NOVA_SB(sb);
+       struct inode_map *inode_map;
+       struct nova_range_node *i = NULL;
+       struct nova_range_node *curr_node;
+       int found = 0;
+       int cpuid = ino % sbi->cpus;
+       unsigned long internal_ino = ino / sbi->cpus;
+       int ret = 0;
+
+       nova_dbg_verbose("Free inuse ino: %lu\n", ino);
+       inode_map = &sbi->inode_maps[cpuid];
+
+       mutex_lock(&inode_map->inode_table_mutex);
+       found = nova_search_inodetree(sbi, ino, &i);
+       if (!found) {
+               nova_dbg("%s ERROR: ino %lu not found\n", __func__, ino);
+               mutex_unlock(&inode_map->inode_table_mutex);
+               return -EINVAL;
+       }
+
+       if ((internal_ino == i->range_low) && (internal_ino == i->range_high)) {
+               /* fits entire node */
+               rb_erase(&i->node, &inode_map->inode_inuse_tree);
+               nova_free_inode_node(sb, i);
+               inode_map->num_range_node_inode--;
+               goto block_found;
+       }
+       if ((internal_ino == i->range_low) && (internal_ino < i->range_high)) {
+               /* Aligns left */
+               i->range_low = internal_ino + 1;
+               nova_update_range_node_checksum(i);
+               goto block_found;
+       }
+       if ((internal_ino > i->range_low) && (internal_ino == i->range_high)) {
+               /* Aligns right */
+               i->range_high = internal_ino - 1;
+               nova_update_range_node_checksum(i);
+               goto block_found;
+       }
+       if ((internal_ino > i->range_low) && (internal_ino < i->range_high)) {
+               /* Aligns somewhere in the middle */
+               curr_node = nova_alloc_inode_node(sb);
+               NOVA_ASSERT(curr_node);
+               if (curr_node == NULL) {
+                       /* returning without freeing the block */
+                       goto block_found;
+               }
+               curr_node->range_low = internal_ino + 1;
+               curr_node->range_high = i->range_high;
+               nova_update_range_node_checksum(curr_node);
+
+               i->range_high = internal_ino - 1;
+               nova_update_range_node_checksum(i);
+
+               ret = nova_insert_inodetree(sbi, curr_node, cpuid);
+               if (ret) {
+                       nova_free_inode_node(sb, curr_node);
+                       goto err;
+               }
+               inode_map->num_range_node_inode++;
+               goto block_found;
+       }
+
+err:
+       nova_error_mng(sb, "Unable to free inode %lu\n", ino);
+       nova_error_mng(sb, "Found inuse block %lu - %lu\n",
+                                i->range_low, i->range_high);
+       mutex_unlock(&inode_map->inode_table_mutex);
+       return ret;
+
+block_found:
+       sbi->s_inodes_used_count--;
+       inode_map->freed++;
+       mutex_unlock(&inode_map->inode_table_mutex);
+       return ret;
+}
+
+static int nova_free_inode(struct super_block *sb, struct nova_inode *pi,
+       struct nova_inode_info_header *sih)
+{
+       int err = 0;
+       timing_t free_time;
+
+       NOVA_START_TIMING(free_inode_t, free_time);
+
+       nova_free_inode_log(sb, pi, sih);
+
+       sih->log_pages = 0;
+       sih->i_mode = 0;
+       sih->pi_addr = 0;
+       sih->alter_pi_addr = 0;
+       sih->i_size = 0;
+       sih->i_blocks = 0;
+
+       err = nova_free_inuse_inode(sb, pi->nova_ino);
+
+       NOVA_END_TIMING(free_inode_t, free_time);
+       return err;
+}
+
+struct inode *nova_iget(struct super_block *sb, unsigned long ino)
+{
+       struct nova_inode_info *si;
+       struct inode *inode;
+       u64 pi_addr;
+       int err;
+
+       inode = iget_locked(sb, ino);
+       if (unlikely(!inode))
+               return ERR_PTR(-ENOMEM);
+       if (!(inode->i_state & I_NEW))
+               return inode;
+
+       si = NOVA_I(inode);
+
+       nova_dbgv("%s: inode %lu\n", __func__, ino);
+
+       err = nova_get_inode_address(sb, ino, 0, &pi_addr, 0, 0);
+       if (err) {
+               nova_dbg("%s: get inode %lu address failed %d\n",
+                        __func__, ino, err);
+               goto fail;
+       }
+
+       if (pi_addr == 0) {
+               nova_dbg("%s: failed to get pi_addr for inode %lu\n",
+                        __func__, ino);
+               err = -EACCES;
+               goto fail;
+       }
+
+       err = nova_rebuild_inode(sb, si, ino, pi_addr, 1);
+       if (err) {
+               nova_dbg("%s: failed to rebuild inode %lu\n", __func__, ino);
+               goto fail;
+       }
+
+       err = nova_read_inode(sb, inode, pi_addr);
+       if (unlikely(err)) {
+               nova_dbg("%s: failed to read inode %lu\n", __func__, ino);
+               goto fail;
+
+       }
+
+       inode->i_ino = ino;
+
+       unlock_new_inode(inode);
+       return inode;
+fail:
+       iget_failed(inode);
+       return ERR_PTR(err);
+}
+
+unsigned long nova_get_last_blocknr(struct super_block *sb,
+       struct nova_inode_info_header *sih)
+{
+       struct nova_inode *pi, fake_pi;
+       unsigned long last_blocknr;
+       unsigned int btype;
+       unsigned int data_bits;
+       int ret;
+
+       ret = nova_get_reference(sb, sih->pi_addr, &fake_pi,
+                       (void **)&pi, sizeof(struct nova_inode));
+       if (ret) {
+               nova_dbg("%s: read pi @ 0x%lx failed\n",
+                               __func__, sih->pi_addr);
+               btype = 0;
+       } else {
+               btype = sih->i_blk_type;
+       }
+
+       data_bits = blk_type_to_shift[btype];
+
+       if (sih->i_size == 0)
+               last_blocknr = 0;
+       else
+               last_blocknr = (sih->i_size - 1) >> data_bits;
+
+       return last_blocknr;
+}
+
+static int nova_free_inode_resource(struct super_block *sb,
+       struct nova_inode *pi, struct nova_inode_info_header *sih)
+{
+       unsigned long last_blocknr;
+       int ret = 0;
+       int freed = 0;
+       struct nova_inode *alter_pi;
+
+       nova_memunlock_inode(sb, pi);
+       pi->deleted = 1;
+
+       if (pi->valid) {
+               nova_dbg("%s: inode %lu still valid\n",
+                               __func__, sih->ino);
+               pi->valid = 0;
+       }
+       nova_update_inode_checksum(pi);
+       if (metadata_csum && sih->alter_pi_addr) {
+               alter_pi = (struct nova_inode *)nova_get_block(sb,
+                                               sih->alter_pi_addr);
+               memcpy_to_pmem_nocache(alter_pi, pi, sizeof(struct nova_inode));
+       }
+       nova_memlock_inode(sb, pi);
+
+       /* We need the log to free the blocks from the b-tree */
+       switch (sih->i_mode & S_IFMT) {
+       case S_IFREG:
+               last_blocknr = nova_get_last_blocknr(sb, sih);
+               nova_dbgv("%s: file ino %lu\n", __func__, sih->ino);
+               freed = nova_delete_file_tree(sb, sih, 0,
+                                       last_blocknr, true, true, 0);
+               break;
+       case S_IFDIR:
+               nova_dbgv("%s: dir ino %lu\n", __func__, sih->ino);
+               nova_delete_dir_tree(sb, sih);
+               break;
+       case S_IFLNK:
+               /* Log will be freed later */
+               nova_dbgv("%s: symlink ino %lu\n",
+                               __func__, sih->ino);
+               freed = nova_delete_file_tree(sb, sih, 0, 0,
+                                               true, true, 0);
+               break;
+       default:
+               nova_dbgv("%s: special ino %lu\n",
+                               __func__, sih->ino);
+               break;
+       }
+
+       nova_dbg_verbose("%s: Freed %d\n", __func__, freed);
+       /* Then we can free the inode */
+       ret = nova_free_inode(sb, pi, sih);
+       if (ret)
+               nova_err(sb, "%s: free inode %lu failed\n",
+                               __func__, sih->ino);
+
+       return ret;
+}
+
+void nova_evict_inode(struct inode *inode)
+{
+       struct super_block *sb = inode->i_sb;
+       struct nova_inode *pi = nova_get_inode(sb, inode);
+       struct nova_inode_info *si = NOVA_I(inode);
+       struct nova_inode_info_header *sih = &si->header;
+       timing_t evict_time;
+       int destroy = 0;
+       int ret;
+
+       NOVA_START_TIMING(evict_inode_t, evict_time);
+       if (!sih) {
+               nova_err(sb, "%s: ino %lu sih is NULL!\n",
+                               __func__, inode->i_ino);
+               NOVA_ASSERT(0);
+               goto out;
+       }
+
+       // pi can be NULL if the file has already been deleted, but a handle
+       // remains.
+       if (pi && pi->nova_ino != inode->i_ino) {
+               nova_err(sb, "%s: inode %lu ino does not match: %llu\n",
+                               __func__, inode->i_ino, pi->nova_ino);
+               nova_dbg("inode size %llu, pi addr 0x%lx, pi head 0x%llx, tail 
0x%llx, mode %u\n",
+                               inode->i_size, sih->pi_addr, sih->log_head,
+                               sih->log_tail, pi->i_mode);
+               nova_dbg("sih: ino %lu, inode size %lu, mode %u, inode mode 
%u\n",
+                               sih->ino, sih->i_size,
+                               sih->i_mode, inode->i_mode);
+               nova_print_inode_log(sb, inode);
+       }
+
+       /* Check if this inode exists in at least one snapshot. */
+       if (pi && pi->valid == 0) {
+               ret = nova_append_inode_to_snapshot(sb, pi);
+               if (ret == 0)
+                       goto out;
+       }
+
+       nova_dbg_verbose("%s: %lu\n", __func__, inode->i_ino);
+       if (!inode->i_nlink && !is_bad_inode(inode)) {
+               if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
+                       goto out;
+
+               if (pi) {
+                       ret = nova_free_inode_resource(sb, pi, sih);
+                       if (ret)
+                               goto out;
+               }
+
+               destroy = 1;
+               pi = NULL; /* we no longer own the nova_inode */
+
+               inode->i_mtime = inode->i_ctime = current_time(inode);
+               inode->i_size = 0;
+       }
+out:
+       if (destroy == 0) {
+               nova_dbgv("%s: destroying %lu\n", __func__, inode->i_ino);
+               nova_free_dram_resource(sb, sih);
+       }
+       /* TODO: Since we don't use page-cache, do we really need the following
+        * call?
+        */
+       truncate_inode_pages(&inode->i_data, 0);
+
+       clear_inode(inode);
+       NOVA_END_TIMING(evict_inode_t, evict_time);
+}
+
+/* First rebuild the inode tree, then free the blocks */
+int nova_delete_dead_inode(struct super_block *sb, u64 ino)
+{
+       struct nova_inode_info si;
+       struct nova_inode_info_header *sih;
+       struct nova_inode *pi;
+       u64 pi_addr = 0;
+       int err;
+
+       if (ino < NOVA_NORMAL_INODE_START) {
+               nova_dbg("%s: invalid inode %llu\n", __func__, ino);
+               return -EINVAL;
+       }
+
+       err = nova_get_inode_address(sb, ino, 0, &pi_addr, 0, 0);
+       if (err) {
+               nova_dbg("%s: get inode %llu address failed %d\n",
+                                       __func__, ino, err);
+               return -EINVAL;
+       }
+
+       if (pi_addr == 0)
+               return -EACCES;
+
+       memset(&si, 0, sizeof(struct nova_inode_info));
+       err = nova_rebuild_inode(sb, &si, ino, pi_addr, 0);
+       if (err)
+               return err;
+
+       pi = (struct nova_inode *)nova_get_block(sb, pi_addr);
+       sih = &si.header;
+
+       nova_dbgv("Delete dead inode %lu, log head 0x%llx, tail 0x%llx\n",
+                       sih->ino, sih->log_head, sih->log_tail);
+
+       return nova_free_inode_resource(sb, pi, sih);
+}
+
+/* Returns 0 on failure */
+u64 nova_new_nova_inode(struct super_block *sb, u64 *pi_addr)
+{
+       struct nova_sb_info *sbi = NOVA_SB(sb);
+       struct inode_map *inode_map;
+       unsigned long free_ino = 0;
+       int map_id;
+       u64 ino = 0;
+       int ret;
+       timing_t new_inode_time;
+
+       NOVA_START_TIMING(new_nova_inode_t, new_inode_time);
+       map_id = sbi->map_id;
+       sbi->map_id = (sbi->map_id + 1) % sbi->cpus;
+
+       inode_map = &sbi->inode_maps[map_id];
+
+       mutex_lock(&inode_map->inode_table_mutex);
+       ret = nova_alloc_unused_inode(sb, map_id, &free_ino);
+       if (ret) {
+               nova_dbg("%s: alloc inode number failed %d\n", __func__, ret);
+               mutex_unlock(&inode_map->inode_table_mutex);
+               return 0;
+       }
+
+       ret = nova_get_inode_address(sb, free_ino, 0, pi_addr, 1, 1);
+       if (ret) {
+               nova_dbg("%s: get inode address failed %d\n", __func__, ret);
+               mutex_unlock(&inode_map->inode_table_mutex);
+               return 0;
+       }
+
+       mutex_unlock(&inode_map->inode_table_mutex);
+
+       ino = free_ino;
+
+       NOVA_END_TIMING(new_nova_inode_t, new_inode_time);
+       return ino;
+}
+
+struct inode *nova_new_vfs_inode(enum nova_new_inode_type type,
+       struct inode *dir, u64 pi_addr, u64 ino, umode_t mode,
+       size_t size, dev_t rdev, const struct qstr *qstr, u64 epoch_id)
+{
+       struct super_block *sb;
+       struct nova_sb_info *sbi;
+       struct inode *inode;
+       struct nova_inode *diri = NULL;
+       struct nova_inode_info *si;
+       struct nova_inode_info_header *sih = NULL;
+       struct nova_inode *pi;
+       struct nova_inode *alter_pi;
+       int errval;
+       u64 alter_pi_addr = 0;
+       timing_t new_inode_time;
+
+       NOVA_START_TIMING(new_vfs_inode_t, new_inode_time);
+       sb = dir->i_sb;
+       sbi = (struct nova_sb_info *)sb->s_fs_info;
+       inode = new_inode(sb);
+       if (!inode) {
+               errval = -ENOMEM;
+               goto fail2;
+       }
+
+       inode_init_owner(inode, dir, mode);
+       inode->i_blocks = inode->i_size = 0;
+       inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
+
+       inode->i_generation = atomic_add_return(1, &sbi->next_generation);
+       inode->i_size = size;
+
+       diri = nova_get_inode(sb, dir);
+       if (!diri) {
+               errval = -EACCES;
+               goto fail1;
+       }
+
+       if (metadata_csum) {
+               /* Get alternate inode address */
+               errval = nova_get_alter_inode_address(sb, ino, &alter_pi_addr);
+               if (errval)
+                       goto fail1;
+       }
+
+       pi = (struct nova_inode *)nova_get_block(sb, pi_addr);
+       nova_dbg_verbose("%s: allocating inode %llu @ 0x%llx\n",
+                                       __func__, ino, pi_addr);
+
+       /* chosen inode is in ino */
+       inode->i_ino = ino;
+
+       switch (type) {
+       case TYPE_CREATE:
+               inode->i_op = &nova_file_inode_operations;
+               inode->i_mapping->a_ops = &nova_aops_dax;
+               if (inplace_data_updates && wprotect == 0)
+                       inode->i_fop = &nova_dax_file_operations;
+               else
+                       inode->i_fop = &nova_wrap_file_operations;
+               break;
+       case TYPE_MKNOD:
+               init_special_inode(inode, mode, rdev);
+               inode->i_op = &nova_special_inode_operations;
+               break;
+       case TYPE_SYMLINK:
+               inode->i_op = &nova_symlink_inode_operations;
+               inode->i_mapping->a_ops = &nova_aops_dax;
+               break;
+       case TYPE_MKDIR:
+               inode->i_op = &nova_dir_inode_operations;
+               inode->i_fop = &nova_dir_operations;
+               inode->i_mapping->a_ops = &nova_aops_dax;
+               set_nlink(inode, 2);
+               break;
+       default:
+               nova_dbg("Unknown new inode type %d\n", type);
+               break;
+       }
+
+       /*
+        * Pi is part of the dir log so no transaction is needed,
+        * but we need to flush to NVMM.
+        */
+       nova_memunlock_inode(sb, pi);
+       pi->i_blk_type = NOVA_DEFAULT_BLOCK_TYPE;
+       pi->i_flags = nova_mask_flags(mode, diri->i_flags);
+       pi->nova_ino = ino;
+       pi->i_create_time = current_time(inode).tv_sec;
+       pi->create_epoch_id = epoch_id;
+       nova_init_inode(inode, pi);
+
+       if (metadata_csum) {
+               alter_pi = (struct nova_inode *)nova_get_block(sb,
+                                                               alter_pi_addr);
+               memcpy_to_pmem_nocache(alter_pi, pi, sizeof(struct nova_inode));
+       }
+
+       nova_memlock_inode(sb, pi);
+
+       si = NOVA_I(inode);
+       sih = &si->header;
+       nova_init_header(sb, sih, inode->i_mode);
+       sih->pi_addr = pi_addr;
+       sih->alter_pi_addr = alter_pi_addr;
+       sih->ino = ino;
+       sih->i_blk_type = NOVA_DEFAULT_BLOCK_TYPE;
+
+       nova_set_inode_flags(inode, pi, le32_to_cpu(pi->i_flags));
+
+       if (insert_inode_locked(inode) < 0) {
+               nova_err(sb, "nova_new_inode failed ino %lx\n", inode->i_ino);
+               errval = -EINVAL;
+               goto fail1;
+       }
+
+       nova_flush_buffer(pi, NOVA_INODE_SIZE, 0);
+       NOVA_END_TIMING(new_vfs_inode_t, new_inode_time);
+       return inode;
+fail1:
+       make_bad_inode(inode);
+       iput(inode);
+fail2:
+       NOVA_END_TIMING(new_vfs_inode_t, new_inode_time);
+       return ERR_PTR(errval);
+}
+
+int nova_write_inode(struct inode *inode, struct writeback_control *wbc)
+{
+       /* write_inode should never be called because we always keep our inodes
+        * clean. So let us know if write_inode ever gets called.
+        */
+//     BUG();
+       return 0;
+}
+
+/*
+ * dirty_inode() is called from mark_inode_dirty_sync()
+ * usually dirty_inode should not be called because NOVA always keeps its 
inodes
+ * clean. Only exception is touch_atime which calls dirty_inode to update the
+ * i_atime field.
+ */
+void nova_dirty_inode(struct inode *inode, int flags)
+{
+       struct super_block *sb = inode->i_sb;
+       struct nova_sb_info *sbi = NOVA_SB(sb);
+       struct nova_inode_info *si = NOVA_I(inode);
+       struct nova_inode_info_header *sih = &si->header;
+       struct nova_inode *pi, inode_copy;
+
+       if (sbi->mount_snapshot)
+               return;
+
+       pi = nova_get_block(sb, sih->pi_addr);
+
+       /* check the inode before updating to make sure all fields are good */
+       if (nova_check_inode_integrity(sb, sih->ino, sih->pi_addr,
+                                       sih->alter_pi_addr, &inode_copy, 0) < 0)
+               return;
+
+       /* only i_atime should have changed if at all.
+        * we can do in-place atomic update
+        */
+       nova_memunlock_inode(sb, pi);
+       pi->i_atime = cpu_to_le32(inode->i_atime.tv_sec);
+       nova_update_inode_checksum(pi);
+       nova_update_alter_inode(sb, inode, pi);
+       nova_memlock_inode(sb, pi);
+       /* Relax atime persistency */
+       nova_flush_buffer(&pi->i_atime, sizeof(pi->i_atime), 0);
+}
+
+static void nova_setsize(struct inode *inode, loff_t oldsize, loff_t newsize,
+       u64 epoch_id)
+{
+       struct super_block *sb = inode->i_sb;
+       struct nova_inode_info *si = NOVA_I(inode);
+       struct nova_inode_info_header *sih = &si->header;
+       timing_t setsize_time;
+
+       /* We only support truncate regular file */
+       if (!(S_ISREG(inode->i_mode))) {
+               nova_err(inode->i_sb, "%s:wrong file mode %x\n", inode->i_mode);
+               return;
+       }
+
+       NOVA_START_TIMING(setsize_t, setsize_time);
+
+       inode_dio_wait(inode);
+
+       nova_dbgv("%s: inode %lu, old size %llu, new size %llu\n",
+               __func__, inode->i_ino, oldsize, newsize);
+
+       if (newsize != oldsize) {
+               nova_clear_last_page_tail(sb, inode, newsize);
+               i_size_write(inode, newsize);
+               sih->i_size = newsize;
+       }
+
+       /* FIXME: we should make sure that there is nobody reading the inode
+        * before truncating it. Also we need to munmap the truncated range
+        * from application address space, if mmapped.
+        */
+       /* synchronize_rcu(); */
+
+       /* FIXME: Do we need to clear truncated DAX pages? */
+//     dax_truncate_page(inode, newsize, nova_dax_get_block);
+
+       truncate_pagecache(inode, newsize);
+       nova_truncate_file_blocks(inode, newsize, oldsize, epoch_id);
+       NOVA_END_TIMING(setsize_t, setsize_time);
+}
+
+int nova_getattr(const struct path *path, struct kstat *stat,
+                u32 request_mask, unsigned int flags)
+{
+       struct inode *inode;
+
+       inode = path->dentry->d_inode;
+       generic_fillattr(inode, stat);
+       /* stat->blocks should be the number of 512B blocks */
+       stat->blocks = (inode->i_blocks << inode->i_sb->s_blocksize_bits) >> 9;
+       return 0;
+}
+
+int nova_notify_change(struct dentry *dentry, struct iattr *attr)
+{
+       struct inode *inode = dentry->d_inode;
+       struct nova_inode_info *si = NOVA_I(inode);
+       struct nova_inode_info_header *sih = &si->header;
+       struct super_block *sb = inode->i_sb;
+       struct nova_inode *pi = nova_get_inode(sb, inode);
+       int ret;
+       unsigned int ia_valid = attr->ia_valid, attr_mask;
+       loff_t oldsize = inode->i_size;
+       u64 epoch_id;
+       timing_t setattr_time;
+
+       NOVA_START_TIMING(setattr_t, setattr_time);
+       if (!pi) {
+               ret = -EACCES;
+               goto out;
+       }
+
+       ret = setattr_prepare(dentry, attr);
+       if (ret)
+               goto out;
+
+       /* Update inode with attr except for size */
+       setattr_copy(inode, attr);
+
+       epoch_id = nova_get_epoch_id(sb);
+
+       attr_mask = ATTR_MODE | ATTR_UID | ATTR_GID | ATTR_SIZE | ATTR_ATIME
+                       | ATTR_MTIME | ATTR_CTIME;
+
+       ia_valid = ia_valid & attr_mask;
+
+       if (ia_valid == 0)
+               goto out;
+
+       ret = nova_handle_setattr_operation(sb, inode, pi, ia_valid,
+                                       attr, epoch_id);
+       if (ret)
+               goto out;
+
+       /* Only after log entry is committed, we can truncate size */
+       if ((ia_valid & ATTR_SIZE) && (attr->ia_size != oldsize ||
+                       pi->i_flags & cpu_to_le32(NOVA_EOFBLOCKS_FL))) {
+//             nova_set_blocksize_hint(sb, inode, pi, attr->ia_size);
+
+               /* now we can freely truncate the inode */
+               nova_setsize(inode, oldsize, attr->ia_size, epoch_id);
+       }
+
+       sih->trans_id++;
+out:
+       NOVA_END_TIMING(setattr_t, setattr_time);
+       return ret;
+}
+
+void nova_set_inode_flags(struct inode *inode, struct nova_inode *pi,
+       unsigned int flags)
+{
+       inode->i_flags &=
+               ~(S_SYNC | S_APPEND | S_IMMUTABLE | S_NOATIME | S_DIRSYNC);
+       if (flags & FS_SYNC_FL)
+               inode->i_flags |= S_SYNC;
+       if (flags & FS_APPEND_FL)
+               inode->i_flags |= S_APPEND;
+       if (flags & FS_IMMUTABLE_FL)
+               inode->i_flags |= S_IMMUTABLE;
+       if (flags & FS_NOATIME_FL)
+               inode->i_flags |= S_NOATIME;
+       if (flags & FS_DIRSYNC_FL)
+               inode->i_flags |= S_DIRSYNC;
+       if (!pi->i_xattr)
+               inode_has_no_xattr(inode);
+       inode->i_flags |= S_DAX;
+}
+
+static int nova_legacy_get_blocks(struct inode *inode, sector_t iblock,
+       struct buffer_head *bh, int create)
+{
+       unsigned long max_blocks = bh->b_size >> inode->i_blkbits;
+       bool new = false, boundary = false;
+       u32 bno;
+       int ret;
+
+       ret = nova_dax_get_blocks(inode, iblock, max_blocks, &bno, &new,
+                               &boundary, create, false);
+       if (ret <= 0)
+               return ret;
+
+       map_bh(bh, inode->i_sb, bno);
+       bh->b_size = ret << inode->i_blkbits;
+       return 0;
+}
+
+static ssize_t nova_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
+{
+       struct file *filp = iocb->ki_filp;
+       struct address_space *mapping = filp->f_mapping;
+       struct inode *inode = mapping->host;
+       ssize_t ret;
+       timing_t dio_time;
+
+       if (WARN_ON_ONCE(IS_DAX(inode)))
+               return -EIO;
+
+       NOVA_START_TIMING(direct_IO_t, dio_time);
+
+       ret = blockdev_direct_IO(iocb, inode, iter, nova_legacy_get_blocks);
+
+       NOVA_END_TIMING(direct_IO_t, dio_time);
+       return ret;
+}
+
+/*
+ * find the file offset for SEEK_DATA/SEEK_HOLE
+ */
+unsigned long nova_find_region(struct inode *inode, loff_t *offset, int hole)
+{
+       struct nova_inode_info *si = NOVA_I(inode);
+       struct nova_inode_info_header *sih = &si->header;
+       unsigned int data_bits = blk_type_to_shift[sih->i_blk_type];
+       unsigned long first_blocknr, last_blocknr;
+       unsigned long blocks = 0, offset_in_block;
+       int data_found = 0, hole_found = 0;
+
+       if (*offset >= inode->i_size)
+               return -ENXIO;
+
+       if (!inode->i_blocks || !sih->i_size) {
+               if (hole)
+                       return inode->i_size;
+               else
+                       return -ENXIO;
+       }
+
+       offset_in_block = *offset & ((1UL << data_bits) - 1);
+
+       first_blocknr = *offset >> data_bits;
+       last_blocknr = inode->i_size >> data_bits;
+
+       nova_dbg_verbose("find_region offset %llx, first_blocknr %lx, 
last_blocknr %lx hole %d\n",
+                 *offset, first_blocknr, last_blocknr, hole);
+
+       blocks = nova_lookup_hole_in_range(inode->i_sb, sih,
+               first_blocknr, last_blocknr, &data_found, &hole_found, hole);
+
+       /* Searching data but only hole found till the end */
+       if (!hole && !data_found && hole_found)
+               return -ENXIO;
+
+       if (data_found && !hole_found) {
+               /* Searching data but we are already into them */
+               if (hole)
+                       /* Searching hole but only data found, go to the end */
+                       *offset = inode->i_size;
+               return 0;
+       }
+
+       /* Searching for hole, hole found and starting inside an hole */
+       if (hole && hole_found && !blocks) {
+               /* we found data after it */
+               if (!data_found)
+                       /* last hole */
+                       *offset = inode->i_size;
+               return 0;
+       }
+
+       if (offset_in_block) {
+               blocks--;
+               *offset += (blocks << data_bits) +
+                          ((1 << data_bits) - offset_in_block);
+       } else {
+               *offset += blocks << data_bits;
+       }
+
+       return 0;
+}
+
+static int nova_writepages(struct address_space *mapping,
+       struct writeback_control *wbc)
+{
+       int ret;
+       timing_t wp_time;
+
+       NOVA_START_TIMING(write_pages_t, wp_time);
+       ret = dax_writeback_mapping_range(mapping,
+                       mapping->host->i_sb->s_bdev, wbc);
+       NOVA_END_TIMING(write_pages_t, wp_time);
+       return ret;
+}
+
+const struct address_space_operations nova_aops_dax = {
+       .writepages             = nova_writepages,
+       .direct_IO              = nova_direct_IO,
+       /*.dax_mem_protect      = nova_dax_mem_protect,*/
+};
diff --git a/fs/nova/inode.h b/fs/nova/inode.h
new file mode 100644
index 000000000000..5ad69335799c
--- /dev/null
+++ b/fs/nova/inode.h
@@ -0,0 +1,389 @@
+#ifndef __INODE_H
+#define __INODE_H
+
+struct nova_inode_info_header;
+struct nova_inode;
+
+#include "super.h"
+#include "log.h"
+
+enum nova_new_inode_type {
+       TYPE_CREATE = 0,
+       TYPE_MKNOD,
+       TYPE_SYMLINK,
+       TYPE_MKDIR
+};
+
+
+/*
+ * Structure of an inode in PMEM
+ * Keep the inode size to within 120 bytes: We use the last eight bytes
+ * as inode table tail pointer.
+ */
+struct nova_inode {
+
+       /* first 40 bytes */
+       u8      i_rsvd;          /* reserved. used to be checksum */
+       u8      valid;           /* Is this inode valid? */
+       u8      deleted;         /* Is this inode deleted? */
+       u8      i_blk_type;      /* data block size this inode uses */
+       __le32  i_flags;         /* Inode flags */
+       __le64  i_size;          /* Size of data in bytes */
+       __le32  i_ctime;         /* Inode modification time */
+       __le32  i_mtime;         /* Inode b-tree Modification time */
+       __le32  i_atime;         /* Access time */
+       __le16  i_mode;          /* File mode */
+       __le16  i_links_count;   /* Links count */
+
+       __le64  i_xattr;         /* Extended attribute block */
+
+       /* second 40 bytes */
+       __le32  i_uid;           /* Owner Uid */
+       __le32  i_gid;           /* Group Id */
+       __le32  i_generation;    /* File version (for NFS) */
+       __le32  i_create_time;   /* Create time */
+       __le64  nova_ino;        /* nova inode number */
+
+       __le64  log_head;        /* Log head pointer */
+       __le64  log_tail;        /* Log tail pointer */
+
+       /* last 40 bytes */
+       __le64  alter_log_head;  /* Alternate log head pointer */
+       __le64  alter_log_tail;  /* Alternate log tail pointer */
+
+       __le64  create_epoch_id; /* Transaction ID when create */
+       __le64  delete_epoch_id; /* Transaction ID when deleted */
+
+       struct {
+               __le32 rdev;     /* major/minor # */
+       } dev;                   /* device inode */
+
+       __le32  csum;            /* CRC32 checksum */
+
+       /* Leave 8 bytes for inode table tail pointer */
+} __attribute((__packed__));
+
+/*
+ * Inode table.  It's a linked list of pages.
+ */
+struct inode_table {
+       __le64 log_head;
+};
+
+/*
+ * NOVA-specific inode state kept in DRAM
+ */
+struct nova_inode_info_header {
+       /* For files, tree holds a map from file offsets to
+        * write log entries.
+        *
+        * For directories, tree holds a map from a hash of the file name to
+        * dentry log entry.
+        */
+       struct radix_tree_root tree;
+       struct rb_root vma_tree;        /* Write vmas */
+       struct list_head list;          /* SB list of mmap sih */
+       int num_vmas;
+       unsigned short i_mode;          /* Dir or file? */
+       unsigned long log_pages;        /* Num of log pages */
+       unsigned long i_size;
+       unsigned long i_blocks;
+       unsigned long ino;
+       unsigned long pi_addr;
+       unsigned long alter_pi_addr;
+       unsigned long valid_entries;    /* For thorough GC */
+       unsigned long num_entries;      /* For thorough GC */
+       u64 last_setattr;               /* Last setattr entry */
+       u64 last_link_change;           /* Last link change entry */
+       u64 last_dentry;                /* Last updated dentry */
+       u64 trans_id;                   /* Transaction ID */
+       u64 log_head;                   /* Log head pointer */
+       u64 log_tail;                   /* Log tail pointer */
+       u64 alter_log_head;             /* Alternate log head pointer */
+       u64 alter_log_tail;             /* Alternate log tail pointer */
+       u8  i_blk_type;
+};
+
+/* For rebuild purpose, temporarily store pi infomation */
+struct nova_inode_rebuild {
+       u64     i_size;
+       u32     i_flags;        /* Inode flags */
+       u32     i_ctime;        /* Inode modification time */
+       u32     i_mtime;        /* Inode b-tree Modification time */
+       u32     i_atime;        /* Access time */
+       u32     i_uid;          /* Owner Uid */
+       u32     i_gid;          /* Group Id */
+       u32     i_generation;   /* File version (for NFS) */
+       u16     i_links_count;  /* Links count */
+       u16     i_mode;         /* File mode */
+       u64     trans_id;
+};
+
+/*
+ * DRAM state for inodes
+ */
+struct nova_inode_info {
+       struct nova_inode_info_header header;
+       struct inode vfs_inode;
+};
+
+
+static inline struct nova_inode_info *NOVA_I(struct inode *inode)
+{
+       return container_of(inode, struct nova_inode_info, vfs_inode);
+}
+
+static inline struct nova_inode *nova_get_alter_inode(struct super_block *sb,
+       struct inode *inode)
+{
+       struct nova_inode_info *si = NOVA_I(inode);
+       struct nova_inode_info_header *sih = &si->header;
+       struct nova_inode fake_pi;
+       void *addr;
+       int rc;
+
+       if (metadata_csum == 0)
+               return NULL;
+
+       addr = nova_get_block(sb, sih->alter_pi_addr);
+       rc = memcpy_mcsafe(&fake_pi, addr, sizeof(struct nova_inode));
+       if (rc)
+               return NULL;
+
+       return (struct nova_inode *)addr;
+}
+
+static inline int nova_update_alter_inode(struct super_block *sb,
+       struct inode *inode, struct nova_inode *pi)
+{
+       struct nova_inode *alter_pi;
+
+       if (metadata_csum == 0)
+               return 0;
+
+       alter_pi = nova_get_alter_inode(sb, inode);
+       if (!alter_pi)
+               return -EINVAL;
+
+       memcpy_to_pmem_nocache(alter_pi, pi, sizeof(struct nova_inode));
+       return 0;
+}
+
+
+static inline int nova_update_inode_checksum(struct nova_inode *pi)
+{
+       u32 crc = 0;
+
+       if (metadata_csum == 0)
+               return 0;
+
+       crc = nova_crc32c(~0, (__u8 *)pi,
+                       (sizeof(struct nova_inode) - sizeof(__le32)));
+
+       pi->csum = crc;
+       nova_flush_buffer(pi, sizeof(struct nova_inode), 1);
+       return 0;
+}
+
+static inline int nova_check_inode_checksum(struct nova_inode *pi)
+{
+       u32 crc = 0;
+
+       if (metadata_csum == 0)
+               return 0;
+
+       crc = nova_crc32c(~0, (__u8 *)pi,
+                       (sizeof(struct nova_inode) - sizeof(__le32)));
+
+       if (pi->csum == cpu_to_le32(crc))
+               return 0;
+       else
+               return 1;
+}
+
+
+
+static inline void nova_update_tail(struct nova_inode *pi, u64 new_tail)
+{
+       timing_t update_time;
+
+       NOVA_START_TIMING(update_tail_t, update_time);
+
+       PERSISTENT_BARRIER();
+       pi->log_tail = new_tail;
+       nova_flush_buffer(&pi->log_tail, CACHELINE_SIZE, 1);
+
+       NOVA_END_TIMING(update_tail_t, update_time);
+}
+
+static inline void nova_update_alter_tail(struct nova_inode *pi, u64 new_tail)
+{
+       timing_t update_time;
+
+       if (metadata_csum == 0)
+               return;
+
+       NOVA_START_TIMING(update_tail_t, update_time);
+
+       PERSISTENT_BARRIER();
+       pi->alter_log_tail = new_tail;
+       nova_flush_buffer(&pi->alter_log_tail, CACHELINE_SIZE, 1);
+
+       NOVA_END_TIMING(update_tail_t, update_time);
+}
+
+
+
+/* Update inode tails and checksums */
+static inline void nova_update_inode(struct super_block *sb,
+       struct inode *inode, struct nova_inode *pi,
+       struct nova_inode_update *update, int update_alter)
+{
+       struct nova_inode_info *si = NOVA_I(inode);
+       struct nova_inode_info_header *sih = &si->header;
+
+       sih->log_tail = update->tail;
+       sih->alter_log_tail = update->alter_tail;
+       nova_update_tail(pi, update->tail);
+       if (metadata_csum)
+               nova_update_alter_tail(pi, update->alter_tail);
+
+       nova_update_inode_checksum(pi);
+       if (inode && update_alter)
+               nova_update_alter_inode(sb, inode, pi);
+}
+
+
+static inline
+struct inode_table *nova_get_inode_table(struct super_block *sb,
+       int version, int cpu)
+{
+       struct nova_sb_info *sbi = NOVA_SB(sb);
+       int table_start;
+
+       if (cpu >= sbi->cpus)
+               return NULL;
+
+       if ((version & 0x1) == 0)
+               table_start = INODE_TABLE0_START;
+       else
+               table_start = INODE_TABLE1_START;
+
+       return (struct inode_table *)((char *)nova_get_block(sb,
+               NOVA_DEF_BLOCK_SIZE_4K * table_start) +
+               cpu * CACHELINE_SIZE);
+}
+
+static inline unsigned int
+nova_inode_blk_shift(struct nova_inode_info_header *sih)
+{
+       return blk_type_to_shift[sih->i_blk_type];
+}
+
+static inline uint32_t nova_inode_blk_size(struct nova_inode_info_header *sih)
+{
+       return blk_type_to_size[sih->i_blk_type];
+}
+
+static inline u64 nova_get_reserved_inode_addr(struct super_block *sb,
+       u64 inode_number)
+{
+       return (NOVA_DEF_BLOCK_SIZE_4K * RESERVE_INODE_START) +
+                       inode_number * NOVA_INODE_SIZE;
+}
+
+static inline u64 nova_get_alter_reserved_inode_addr(struct super_block *sb,
+       u64 inode_number)
+{
+       struct nova_sb_info *sbi = NOVA_SB(sb);
+
+       return nova_get_addr_off(sbi, sbi->replica_reserved_inodes_addr) +
+                       inode_number * NOVA_INODE_SIZE;
+}
+
+static inline struct nova_inode *nova_get_reserved_inode(struct super_block 
*sb,
+       u64 inode_number)
+{
+       struct nova_sb_info *sbi = NOVA_SB(sb);
+       u64 addr;
+
+       addr = nova_get_reserved_inode_addr(sb, inode_number);
+
+       return (struct nova_inode *)(sbi->virt_addr + addr);
+}
+
+static inline struct nova_inode *
+nova_get_alter_reserved_inode(struct super_block *sb,
+       u64 inode_number)
+{
+       struct nova_sb_info *sbi = NOVA_SB(sb);
+       u64 addr;
+
+       addr = nova_get_alter_reserved_inode_addr(sb, inode_number);
+
+       return (struct nova_inode *)(sbi->virt_addr + addr);
+}
+
+/* If this is part of a read-modify-write of the inode metadata,
+ * nova_memunlock_inode() before calling!
+ */
+static inline struct nova_inode *nova_get_inode_by_ino(struct super_block *sb,
+                                                 u64 ino)
+{
+       if (ino == 0 || ino >= NOVA_NORMAL_INODE_START)
+               return NULL;
+
+       return nova_get_reserved_inode(sb, ino);
+}
+
+static inline struct nova_inode *nova_get_inode(struct super_block *sb,
+       struct inode *inode)
+{
+       struct nova_inode_info *si = NOVA_I(inode);
+       struct nova_inode_info_header *sih = &si->header;
+       struct nova_inode fake_pi;
+       void *addr;
+       int rc;
+
+       addr = nova_get_block(sb, sih->pi_addr);
+       rc = memcpy_mcsafe(&fake_pi, addr, sizeof(struct nova_inode));
+       if (rc)
+               return NULL;
+
+       return (struct nova_inode *)addr;
+}
+
+
+
+extern const struct address_space_operations nova_aops_dax;
+int nova_init_inode_inuse_list(struct super_block *sb);
+extern int nova_init_inode_table(struct super_block *sb);
+int nova_get_alter_inode_address(struct super_block *sb, u64 ino,
+       u64 *alter_pi_addr);
+unsigned long nova_get_last_blocknr(struct super_block *sb,
+       struct nova_inode_info_header *sih);
+int nova_get_inode_address(struct super_block *sb, u64 ino, int version,
+       u64 *pi_addr, int extendable, int extend_alternate);
+int nova_set_blocksize_hint(struct super_block *sb, struct inode *inode,
+       struct nova_inode *pi, loff_t new_size);
+extern struct inode *nova_iget(struct super_block *sb, unsigned long ino);
+extern void nova_evict_inode(struct inode *inode);
+extern int nova_write_inode(struct inode *inode, struct writeback_control 
*wbc);
+extern void nova_dirty_inode(struct inode *inode, int flags);
+extern int nova_notify_change(struct dentry *dentry, struct iattr *attr);
+extern int nova_getattr(const struct path *path, struct kstat *stat,
+                       u32 request_mask, unsigned int flags);
+extern void nova_set_inode_flags(struct inode *inode, struct nova_inode *pi,
+       unsigned int flags);
+extern unsigned long nova_find_region(struct inode *inode, loff_t *offset,
+               int hole);
+int nova_delete_file_tree(struct super_block *sb,
+       struct nova_inode_info_header *sih, unsigned long start_blocknr,
+       unsigned long last_blocknr, bool delete_nvmm,
+       bool delete_dead, u64 trasn_id);
+u64 nova_new_nova_inode(struct super_block *sb, u64 *pi_addr);
+extern struct inode *nova_new_vfs_inode(enum nova_new_inode_type,
+       struct inode *dir, u64 pi_addr, u64 ino, umode_t mode,
+       size_t size, dev_t rdev, const struct qstr *qstr, u64 epoch_id);
+
+#endif

Reply via email to