[PATCH 09/10] libbtrfsutil: bump version to 1.1.0

2018-11-13 Thread Omar Sandoval
From: Omar Sandoval 

With the previous few fixes and features, we should bump the minor
version.

Signed-off-by: Omar Sandoval 
---
 libbtrfsutil/btrfsutil.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libbtrfsutil/btrfsutil.h b/libbtrfsutil/btrfsutil.h
index d88c39e5..ad4f043e 100644
--- a/libbtrfsutil/btrfsutil.h
+++ b/libbtrfsutil/btrfsutil.h
@@ -26,7 +26,7 @@
 #include 
 
 #define BTRFS_UTIL_VERSION_MAJOR 1
-#define BTRFS_UTIL_VERSION_MINOR 0
+#define BTRFS_UTIL_VERSION_MINOR 1
 #define BTRFS_UTIL_VERSION_PATCH 0
 
 #ifdef __cplusplus
-- 
2.19.1



[PATCH 10/10] libbtrfsutil: document API in README

2018-11-13 Thread Omar Sandoval
From: Omar Sandoval 

btrfsutil.h and the Python docstrings are thorough, but I've gotten a
couple of requests for a high-level overview of the available interfaces
and example usages. Add them to README.md.

Signed-off-by: Omar Sandoval 
---
 libbtrfsutil/README.md | 422 -
 1 file changed, 421 insertions(+), 1 deletion(-)

diff --git a/libbtrfsutil/README.md b/libbtrfsutil/README.md
index 0c8eba44..30ae39b6 100644
--- a/libbtrfsutil/README.md
+++ b/libbtrfsutil/README.md
@@ -6,6 +6,425 @@ the LGPL. libbtrfsutil provides interfaces for a subset of 
the operations
 offered by the `btrfs` command line utility. It also includes official Python
 bindings (Python 3 only).
 
+API Overview
+
+
+This section provides an overview of the interfaces available in libbtrfsutil
+as well as example usages. Detailed documentation for the C API can be found in
+[`btrfsutil.h`](btrfsutil.h). Detailed documentation for the Python bindings is
+available with `pydoc3 btrfsutil` or in the interpreter:
+
+```
+>>> import btrfsutil
+>>> help(btrfsutil)
+```
+
+Many functions in the C API have a variant taking a path and a variant taking a
+file descriptor. The latter has the same name as the former with an `_fd`
+suffix. The Python bindings for these functions can take a path, a file object,
+or a file descriptor.
+
+Error handling is omitted from most of these examples for brevity. Please
+handle errors in production code.
+
+### Error Handling
+
+In the C API, all functions that can return an error return an `enum
+btrfs_util_error` and set `errno`. `BTRFS_UTIL_OK` (zero) is returned on
+success. `btrfs_util_strerror()` converts an error code to a string
+description suitable for human-friendly error reporting.
+
+```c
+enum btrfs_util_err err;
+
+err = btrfs_util_sync("/");
+if (err)
+   fprintf("stderr, %s: %m\n", btrfs_util_strerror(err));
+```
+
+In the Python bindings, functions may raise a `BtrfsUtilError`, which is a
+subclass of `OSError` with an added `btrfsutilerror` error code member. Error
+codes are available as `ERROR_*` constants.
+
+```python
+try:
+btrfsutil.sync('/')
+except btrfsutil.BtrfsUtilError as e:
+print(e, file=sys.stderr)
+```
+
+### Filesystem Operations
+
+There are several operations which act on the entire filesystem.
+
+ Sync
+
+Btrfs can commit all caches for a specific filesystem to disk.
+
+`btrfs_util_sync()` forces a sync on the filesystem containing the given file
+and waits for it to complete.
+
+`btrfs_wait_sync()` waits for a previously started transaction to complete. The
+transaction is specified by ID, which may be zero to indicate the current
+transaction.
+
+`btrfs_start_sync()` asynchronously starts a sync and returns a transaction ID
+which can then be passed to `btrfs_wait_sync()`.
+
+```c
+uint64_t transid;
+btrfs_util_sync("/");
+btrfs_util_start_sync("/", );
+btrfs_util_wait_sync("/", );
+btrfs_util_wait_sync("/", 0);
+```
+
+```python
+btrfsutil.sync('/')
+transid = btrfsutil.start_sync('/')
+btrfsutil.wait_sync('/', transid)
+btrfsutil.wait_sync('/')  # equivalent to wait_sync('/', 0)
+```
+
+All of these functions have `_fd` variants.
+
+The equivalent `btrfs-progs` command is `btrfs filesystem sync`.
+
+### Subvolume Operations
+
+Functions which take a file and a subvolume ID can be used in two ways. If zero
+is given as the subvolume ID, then the given file is used as the subvolume.
+Otherwise, the given file can be any file in the filesystem, and the subvolume
+with the given ID is used.
+
+ Subvolume Information
+
+`btrfs_util_is_subvolume()` returns whether a given file is a subvolume.
+
+`btrfs_util_subvolume_id()` returns the ID of the subvolume containing the
+given file.
+
+```c
+enum btrfs_util_error err;
+err = btrfs_util_is_subvolume("/subvol");
+if (!err)
+   printf("Subvolume\n");
+else if (err == BTRFS_UTIL_ERROR_NOT_BTRFS || err == 
BTRFS_UTIL_ERROR_NOT_SUBVOLUME)
+   printf("Not subvolume\n");
+uint64_t id;
+btrfs_util_subvolume_id("/subvol", );
+```
+
+```python
+if btrfsutil.is_subvolume('/subvol'):
+print('Subvolume')
+else:
+print('Not subvolume')
+id_ = btrfsutil.subvolume_id('/subvol')
+```
+
+`btrfs_util_subvolume_path()` returns the path of the subvolume with the given
+ID relative to the filesystem root. This requires `CAP_SYS_ADMIN`. The path
+must be freed with `free()`.
+
+```c
+char *path;
+btrfs_util_subvolume_path("/", 256, );
+free(path);
+btrfs_util_subvolume_path("/subvol", 0, );
+free(path);
+```
+
+```python
+path = btrfsutil.subvolume_path('/', 256)
+path = btrfsutil.subvolume_path('/subvol')  # equivalent to 
subvolume_path('/subvol', 0)
+```
+
+`btrfs_util_subvolume_info()` returns information (including ID, parent ID,
+UUID) about a subvolume. In the C API, this is returned as a `struct
+btrfs_util_subvolume_info`. The Python bindings use a `SubvolumeInfo` object.
+
+This requires `CAP_SYS_ADMIN` unless the given subvolume ID is zero and the

[PATCH 07/10] libbtrfsutil: relax the privileges of subvolume_info()

2018-11-13 Thread Omar Sandoval
From: Omar Sandoval 

Attempt to use the BTRFS_IOC_GET_SUBVOL_INFO ioctl (added in kernel
4.18) for subvolume_info() if not root. Also, rename
get_subvolume_info_root() -> get_subvolume_info_privileged() for
consistency with further changes.

This is based on a patch from Misono Tomohiro.

Signed-off-by: Omar Sandoval 
---
 libbtrfsutil/btrfsutil.h|  4 +-
 libbtrfsutil/errors.c   |  2 +
 libbtrfsutil/python/tests/test_subvolume.py | 42 
 libbtrfsutil/subvolume.c| 53 +++--
 4 files changed, 89 insertions(+), 12 deletions(-)

diff --git a/libbtrfsutil/btrfsutil.h b/libbtrfsutil/btrfsutil.h
index 6d655f49..c1925007 100644
--- a/libbtrfsutil/btrfsutil.h
+++ b/libbtrfsutil/btrfsutil.h
@@ -63,6 +63,7 @@ enum btrfs_util_error {
BTRFS_UTIL_ERROR_SYNC_FAILED,
BTRFS_UTIL_ERROR_START_SYNC_FAILED,
BTRFS_UTIL_ERROR_WAIT_SYNC_FAILED,
+   BTRFS_UTIL_ERROR_GET_SUBVOL_INFO_FAILED,
 };
 
 /**
@@ -266,7 +267,8 @@ struct btrfs_util_subvolume_info {
  * to check whether the subvolume exists; %BTRFS_UTIL_ERROR_SUBVOLUME_NOT_FOUND
  * will be returned if it does not.
  *
- * This requires appropriate privilege (CAP_SYS_ADMIN).
+ * This requires appropriate privilege (CAP_SYS_ADMIN) unless @id is zero and
+ * the kernel supports BTRFS_IOC_GET_SUBVOL_INFO (kernel >= 4.18).
  *
  * Return: %BTRFS_UTIL_OK on success, non-zero error code on failure.
  */
diff --git a/libbtrfsutil/errors.c b/libbtrfsutil/errors.c
index 634edc65..cf968b03 100644
--- a/libbtrfsutil/errors.c
+++ b/libbtrfsutil/errors.c
@@ -45,6 +45,8 @@ static const char * const error_messages[] = {
[BTRFS_UTIL_ERROR_SYNC_FAILED] = "Could not sync filesystem",
[BTRFS_UTIL_ERROR_START_SYNC_FAILED] = "Could not start filesystem 
sync",
[BTRFS_UTIL_ERROR_WAIT_SYNC_FAILED] = "Could not wait for filesystem 
sync",
+   [BTRFS_UTIL_ERROR_GET_SUBVOL_INFO_FAILED] =
+   "Could not get subvolume information with 
BTRFS_IOC_GET_SUBVOL_INFO",
 };
 
 PUBLIC const char *btrfs_util_strerror(enum btrfs_util_error err)
diff --git a/libbtrfsutil/python/tests/test_subvolume.py 
b/libbtrfsutil/python/tests/test_subvolume.py
index 4049b08e..55ebf34d 100644
--- a/libbtrfsutil/python/tests/test_subvolume.py
+++ b/libbtrfsutil/python/tests/test_subvolume.py
@@ -23,7 +23,12 @@ from pathlib import PurePath
 import traceback
 
 import btrfsutil
-from tests import BtrfsTestCase, HAVE_PATH_LIKE
+from tests import (
+BtrfsTestCase,
+drop_privs,
+HAVE_PATH_LIKE,
+skipUnlessHaveNobody,
+)
 
 
 class TestSubvolume(BtrfsTestCase):
@@ -87,7 +92,7 @@ class TestSubvolume(BtrfsTestCase):
 finally:
 os.chdir(pwd)
 
-def test_subvolume_info(self):
+def _test_subvolume_info(self, subvol, snapshot):
 for arg in self.path_or_fd(self.mountpoint):
 with self.subTest(type=type(arg)):
 info = btrfsutil.subvolume_info(arg)
@@ -100,7 +105,7 @@ class TestSubvolume(BtrfsTestCase):
 self.assertEqual(info.parent_uuid, bytes(16))
 self.assertEqual(info.received_uuid, bytes(16))
 self.assertNotEqual(info.generation, 0)
-self.assertEqual(info.ctransid, 0)
+self.assertGreaterEqual(info.ctransid, 0)
 self.assertEqual(info.otransid, 0)
 self.assertEqual(info.stransid, 0)
 self.assertEqual(info.rtransid, 0)
@@ -109,9 +114,6 @@ class TestSubvolume(BtrfsTestCase):
 self.assertEqual(info.stime, 0)
 self.assertEqual(info.rtime, 0)
 
-subvol = os.path.join(self.mountpoint, 'subvol')
-btrfsutil.create_subvolume(subvol)
-
 info = btrfsutil.subvolume_info(subvol)
 self.assertEqual(info.id, 256)
 self.assertEqual(info.parent_id, 5)
@@ -132,19 +134,43 @@ class TestSubvolume(BtrfsTestCase):
 self.assertEqual(info.rtime, 0)
 
 subvol_uuid = info.uuid
-snapshot = os.path.join(self.mountpoint, 'snapshot')
-btrfsutil.create_snapshot(subvol, snapshot)
 
 info = btrfsutil.subvolume_info(snapshot)
 self.assertEqual(info.parent_uuid, subvol_uuid)
 
 # TODO: test received_uuid, stransid, rtransid, stime, and rtime
 
+def test_subvolume_info(self):
+subvol = os.path.join(self.mountpoint, 'subvol')
+btrfsutil.create_subvolume(subvol)
+snapshot = os.path.join(self.mountpoint, 'snapshot')
+btrfsutil.create_snapshot(subvol, snapshot)
+
+self._test_subvolume_info(subvol, snapshot)
+
 for arg in self.path_or_fd(self.mountpoint):
 with self.subTest(type=type(arg)):
 with self.assertRaises(btrfsutil.BtrfsUtilError) as e:
 # BTRFS_EXTENT_TREE_OBJECTID
 btrfsutil.subvolume_info(arg, 2)
+self.assertEqual(e.exception.btrfsutilerror,
+ 

[PATCH 08/10] libbtrfsutil: relax the privileges of subvolume iterator

2018-11-13 Thread Omar Sandoval
From: Omar Sandoval 

We can use the new BTRFS_IOC_GET_SUBVOL_ROOTREF and
BTRFS_IOC_INO_LOOKUP_USER ioctls to allow non-root users to list
subvolumes.

This is based on a patch from Misono Tomohiro but takes a different
approach (mainly, this approach is more similar to the existing tree
search approach).

Signed-off-by: Omar Sandoval 
---
 libbtrfsutil/btrfsutil.h|  15 +-
 libbtrfsutil/errors.c   |   6 +
 libbtrfsutil/python/tests/test_subvolume.py | 180 +++---
 libbtrfsutil/subvolume.c| 354 +---
 4 files changed, 450 insertions(+), 105 deletions(-)

diff --git a/libbtrfsutil/btrfsutil.h b/libbtrfsutil/btrfsutil.h
index c1925007..d88c39e5 100644
--- a/libbtrfsutil/btrfsutil.h
+++ b/libbtrfsutil/btrfsutil.h
@@ -64,6 +64,9 @@ enum btrfs_util_error {
BTRFS_UTIL_ERROR_START_SYNC_FAILED,
BTRFS_UTIL_ERROR_WAIT_SYNC_FAILED,
BTRFS_UTIL_ERROR_GET_SUBVOL_INFO_FAILED,
+   BTRFS_UTIL_ERROR_GET_SUBVOL_ROOTREF_FAILED,
+   BTRFS_UTIL_ERROR_INO_LOOKUP_USER_FAILED,
+   BTRFS_UTIL_ERROR_FS_INFO_FAILED,
 };
 
 /**
@@ -507,6 +510,12 @@ struct btrfs_util_subvolume_iterator;
  * @flags: Bitmask of BTRFS_UTIL_SUBVOLUME_ITERATOR_* flags.
  * @ret: Returned iterator.
  *
+ * Subvolume iterators require appropriate privilege (CAP_SYS_ADMIN) unless 
@top
+ * is zero and the kernel supports BTRFS_IOC_GET_SUBVOL_ROOTREF and
+ * BTRFS_IOC_INO_LOOKUP_USER (kernel >= 4.18). In this case, subvolumes which
+ * cannot be accessed (e.g., due to permissions or other mounts) will be
+ * skipped.
+ *
  * The returned iterator must be freed with
  * btrfs_util_destroy_subvolume_iterator().
  *
@@ -555,7 +564,8 @@ int btrfs_util_subvolume_iterator_fd(const struct 
btrfs_util_subvolume_iterator
  * Must be freed with free().
  * @id_ret: Returned subvolume ID. May be %NULL.
  *
- * This requires appropriate privilege (CAP_SYS_ADMIN).
+ * This requires appropriate privilege (CAP_SYS_ADMIN) for kernels < 4.18. See
+ * btrfs_util_create_subvolume_iterator().
  *
  * Return: %BTRFS_UTIL_OK on success, %BTRFS_UTIL_ERROR_STOP_ITERATION if there
  * are no more subvolumes, non-zero error code on failure.
@@ -574,7 +584,8 @@ enum btrfs_util_error 
btrfs_util_subvolume_iterator_next(struct btrfs_util_subvo
  * This convenience function basically combines
  * btrfs_util_subvolume_iterator_next() and btrfs_util_subvolume_info().
  *
- * This requires appropriate privilege (CAP_SYS_ADMIN).
+ * This requires appropriate privilege (CAP_SYS_ADMIN) for kernels < 4.18. See
+ * btrfs_util_create_subvolume_iterator().
  *
  * Return: See btrfs_util_subvolume_iterator_next().
  */
diff --git a/libbtrfsutil/errors.c b/libbtrfsutil/errors.c
index cf968b03..d39b38d0 100644
--- a/libbtrfsutil/errors.c
+++ b/libbtrfsutil/errors.c
@@ -47,6 +47,12 @@ static const char * const error_messages[] = {
[BTRFS_UTIL_ERROR_WAIT_SYNC_FAILED] = "Could not wait for filesystem 
sync",
[BTRFS_UTIL_ERROR_GET_SUBVOL_INFO_FAILED] =
"Could not get subvolume information with 
BTRFS_IOC_GET_SUBVOL_INFO",
+   [BTRFS_UTIL_ERROR_GET_SUBVOL_ROOTREF_FAILED] =
+   "Could not get rootref information with 
BTRFS_IOC_GET_SUBVOL_ROOTREF",
+   [BTRFS_UTIL_ERROR_INO_LOOKUP_USER_FAILED] =
+   "Could not resolve subvolume path with 
BTRFS_IOC_INO_LOOKUP_USER",
+   [BTRFS_UTIL_ERROR_FS_INFO_FAILED] =
+   "Could not get filesystem information",
 };
 
 PUBLIC const char *btrfs_util_strerror(enum btrfs_util_error err)
diff --git a/libbtrfsutil/python/tests/test_subvolume.py 
b/libbtrfsutil/python/tests/test_subvolume.py
index 55ebf34d..99ec97bc 100644
--- a/libbtrfsutil/python/tests/test_subvolume.py
+++ b/libbtrfsutil/python/tests/test_subvolume.py
@@ -20,6 +20,7 @@ import errno
 import os
 import os.path
 from pathlib import PurePath
+import subprocess
 import traceback
 
 import btrfsutil
@@ -27,6 +28,8 @@ from tests import (
 BtrfsTestCase,
 drop_privs,
 HAVE_PATH_LIKE,
+NOBODY_UID,
+regain_privs,
 skipUnlessHaveNobody,
 )
 
@@ -354,69 +357,136 @@ class TestSubvolume(BtrfsTestCase):
 with self.subTest(type=type(arg)):
 self.assertEqual(btrfsutil.deleted_subvolumes(arg), [256])
 
-def test_subvolume_iterator(self):
-pwd = os.getcwd()
-try:
-os.chdir(self.mountpoint)
-btrfsutil.create_subvolume('foo')
-
-with btrfsutil.SubvolumeIterator('.', info=True) as it:
-path, subvol = next(it)
-self.assertEqual(path, 'foo')
-self.assertIsInstance(subvol, btrfsutil.SubvolumeInfo)
-self.assertEqual(subvol.id, 256)
-self.assertEqual(subvol.parent_id, 5)
-self.assertRaises(StopIteration, next, it)
-
-btrfsutil.create_subvolume('foo/bar')
-btrfsutil.create_subvolume('foo/bar/baz')
-
-subvols = 

[PATCH 06/10] libbtrfsutil: allow tests to create multiple Btrfs instances

2018-11-13 Thread Omar Sandoval
From: Omar Sandoval 

Some upcoming tests will need to create a second Btrfs filesystem, so
add support for this to the test helpers.

Signed-off-by: Omar Sandoval 
---
 libbtrfsutil/python/tests/__init__.py | 35 +--
 1 file changed, 22 insertions(+), 13 deletions(-)

diff --git a/libbtrfsutil/python/tests/__init__.py 
b/libbtrfsutil/python/tests/__init__.py
index 4bc11990..9fd6f6de 100644
--- a/libbtrfsutil/python/tests/__init__.py
+++ b/libbtrfsutil/python/tests/__init__.py
@@ -57,14 +57,18 @@ def regain_privs():
 
 @unittest.skipIf(os.geteuid() != 0, 'must be run as root')
 class BtrfsTestCase(unittest.TestCase):
-def setUp(self):
-self.mountpoint = tempfile.mkdtemp()
+def __init__(self, *args, **kwds):
+super().__init__(*args, **kwds)
+self._mountpoints = []
+
+def mount_btrfs(self):
+mountpoint = tempfile.mkdtemp()
 try:
 with tempfile.NamedTemporaryFile(delete=False) as f:
 os.truncate(f.fileno(), 1024 * 1024 * 1024)
-self.image = f.name
+image = f.name
 except Exception as e:
-os.rmdir(self.mountpoint)
+os.rmdir(mountpoint)
 raise e
 
 if os.path.exists('../../mkfs.btrfs'):
@@ -72,19 +76,24 @@ class BtrfsTestCase(unittest.TestCase):
 else:
 mkfs = 'mkfs.btrfs'
 try:
-subprocess.check_call([mkfs, '-q', self.image])
-subprocess.check_call(['mount', '-o', 'loop', '--', self.image, 
self.mountpoint])
+subprocess.check_call([mkfs, '-q', image])
+subprocess.check_call(['mount', '-o', 'loop', '--', image, 
mountpoint])
 except Exception as e:
-os.remove(self.image)
-os.rmdir(self.mountpoint)
+os.rmdir(mountpoint)
+os.remove(image)
 raise e
 
+self._mountpoints.append((mountpoint, image))
+return mountpoint, image
+
+def setUp(self):
+self.mountpoint, self.image = self.mount_btrfs()
+
 def tearDown(self):
-try:
-subprocess.check_call(['umount', self.mountpoint])
-finally:
-os.remove(self.image)
-os.rmdir(self.mountpoint)
+for mountpoint, image in self._mountpoints:
+subprocess.call(['umount', '-R', mountpoint])
+os.rmdir(mountpoint)
+os.remove(image)
 
 @staticmethod
 def path_or_fd(path, open_flags=os.O_RDONLY):
-- 
2.19.1



[PATCH 03/10] libbtrfsutil: document qgroup_inherit parameter in Python bindings

2018-11-13 Thread Omar Sandoval
From: Omar Sandoval 

This has been supported since day one, but it wasn't documented.

Signed-off-by: Omar Sandoval 
---
 libbtrfsutil/python/module.c | 13 +
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/libbtrfsutil/python/module.c b/libbtrfsutil/python/module.c
index 625cc9c6..f8260c84 100644
--- a/libbtrfsutil/python/module.c
+++ b/libbtrfsutil/python/module.c
@@ -233,15 +233,18 @@ static PyMethodDef btrfsutil_methods[] = {
 "this ID instead of the given path"},
{"create_subvolume", (PyCFunction)create_subvolume,
 METH_VARARGS | METH_KEYWORDS,
-"create_subvolume(path, async_=False)\n\n"
+"create_subvolume(path, async_=False, qgroup_inherit=None)\n\n"
 "Create a new subvolume.\n\n"
 "Arguments:\n"
 "path -- string, bytes, or path-like object\n"
 "async_ -- create the subvolume without waiting for it to commit to\n"
-"disk and return the transaction ID"},
+"disk and return the transaction ID\n"
+"qgroup_inherit -- optional QgroupInherit object of qgroups to\n"
+"inherit from"},
{"create_snapshot", (PyCFunction)create_snapshot,
 METH_VARARGS | METH_KEYWORDS,
-"create_snapshot(source, path, recursive=False, read_only=False, 
async_=False)\n\n"
+"create_snapshot(source, path, recursive=False, read_only=False,\n"
+"async_=False, qgroup_inherit=None)\n\n"
 "Create a new snapshot.\n\n"
 "Arguments:\n"
 "source -- string, bytes, path-like object, or open file descriptor\n"
@@ -249,7 +252,9 @@ static PyMethodDef btrfsutil_methods[] = {
 "recursive -- also snapshot child subvolumes\n"
 "read_only -- create a read-only snapshot\n"
 "async_ -- create the subvolume without waiting for it to commit to\n"
-"disk and return the transaction ID"},
+"disk and return the transaction ID\n"
+"qgroup_inherit -- optional QgroupInherit object of qgroups to\n"
+"inherit from"},
{"delete_subvolume", (PyCFunction)delete_subvolume,
 METH_VARARGS | METH_KEYWORDS,
 "delete_subvolume(path, recursive=False)\n\n"
-- 
2.19.1



[PATCH 05/10] libbtrfsutil: add test helpers for dropping privileges

2018-11-13 Thread Omar Sandoval
From: Omar Sandoval 

These will be used for testing some upcoming changes which allow
unprivileged operations.

Signed-off-by: Omar Sandoval 
---
 libbtrfsutil/python/tests/__init__.py | 31 ++-
 1 file changed, 30 insertions(+), 1 deletion(-)

diff --git a/libbtrfsutil/python/tests/__init__.py 
b/libbtrfsutil/python/tests/__init__.py
index 35550e0a..4bc11990 100644
--- a/libbtrfsutil/python/tests/__init__.py
+++ b/libbtrfsutil/python/tests/__init__.py
@@ -15,14 +15,44 @@
 # You should have received a copy of the GNU Lesser General Public License
 # along with libbtrfsutil.  If not, see .
 
+import contextlib
 import os
 from pathlib import PurePath
+import pwd
 import subprocess
 import tempfile
 import unittest
 
 
 HAVE_PATH_LIKE = hasattr(PurePath, '__fspath__')
+try:
+NOBODY_UID = pwd.getpwnam('nobody').pw_uid
+skipUnlessHaveNobody = lambda func: func
+except KeyError:
+NOBODY_UID = None
+skipUnlessHaveNobody = unittest.skip('must have nobody user')
+
+
+@contextlib.contextmanager
+def drop_privs():
+try:
+os.seteuid(NOBODY_UID)
+yield
+finally:
+os.seteuid(0)
+
+
+@contextlib.contextmanager
+def regain_privs():
+uid = os.geteuid()
+if uid:
+try:
+os.seteuid(0)
+yield
+finally:
+os.seteuid(uid)
+else:
+yield
 
 
 @unittest.skipIf(os.geteuid() != 0, 'must be run as root')
@@ -67,4 +97,3 @@ class BtrfsTestCase(unittest.TestCase):
 yield fd
 finally:
 os.close(fd)
-
-- 
2.19.1



[PATCH 01/10] libbtrfsutil: use top=0 as default for SubvolumeIterator()

2018-11-13 Thread Omar Sandoval
From: Omar Sandoval 

Right now, we're defaulting to top=5 (i.e, all subvolumes). The
documented default is top=0 (i.e, only beneath the given path). This is
the expected behavior. Fix it and make the test cases cover it.

Reported-by: Jonathan Lemon 
Signed-off-by: Omar Sandoval 
---
 libbtrfsutil/python/subvolume.c | 2 +-
 libbtrfsutil/python/tests/test_subvolume.py | 2 ++
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/libbtrfsutil/python/subvolume.c b/libbtrfsutil/python/subvolume.c
index 069e606b..6ecde1f6 100644
--- a/libbtrfsutil/python/subvolume.c
+++ b/libbtrfsutil/python/subvolume.c
@@ -525,7 +525,7 @@ static int SubvolumeIterator_init(SubvolumeIterator *self, 
PyObject *args,
static char *keywords[] = {"path", "top", "info", "post_order", NULL};
struct path_arg path = {.allow_fd = true};
enum btrfs_util_error err;
-   unsigned long long top = 5;
+   unsigned long long top = 0;
int info = 0;
int post_order = 0;
int flags = 0;
diff --git a/libbtrfsutil/python/tests/test_subvolume.py 
b/libbtrfsutil/python/tests/test_subvolume.py
index 93396cba..0788a564 100644
--- a/libbtrfsutil/python/tests/test_subvolume.py
+++ b/libbtrfsutil/python/tests/test_subvolume.py
@@ -353,6 +353,7 @@ class TestSubvolume(BtrfsTestCase):
 with self.subTest(type=type(arg)):
 self.assertEqual(list(btrfsutil.SubvolumeIterator(arg)), 
subvols)
 self.assertEqual(list(btrfsutil.SubvolumeIterator('.', top=0)), 
subvols)
+self.assertEqual(list(btrfsutil.SubvolumeIterator('foo', top=5)), 
subvols)
 
 self.assertEqual(list(btrfsutil.SubvolumeIterator('.', 
post_order=True)),
  [('foo/bar/baz', 258),
@@ -365,6 +366,7 @@ class TestSubvolume(BtrfsTestCase):
 ]
 
 self.assertEqual(list(btrfsutil.SubvolumeIterator('.', top=256)), 
subvols)
+self.assertEqual(list(btrfsutil.SubvolumeIterator('foo')), subvols)
 self.assertEqual(list(btrfsutil.SubvolumeIterator('foo', top=0)), 
subvols)
 
 os.rename('foo/bar/baz', 'baz')
-- 
2.19.1



[PATCH 00/10] btrfs-progs: my libbtrfsutil patch queue

2018-11-13 Thread Omar Sandoval
From: Omar Sandoval 

Hi,

This series contains my backlog of libbtrfsutil changes which I've been
collecting over the past few weeks.

Patches 1-4 are fixes. Patches 5-6 add functionality to the unit tests
which is needed for patches 7-8. Patches 7-8 add support for the
unprivileged ioctls added in Linux 4.18; more on those below. Patch 9
bumps the library version. Patch 10 adds documentation for the available
API along with examples.

Patches 7-8 are based on Misono Tomohiro's previous patch series [1],
with a few important changes.

- Both subvolume_info() and create_subvolume_iterator() now have unit
  tests for the unprivileged case.
- Both no longer explicitly check that top == 0 in the unprivileged
  case, since that will already fail with a clear permission error.
- Unprivileged iteration is much simpler: it uses openat() instead of
  fchdir() and is based more closely on the original tree search
  variant. This fixes a bug in post-order iteration in Misono's version.
- Unprivileged iteration does _not_ support passing in a non-subvolume
  path; if this behavior is desired, I'd like it to be a separate change
  with an explicit flag.

Please take a look.

Thanks!

1: https://www.spinics.net/lists/linux-btrfs/msg79285.html

Omar Sandoval (10):
  libbtrfsutil: use top=0 as default for SubvolumeIterator()
  libbtrfsutil: change async parameters to async_ in Python bindings
  libbtrfsutil: document qgroup_inherit parameter in Python bindings
  libbtrfsutil: use SubvolumeIterator as context manager in tests
  libbtrfsutil: add test helpers for dropping privileges
  libbtrfsutil: allow tests to create multiple Btrfs instances
  libbtrfsutil: relax the privileges of subvolume_info()
  libbtrfsutil: relax the privileges of subvolume iterator
  libbtrfsutil: bump version to 1.1.0
  libbtrfsutil: document API in README

 libbtrfsutil/README.md  | 422 +++-
 libbtrfsutil/btrfsutil.h|  21 +-
 libbtrfsutil/errors.c   |   8 +
 libbtrfsutil/python/module.c|  17 +-
 libbtrfsutil/python/subvolume.c |   6 +-
 libbtrfsutil/python/tests/__init__.py   |  66 ++-
 libbtrfsutil/python/tests/test_subvolume.py | 215 +++---
 libbtrfsutil/subvolume.c| 407 ---
 8 files changed, 1029 insertions(+), 133 deletions(-)

-- 
2.19.1



[PATCH 02/10] libbtrfsutil: change async parameters to async_ in Python bindings

2018-11-13 Thread Omar Sandoval
From: Omar Sandoval 

async became a keyword in Python 3.7, so, e.g., create_subvolume('foo',
async=True) is now a syntax error. Fix it with the Python convention of
adding a trailing underscore to the keyword (async -> async_). This is
what several other Python libraries did to handle this.

Signed-off-by: Omar Sandoval 
---
 libbtrfsutil/python/module.c| 8 
 libbtrfsutil/python/subvolume.c | 4 ++--
 libbtrfsutil/python/tests/test_subvolume.py | 4 ++--
 3 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/libbtrfsutil/python/module.c b/libbtrfsutil/python/module.c
index 2dbdc7be..625cc9c6 100644
--- a/libbtrfsutil/python/module.c
+++ b/libbtrfsutil/python/module.c
@@ -233,22 +233,22 @@ static PyMethodDef btrfsutil_methods[] = {
 "this ID instead of the given path"},
{"create_subvolume", (PyCFunction)create_subvolume,
 METH_VARARGS | METH_KEYWORDS,
-"create_subvolume(path, async=False)\n\n"
+"create_subvolume(path, async_=False)\n\n"
 "Create a new subvolume.\n\n"
 "Arguments:\n"
 "path -- string, bytes, or path-like object\n"
-"async -- create the subvolume without waiting for it to commit to\n"
+"async_ -- create the subvolume without waiting for it to commit to\n"
 "disk and return the transaction ID"},
{"create_snapshot", (PyCFunction)create_snapshot,
 METH_VARARGS | METH_KEYWORDS,
-"create_snapshot(source, path, recursive=False, read_only=False, 
async=False)\n\n"
+"create_snapshot(source, path, recursive=False, read_only=False, 
async_=False)\n\n"
 "Create a new snapshot.\n\n"
 "Arguments:\n"
 "source -- string, bytes, path-like object, or open file descriptor\n"
 "path -- string, bytes, or path-like object\n"
 "recursive -- also snapshot child subvolumes\n"
 "read_only -- create a read-only snapshot\n"
-"async -- create the subvolume without waiting for it to commit to\n"
+"async_ -- create the subvolume without waiting for it to commit to\n"
 "disk and return the transaction ID"},
{"delete_subvolume", (PyCFunction)delete_subvolume,
 METH_VARARGS | METH_KEYWORDS,
diff --git a/libbtrfsutil/python/subvolume.c b/libbtrfsutil/python/subvolume.c
index 6ecde1f6..0f893b91 100644
--- a/libbtrfsutil/python/subvolume.c
+++ b/libbtrfsutil/python/subvolume.c
@@ -322,7 +322,7 @@ PyObject *set_default_subvolume(PyObject *self, PyObject 
*args, PyObject *kwds)
 
 PyObject *create_subvolume(PyObject *self, PyObject *args, PyObject *kwds)
 {
-   static char *keywords[] = {"path", "async", "qgroup_inherit", NULL};
+   static char *keywords[] = {"path", "async_", "qgroup_inherit", NULL};
struct path_arg path = {.allow_fd = false};
enum btrfs_util_error err;
int async = 0;
@@ -352,7 +352,7 @@ PyObject *create_subvolume(PyObject *self, PyObject *args, 
PyObject *kwds)
 PyObject *create_snapshot(PyObject *self, PyObject *args, PyObject *kwds)
 {
static char *keywords[] = {
-   "source", "path", "recursive", "read_only", "async",
+   "source", "path", "recursive", "read_only", "async_",
"qgroup_inherit", NULL,
};
struct path_arg src = {.allow_fd = true}, dst = {.allow_fd = false};
diff --git a/libbtrfsutil/python/tests/test_subvolume.py 
b/libbtrfsutil/python/tests/test_subvolume.py
index 0788a564..f2a4cdb8 100644
--- a/libbtrfsutil/python/tests/test_subvolume.py
+++ b/libbtrfsutil/python/tests/test_subvolume.py
@@ -202,7 +202,7 @@ class TestSubvolume(BtrfsTestCase):
 btrfsutil.create_subvolume(subvol + '6//')
 self.assertTrue(btrfsutil.is_subvolume(subvol + '6'))
 
-transid = btrfsutil.create_subvolume(subvol + '7', async=True)
+transid = btrfsutil.create_subvolume(subvol + '7', async_=True)
 self.assertTrue(btrfsutil.is_subvolume(subvol + '7'))
 self.assertGreater(transid, 0)
 
@@ -265,7 +265,7 @@ class TestSubvolume(BtrfsTestCase):
 btrfsutil.create_snapshot(subvol, snapshot + '2', recursive=True)
 self.assertTrue(os.path.exists(os.path.join(snapshot + '2', 
'nested/more_nested/nested_dir')))
 
-transid = btrfsutil.create_snapshot(subvol, snapshot + '3', 
recursive=True, async=True)
+transid = btrfsutil.create_snapshot(subvol, snapshot + '3', 
recursive=True, async_=True)
 self.assertTrue(os.path.exists(os.path.join(snapshot + '3', 
'nested/more_nested/nested_dir')))
 self.assertGreater(transid, 0)
 
-- 
2.19.1



[PATCH 04/10] libbtrfsutil: use SubvolumeIterator as context manager in tests

2018-11-13 Thread Omar Sandoval
From: Omar Sandoval 

We're leaking file descriptors, which makes it impossible to clean up
the temporary mount point created by the test.

Signed-off-by: Omar Sandoval 
---
 libbtrfsutil/python/tests/test_subvolume.py | 51 -
 1 file changed, 30 insertions(+), 21 deletions(-)

diff --git a/libbtrfsutil/python/tests/test_subvolume.py 
b/libbtrfsutil/python/tests/test_subvolume.py
index f2a4cdb8..4049b08e 100644
--- a/libbtrfsutil/python/tests/test_subvolume.py
+++ b/libbtrfsutil/python/tests/test_subvolume.py
@@ -334,11 +334,13 @@ class TestSubvolume(BtrfsTestCase):
 os.chdir(self.mountpoint)
 btrfsutil.create_subvolume('foo')
 
-path, subvol = next(btrfsutil.SubvolumeIterator('.', info=True))
-self.assertEqual(path, 'foo')
-self.assertIsInstance(subvol, btrfsutil.SubvolumeInfo)
-self.assertEqual(subvol.id, 256)
-self.assertEqual(subvol.parent_id, 5)
+with btrfsutil.SubvolumeIterator('.', info=True) as it:
+path, subvol = next(it)
+self.assertEqual(path, 'foo')
+self.assertIsInstance(subvol, btrfsutil.SubvolumeInfo)
+self.assertEqual(subvol.id, 256)
+self.assertEqual(subvol.parent_id, 5)
+self.assertRaises(StopIteration, next, it)
 
 btrfsutil.create_subvolume('foo/bar')
 btrfsutil.create_subvolume('foo/bar/baz')
@@ -350,30 +352,37 @@ class TestSubvolume(BtrfsTestCase):
 ]
 
 for arg in self.path_or_fd('.'):
-with self.subTest(type=type(arg)):
-self.assertEqual(list(btrfsutil.SubvolumeIterator(arg)), 
subvols)
-self.assertEqual(list(btrfsutil.SubvolumeIterator('.', top=0)), 
subvols)
-self.assertEqual(list(btrfsutil.SubvolumeIterator('foo', top=5)), 
subvols)
-
-self.assertEqual(list(btrfsutil.SubvolumeIterator('.', 
post_order=True)),
- [('foo/bar/baz', 258),
-  ('foo/bar', 257),
-  ('foo', 256)])
+with self.subTest(type=type(arg)), 
btrfsutil.SubvolumeIterator(arg) as it:
+self.assertEqual(list(it), subvols)
+with btrfsutil.SubvolumeIterator('.', top=0) as it:
+self.assertEqual(list(it), subvols)
+with btrfsutil.SubvolumeIterator('foo', top=5) as it:
+self.assertEqual(list(it), subvols)
+
+with btrfsutil.SubvolumeIterator('.', post_order=True) as it:
+self.assertEqual(list(it),
+ [('foo/bar/baz', 258),
+  ('foo/bar', 257),
+  ('foo', 256)])
 
 subvols = [
 ('bar', 257),
 ('bar/baz', 258),
 ]
 
-self.assertEqual(list(btrfsutil.SubvolumeIterator('.', top=256)), 
subvols)
-self.assertEqual(list(btrfsutil.SubvolumeIterator('foo')), subvols)
-self.assertEqual(list(btrfsutil.SubvolumeIterator('foo', top=0)), 
subvols)
+with btrfsutil.SubvolumeIterator('.', top=256) as it:
+self.assertEqual(list(it), subvols)
+with btrfsutil.SubvolumeIterator('foo') as it:
+self.assertEqual(list(it), subvols)
+with btrfsutil.SubvolumeIterator('foo', top=0) as it:
+self.assertEqual(list(it), subvols)
 
 os.rename('foo/bar/baz', 'baz')
-self.assertEqual(sorted(btrfsutil.SubvolumeIterator('.')),
- [('baz', 258),
-  ('foo', 256),
-  ('foo/bar', 257)])
+with btrfsutil.SubvolumeIterator('.') as it:
+self.assertEqual(sorted(it),
+ [('baz', 258),
+  ('foo', 256),
+  ('foo/bar', 257)])
 
 with btrfsutil.SubvolumeIterator('.') as it:
 self.assertGreaterEqual(it.fileno(), 0)
-- 
2.19.1



Re: [PATCH] btrfs: Fix suspicious RCU usage warning in device_list_add

2018-11-13 Thread Nikolay Borisov



On 14.11.18 г. 9:24 ч., Lu Fengqi wrote:
> =
> WARNING: suspicious RCU usage
> 4.20.0-rc2+ #23 Tainted: G   O
> -
> fs/btrfs/volumes.c:886 suspicious rcu_dereference_check() usage!
> 
> Use btrfs_info_in_rcu instead of pr_info for the required lock/unlock of
> RCU string.
> 
> Fixes: 1f265fc6f58b ("btrfs: harden agaist duplicate fsid on scanned devices")
> Signed-off-by: Lu Fengqi 

Reviewed-by: Nikolay Borisov 

> ---
>  fs/btrfs/volumes.c | 8 
>  1 file changed, 4 insertions(+), 4 deletions(-)
> 
> diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
> index 2186300bab91..6039ae5c549e 100644
> --- a/fs/btrfs/volumes.c
> +++ b/fs/btrfs/volumes.c
> @@ -873,15 +873,15 @@ static noinline struct btrfs_device 
> *device_list_add(const char *path,
>   if (device->bdev != path_bdev) {
>   bdput(path_bdev);
>   mutex_unlock(_devices->device_list_mutex);
> - pr_warn(
> - "BTRFS: duplicate device fsid:devid for %pU:%llu old:%s 
> new:%s\n",
> + btrfs_warn_in_rcu(device->fs_info,
> + "duplicate device fsid:devid for %pU:%llu old:%s 
> new:%s\n",
>   disk_super->fsid, devid,
>   rcu_str_deref(device->name), path);
>   return ERR_PTR(-EEXIST);
>   }
>   bdput(path_bdev);
> - pr_info(
> - "BTRFS: device fsid %pU devid %llu moved old:%s 
> new:%s\n",
> + btrfs_info_in_rcu(device->fs_info,
> + "device fsid %pU devid %llu moved old:%s new:%s\n",
>   disk_super->fsid, devid,
>   rcu_str_deref(device->name), path);
>   }
> 


[PATCH] btrfs: Fix suspicious RCU usage warning in device_list_add

2018-11-13 Thread Lu Fengqi
=
WARNING: suspicious RCU usage
4.20.0-rc2+ #23 Tainted: G   O
-
fs/btrfs/volumes.c:886 suspicious rcu_dereference_check() usage!

Use btrfs_info_in_rcu instead of pr_info for the required lock/unlock of
RCU string.

Fixes: 1f265fc6f58b ("btrfs: harden agaist duplicate fsid on scanned devices")
Signed-off-by: Lu Fengqi 
---
 fs/btrfs/volumes.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 2186300bab91..6039ae5c549e 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -873,15 +873,15 @@ static noinline struct btrfs_device 
*device_list_add(const char *path,
if (device->bdev != path_bdev) {
bdput(path_bdev);
mutex_unlock(_devices->device_list_mutex);
-   pr_warn(
-   "BTRFS: duplicate device fsid:devid for %pU:%llu old:%s 
new:%s\n",
+   btrfs_warn_in_rcu(device->fs_info,
+   "duplicate device fsid:devid for %pU:%llu old:%s 
new:%s\n",
disk_super->fsid, devid,
rcu_str_deref(device->name), path);
return ERR_PTR(-EEXIST);
}
bdput(path_bdev);
-   pr_info(
-   "BTRFS: device fsid %pU devid %llu moved old:%s 
new:%s\n",
+   btrfs_info_in_rcu(device->fs_info,
+   "device fsid %pU devid %llu moved old:%s new:%s\n",
disk_super->fsid, devid,
rcu_str_deref(device->name), path);
}
-- 
2.19.1





[PATCH] btrfs: qgroup: Move reserved data account from btrfs_delayed_ref_head to btrfs_qgroup_extent_record

2018-11-13 Thread Qu Wenruo
[BUG]
Btrfs/139 will fail with a pretty high possibility if the testing
machine (VM) only has 2G ram.

Resulting the final write success while it should fail due to EDQUOT,
and the result fs will has quota exceeding the limit by 16K.

The simplified reproducer will be: (needs a 2G ram VM)

  mkfs.btrfs -f $dev
  mount $dev $mnt

  btrfs subv create $mnt/subv
  btrfs quota enable $mnt
  btrfs quota rescan -w $mnt
  btrfs qgroup limit -e 1G $mnt/subv

  for i in $(seq -w  1 8); do
xfs_io -f -c "pwrite 0 128M" $mnt/subv/file_$i > /dev/null
echo "file $i written" > /dev/kmsg
  done
  sync
  btrfs qgroup show -pcre --raw $mnt

The last pwrite will not trigger EDQUOT and final qgroup show will show
something like:

  qgroupid rfer excl max_rfer max_excl parent  child
       --  -
  0/5 1638416384 none none --- ---
  0/256  1073758208   1073758208 none   1073741824 --- ---

And 1073758208 is larger than
  > 1073741824.

[CAUSE]
It's a bug in btrfs qgroup data reserved space management.

For quota limit, we must ensure that:
  reserved (data + metadata) + rfer/excl <= limit

Since rfer/excl is only updated at transaction commmit time, reserved
space needs to be taken special care.

One important part of reserved space is data, and for a new data extent
written to disk, we still need to take the reserved space until
rfer/excl numbers get update.

Originally when an ordered extent finishes, we migrate the reserved
qgroup data space from extent_io tree to delayed ref head of the data
extent, expecting delayed ref will only be cleaned up at commit
transaction time.

However for small RAM machine, due to memory pressure dirty pages can be
flushed back to disk without committing a transaction.

The  related events will be something like:

  BTRFS info (device dm-3): has skinny extents
  file 1 written
  btrfs_finish_ordered_io: ino=258 ordered offset=0 len=54947840
  btrfs_finish_ordered_io: ino=258 ordered offset=54947840 len=5636096
  btrfs_finish_ordered_io: ino=258 ordered offset=61153280 len=57344
  btrfs_finish_ordered_io: ino=258 ordered offset=61210624 len=8192
  btrfs_finish_ordered_io: ino=258 ordered offset=60583936 len=569344
  cleanup_ref_head: num_bytes=54947840
  cleanup_ref_head: num_bytes=5636096
  cleanup_ref_head: num_bytes=569344
  cleanup_ref_head: num_bytes=57344
  cleanup_ref_head: num_bytes=8192
   This will free qgroup data reserved space
  file 2 written
  ...
  file 8 written
  cleanup_ref_head: num_bytes=8192
  ...
  btrfs_commit_transaction  <<< the only transaction committed during
the test

When file 2 is written, we have already freed 128M reserved qgroup data
space for ino 258. Thus later write won't trigger EDQUOT.

This allows us to write more data beyond qgroup limit.

In my 2G ram VM, it could reach about 1.2G before hitting EDQUOT.

[FIX]
By moving reserved qgroup data space from btrfs_delayed_ref_head to
btrfs_qgroup_extent_record, we can ensure that reserved qgroup data
space won't be freed half way before commit transaction, thus fix the
problem.

Fixes: f64d5ca86821 ("btrfs: delayed_ref: Add new function to record reserved 
space into delayed ref")
Signed-off-by: Qu Wenruo 
---
I tried to create a dedicated test case for this, however drop_caches
interface would also trigger transaction commit, thus allowing bad kernel
to pass (as transaction commit updates qgroup numbers, killing the leakage).

Any idea to trigger dirty data pages writeback while without triggering
a transaction is welcomed.
---
 fs/btrfs/delayed-ref.c   | 15 ---
 fs/btrfs/delayed-ref.h   | 11 ---
 fs/btrfs/extent-tree.c   |  3 ---
 fs/btrfs/qgroup.c| 19 +++
 fs/btrfs/qgroup.h| 20 +++-
 include/trace/events/btrfs.h | 29 -
 6 files changed, 30 insertions(+), 67 deletions(-)

diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 9301b3ad9217..7e74bf42f2db 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -581,17 +581,14 @@ static void init_delayed_ref_head(struct 
btrfs_delayed_ref_head *head_ref,
RB_CLEAR_NODE(_ref->href_node);
head_ref->processing = 0;
head_ref->total_ref_mod = count_mod;
-   head_ref->qgroup_reserved = 0;
-   head_ref->qgroup_ref_root = 0;
spin_lock_init(_ref->lock);
mutex_init(_ref->mutex);
 
if (qrecord) {
if (ref_root && reserved) {
-   head_ref->qgroup_ref_root = ref_root;
-   head_ref->qgroup_reserved = reserved;
+   qrecord->data_rsv = reserved;
+   qrecord->data_rsv_refroot = ref_root;
}
-
qrecord->bytenr = bytenr;
qrecord->num_bytes = 

[PATCH 4/9 v2.1] btrfs: fix UAF due to race between replace start and cancel

2018-11-13 Thread Anand Jain
replace cancel thread can race with the replace start thread and if
fs_info::scrubs_running is not yet set the btrfs_scrub_cancel() will fail
to stop the scrub thread, so the scrub thread continue with the scrub for
replace which then shall try to write to the target device and which is
already freed by the cancel thread. Please ref to the logs below.

So scrub_setup_ctx() warns as tgtdev is null.

static noinline_for_stack
struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int
is_dev_replace)
{
::
if (is_dev_replace) {
WARN_ON(!fs_info->dev_replace.tgtdev);  <===
sctx->pages_per_wr_bio = SCRUB_PAGES_PER_WR_BIO;
sctx->wr_tgtdev = fs_info->dev_replace.tgtdev;
sctx->flush_all_writes = false;
}

[ 6724.497655] BTRFS info (device sdb): dev_replace from /dev/sdb (devid 1) to 
/dev/sdc started
[ 6753.945017] BTRFS info (device sdb): dev_replace from /dev/sdb (devid 1) to 
/dev/sdc canceled
[ 6852.426700] WARNING: CPU: 0 PID: 4494 at fs/btrfs/scrub.c:622 
scrub_setup_ctx.isra.19+0x220/0x230 [btrfs]
::
[ 6852.428928] RIP: 0010:scrub_setup_ctx.isra.19+0x220/0x230 [btrfs]
::
[ 6852.432970] Call Trace:
[ 6852.433202]  btrfs_scrub_dev+0x19b/0x5c0 [btrfs]
[ 6852.433471]  btrfs_dev_replace_start+0x48c/0x6a0 [btrfs]
[ 6852.433800]  btrfs_dev_replace_by_ioctl+0x3a/0x60 [btrfs]
[ 6852.434097]  btrfs_ioctl+0x2476/0x2d20 [btrfs]
[ 6852.434365]  ? do_sigaction+0x7d/0x1e0
[ 6852.434623]  do_vfs_ioctl+0xa9/0x6c0
[ 6852.434865]  ? syscall_trace_enter+0x1c8/0x310
[ 6852.435124]  ? syscall_trace_enter+0x1c8/0x310
[ 6852.435387]  ksys_ioctl+0x60/0x90
[ 6852.435663]  __x64_sys_ioctl+0x16/0x20
[ 6852.435907]  do_syscall_64+0x50/0x180
[ 6852.436150]  entry_SYSCALL_64_after_hwframe+0x49/0xbe

Further, as the replace thread enters the
scrub_write_page_to_dev_replace() without the target device it panics

static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
struct scrub_page *spage)
{
::
  bio_set_dev(bio, sbio->dev->bdev); <==

[ 6929.715145] BUG: unable to handle kernel NULL pointer dereference at 
00a0
::
[ 6929.717106] Workqueue: btrfs-scrub btrfs_scrub_helper [btrfs]
[ 6929.717420] RIP: 0010:scrub_write_page_to_dev_replace+0xb4/0x260
[btrfs]
::
[ 6929.721430] Call Trace:
[ 6929.721663]  scrub_write_block_to_dev_replace+0x3f/0x60 [btrfs]
[ 6929.721975]  scrub_bio_end_io_worker+0x1af/0x490 [btrfs]
[ 6929.722277]  normal_work_helper+0xf0/0x4c0 [btrfs]
[ 6929.722552]  process_one_work+0x1f4/0x520
[ 6929.722805]  ? process_one_work+0x16e/0x520
[ 6929.723063]  worker_thread+0x46/0x3d0
[ 6929.723313]  kthread+0xf8/0x130
[ 6929.723544]  ? process_one_work+0x520/0x520
[ 6929.723800]  ? kthread_delayed_work_timer_fn+0x80/0x80
[ 6929.724081]  ret_from_fork+0x3a/0x50

Fix this by letting the btrfs_dev_replace_finishing() to do the job of
cleaning after the cancel, including freeing of the target device.
btrfs_dev_replace_finishing() is called when btrfs_scub_dev() returns
along with the scrub return status.

Signed-off-by: Anand Jain 
---
v2->v2.1: Fix compiler warning. (I couldn't reproduce on gcc 4.8.5)

fs/btrfs/dev-replace.c: In function ‘btrfs_dev_replace_cancel’:
fs/btrfs/dev-replace.c:865:9: warning: ‘result’ may be used uninitialized in 
this function [-Wmaybe-uninitialized]
  return result;
 ^~

v1->v2: pls ref to the cover letter

 fs/btrfs/dev-replace.c | 61 --
 1 file changed, 39 insertions(+), 22 deletions(-)

diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 35ce10f18607..d32d696d931c 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -797,39 +797,56 @@ int btrfs_dev_replace_cancel(struct btrfs_fs_info 
*fs_info)
case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED;
btrfs_dev_replace_write_unlock(dev_replace);
-   goto leave;
+   break;
case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
+   result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
+   tgt_device = dev_replace->tgtdev;
+   src_device = dev_replace->srcdev;
+   btrfs_dev_replace_write_unlock(dev_replace);
+   btrfs_scrub_cancel(fs_info);
+   /*
+* btrfs_dev_replace_finishing() will handle the cleanup part
+*/
+   btrfs_info_in_rcu(fs_info,
+   "dev_replace from %s (devid %llu) to %s canceled",
+   btrfs_dev_name(src_device), src_device->devid,
+   btrfs_dev_name(tgt_device));
+   break;
case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
+   /*
+* scrub doing the replace isn't running so do the cleanup here
+*/
result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;

Re: [PATCH v15.1 00/13] Btrfs In-band De-duplication

2018-11-13 Thread Lu Fengqi
On Tue, Nov 13, 2018 at 02:45:45PM +0100, David Sterba wrote:
>On Tue, Nov 06, 2018 at 02:41:09PM +0800, Lu Fengqi wrote:
>> This patchset can be fetched from github:
>> https://github.com/littleroad/linux.git dedupe_latest
>> 
>> Now the new base is v4.20-rc1.
>
>Before anybody spends more time with this patchset: this is a big
>feature and quite intrusive to several btrfs subsystems. Currently it's
>on hold as it requires finishing the design phase, it's still only the
>in-memory backend and before we claim in-band dedupe, the persistent
>hash tree needs to be at least drafted or prototyped.

Thanks for your explanation. However, I'm not sure why we need to draft
a prototype of the persistent hash tree first when we are talking about
the memory backend.

-- 
Thanks,
Lu

>
>At this point there are several features that are in a more complete
>state so they get preferred when it comes to merging. I would have to
>look up what was agreed long time ago as merging plan, but at this point
>this series would require a lot of work.
>
>




Re: [PATCH 4/9] btrfs: fix UAF due to race between replace start and cancel

2018-11-13 Thread Anand Jain




On 11/14/2018 01:24 AM, David Sterba wrote:

On Sun, Nov 11, 2018 at 10:22:19PM +0800, Anand Jain wrote:

replace cancel thread can race with the replace start thread and if
fs_info::scrubs_running is not yet set the btrfs_scrub_cancel() will fail
to stop the scrub thread, so the scrub thread continue with the scrub for
replace which then shall try to write to the target device and which is
already freed by the cancel thread. Please ref to the logs below.

So scrub_setup_ctx() warns as tgtdev is null.

static noinline_for_stack
struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int
is_dev_replace)
{
::
 if (is_dev_replace) {
 WARN_ON(!fs_info->dev_replace.tgtdev);  <===
 sctx->pages_per_wr_bio = SCRUB_PAGES_PER_WR_BIO;
 sctx->wr_tgtdev = fs_info->dev_replace.tgtdev;
 sctx->flush_all_writes = false;
 }

[ 6724.497655] BTRFS info (device sdb): dev_replace from /dev/sdb (devid 1) to 
/dev/sdc started
[ 6753.945017] BTRFS info (device sdb): dev_replace from /dev/sdb (devid 1) to 
/dev/sdc canceled
[ 6852.426700] WARNING: CPU: 0 PID: 4494 at fs/btrfs/scrub.c:622 
scrub_setup_ctx.isra.19+0x220/0x230 [btrfs]
::
[ 6852.428928] RIP: 0010:scrub_setup_ctx.isra.19+0x220/0x230 [btrfs]
::
[ 6852.432970] Call Trace:
[ 6852.433202]  btrfs_scrub_dev+0x19b/0x5c0 [btrfs]
[ 6852.433471]  btrfs_dev_replace_start+0x48c/0x6a0 [btrfs]
[ 6852.433800]  btrfs_dev_replace_by_ioctl+0x3a/0x60 [btrfs]
[ 6852.434097]  btrfs_ioctl+0x2476/0x2d20 [btrfs]
[ 6852.434365]  ? do_sigaction+0x7d/0x1e0
[ 6852.434623]  do_vfs_ioctl+0xa9/0x6c0
[ 6852.434865]  ? syscall_trace_enter+0x1c8/0x310
[ 6852.435124]  ? syscall_trace_enter+0x1c8/0x310
[ 6852.435387]  ksys_ioctl+0x60/0x90
[ 6852.435663]  __x64_sys_ioctl+0x16/0x20
[ 6852.435907]  do_syscall_64+0x50/0x180
[ 6852.436150]  entry_SYSCALL_64_after_hwframe+0x49/0xbe

Further, as the replace thread enters the
scrub_write_page_to_dev_replace() without the target device it panics

static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
 struct scrub_page *spage)
{
::
   bio_set_dev(bio, sbio->dev->bdev); <==

[ 6929.715145] BUG: unable to handle kernel NULL pointer dereference at 
00a0
::
[ 6929.717106] Workqueue: btrfs-scrub btrfs_scrub_helper [btrfs]
[ 6929.717420] RIP: 0010:scrub_write_page_to_dev_replace+0xb4/0x260
[btrfs]
::
[ 6929.721430] Call Trace:
[ 6929.721663]  scrub_write_block_to_dev_replace+0x3f/0x60 [btrfs]
[ 6929.721975]  scrub_bio_end_io_worker+0x1af/0x490 [btrfs]
[ 6929.722277]  normal_work_helper+0xf0/0x4c0 [btrfs]
[ 6929.722552]  process_one_work+0x1f4/0x520
[ 6929.722805]  ? process_one_work+0x16e/0x520
[ 6929.723063]  worker_thread+0x46/0x3d0
[ 6929.723313]  kthread+0xf8/0x130
[ 6929.723544]  ? process_one_work+0x520/0x520
[ 6929.723800]  ? kthread_delayed_work_timer_fn+0x80/0x80
[ 6929.724081]  ret_from_fork+0x3a/0x50

Fix this by letting the btrfs_dev_replace_finishing() to do the job of
cleaning after the cancel, including freeing of the target device.
btrfs_dev_replace_finishing() is called when btrfs_scub_dev() returns
along with the scrub return status.

Signed-off-by: Anand Jain 
---
  fs/btrfs/dev-replace.c | 61 --
  1 file changed, 39 insertions(+), 22 deletions(-)

diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 35ce10f18607..d32d696d931c 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -797,39 +797,56 @@ int btrfs_dev_replace_cancel(struct btrfs_fs_info 
*fs_info)
case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED;
btrfs_dev_replace_write_unlock(dev_replace);
-   goto leave;
+   break;
case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
+   result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
+   tgt_device = dev_replace->tgtdev;
+   src_device = dev_replace->srcdev;
+   btrfs_dev_replace_write_unlock(dev_replace);
+   btrfs_scrub_cancel(fs_info);
+   /*
+* btrfs_dev_replace_finishing() will handle the cleanup part
+*/
+   btrfs_info_in_rcu(fs_info,
+   "dev_replace from %s (devid %llu) to %s canceled",
+   btrfs_dev_name(src_device), src_device->devid,
+   btrfs_dev_name(tgt_device));
+   break;
case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
+   /*
+* scrub doing the replace isn't running so do the cleanup here
+*/
result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
tgt_device = dev_replace->tgtdev;
src_device = dev_replace->srcdev;
dev_replace->tgtdev = NULL;
dev_replace->srcdev = NULL;
-   break;
-   

Re: [PATCH V7] Btrfs: enhance raid1/10 balance heuristic

2018-11-13 Thread Anand Jain




I am ok with the least used path approach here for the IO routing
that's probably most reasonable in generic configurations. It can
be default read mirror policy as well.

But as I mentioned. Not all configurations would agree to the heuristic
approach here. For example: To make use of the SAN storage cache to
get high IO throughput read must access based on the LBA, And this
heuristic would make matter worst. There are plans to add more
options read_mirror_policy [1].

[1]
https://patchwork.kernel.org/patch/10403299/

I would rather provide the configuration tune-ables to the use
cases rather than fixing it using heuristic. Heuristic are good
only with the known set of IO pattern for which heuristic is
designed for.

This is not the first time you are assuming heuristic would provide
the best possible performance in all use cases. As I mentioned
in the compression heuristic there was no problem statement that
you wanted to address using heuristic, theoretically the integrated
compression heuristic would have to do a lot more computation when
all the file-extents are compressible, its not convenience to me
how compression heuristic would help on a desktop machine where
most of the files are compressible.

IMO heuristic are good only for a set of types of workload. Giving
an option to move away from it for the manual tuning is desired.

Thanks, Anand


On 11/12/2018 07:58 PM, Timofey Titovets wrote:

From: Timofey Titovets 

Currently btrfs raid1/10 balancer bаlance requests to mirrors,
based on pid % num of mirrors.

Make logic understood:
  - if one of underline devices are non rotational
  - Queue length to underline devices

By default try use pid % num_mirrors guessing, but:
  - If one of mirrors are non rotational, repick optimal to it
  - If underline mirror have less queue length then optimal,
repick to that mirror

For avoid round-robin request balancing,
lets round down queue length:
  - By 8 for rotational devs
  - By 2 for all non rotational devs

Some bench results from mail list
(Dmitrii Tcvetkov ):
Benchmark summary (arithmetic mean of 3 runs):
  Mainline Patch

RAID1  | 18.9 MiB/s | 26.5 MiB/s
RAID10 | 30.7 MiB/s | 30.7 MiB/s

mainline, fio got lucky to read from first HDD (quite slow HDD):
Jobs: 1 (f=1): [r(1)][100.0%][r=8456KiB/s,w=0KiB/s][r=264,w=0 IOPS]
   read: IOPS=265, BW=8508KiB/s (8712kB/s)(499MiB/60070msec)
   lat (msec): min=2, max=825, avg=60.17, stdev=65.06

mainline, fio got lucky to read from second HDD (much more modern):
Jobs: 1 (f=1): [r(1)][8.7%][r=11.9MiB/s,w=0KiB/s][r=380,w=0 IOPS]
   read: IOPS=378, BW=11.8MiB/s (12.4MB/s)(710MiB/60051msec)
   lat (usec): min=416, max=644286, avg=42312.74, stdev=48518.56

mainline, fio got lucky to read from an SSD:
Jobs: 1 (f=1): [r(1)][100.0%][r=436MiB/s,w=0KiB/s][r=13.9k,w=0 IOPS]
   read: IOPS=13.9k, BW=433MiB/s (454MB/s)(25.4GiB/60002msec)
   lat (usec): min=343, max=16319, avg=1152.52, stdev=245.36

With the patch, 2 HDDs:
Jobs: 1 (f=1): [r(1)][100.0%][r=17.5MiB/s,w=0KiB/s][r=560,w=0 IOPS]
   read: IOPS=560, BW=17.5MiB/s (18.4MB/s)(1053MiB/60052msec)
   lat (usec): min=435, max=341037, avg=28511.64, stdev=3.14

With the patch, HDD(old one)+SSD:
Jobs: 1 (f=1): [r(1)][100.0%][r=371MiB/s,w=0KiB/s][r=11.9k,w=0 IOPS]
   read: IOPS=11.6k, BW=361MiB/s (379MB/s)(21.2GiB/60084msec)
   lat  (usec): min=363, max=346752, avg=1381.73, stdev=6948.32

Changes:
   v1 -> v2:
 - Use helper part_in_flight() from genhd.c
   to get queue length
 - Move guess code to guess_optimal()
 - Change balancer logic, try use pid % mirror by default
   Make balancing on spinning rust if one of underline devices
   are overloaded
   v2 -> v3:
 - Fix arg for RAID10 - use sub_stripes, instead of num_stripes
   v3 -> v4:
 - Rebased on latest misc-next
   v4 -> v5:
 - Rebased on latest misc-next
   v5 -> v6:
 - Fix spelling
 - Include bench results
   v6 -> v7:
 - Fixes based on Nikolay Borisov review:
   * Assume num == 2
   * Remove "for" loop based on that assumption, where possible
   * No functional changes

Signed-off-by: Timofey Titovets 
Tested-by: Dmitrii Tcvetkov 
Reviewed-by: Dmitrii Tcvetkov 
---
  block/genhd.c  |   1 +
  fs/btrfs/volumes.c | 100 -
  2 files changed, 100 insertions(+), 1 deletion(-)

diff --git a/block/genhd.c b/block/genhd.c
index be5bab20b2ab..939f0c6a2d79 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -81,6 +81,7 @@ void part_in_flight(struct request_queue *q, struct hd_struct 
*part,
 

[PATCH V8] Btrfs: enhance raid1/10 balance heuristic

2018-11-13 Thread Timofey Titovets
From: Timofey Titovets 

Currently btrfs raid1/10 balancer bаlance requests to mirrors,
based on pid % num of mirrors.

Make logic understood:
 - if one of underline devices are non rotational
 - Queue length to underline devices

By default try use pid % num_mirrors guessing, but:
 - If one of mirrors are non rotational, repick optimal to it
 - If underline mirror have less queue length then optimal,
   repick to that mirror

For avoid round-robin request balancing,
lets round down queue length:
 - By 8 for rotational devs
 - By 2 for all non rotational devs

Some bench results from mail list
(Dmitrii Tcvetkov ):
Benchmark summary (arithmetic mean of 3 runs):
 Mainline Patch

RAID1  | 18.9 MiB/s | 26.5 MiB/s
RAID10 | 30.7 MiB/s | 30.7 MiB/s

mainline, fio got lucky to read from first HDD (quite slow HDD):
Jobs: 1 (f=1): [r(1)][100.0%][r=8456KiB/s,w=0KiB/s][r=264,w=0 IOPS]
  read: IOPS=265, BW=8508KiB/s (8712kB/s)(499MiB/60070msec)
  lat (msec): min=2, max=825, avg=60.17, stdev=65.06

mainline, fio got lucky to read from second HDD (much more modern):
Jobs: 1 (f=1): [r(1)][8.7%][r=11.9MiB/s,w=0KiB/s][r=380,w=0 IOPS]
  read: IOPS=378, BW=11.8MiB/s (12.4MB/s)(710MiB/60051msec)
  lat (usec): min=416, max=644286, avg=42312.74, stdev=48518.56

mainline, fio got lucky to read from an SSD:
Jobs: 1 (f=1): [r(1)][100.0%][r=436MiB/s,w=0KiB/s][r=13.9k,w=0 IOPS]
  read: IOPS=13.9k, BW=433MiB/s (454MB/s)(25.4GiB/60002msec)
  lat (usec): min=343, max=16319, avg=1152.52, stdev=245.36

With the patch, 2 HDDs:
Jobs: 1 (f=1): [r(1)][100.0%][r=17.5MiB/s,w=0KiB/s][r=560,w=0 IOPS]
  read: IOPS=560, BW=17.5MiB/s (18.4MB/s)(1053MiB/60052msec)
  lat (usec): min=435, max=341037, avg=28511.64, stdev=3.14

With the patch, HDD(old one)+SSD:
Jobs: 1 (f=1): [r(1)][100.0%][r=371MiB/s,w=0KiB/s][r=11.9k,w=0 IOPS]
  read: IOPS=11.6k, BW=361MiB/s (379MB/s)(21.2GiB/60084msec)
  lat  (usec): min=363, max=346752, avg=1381.73, stdev=6948.32

Changes:
  v1 -> v2:
- Use helper part_in_flight() from genhd.c
  to get queue length
- Move guess code to guess_optimal()
- Change balancer logic, try use pid % mirror by default
  Make balancing on spinning rust if one of underline devices
  are overloaded
  v2 -> v3:
- Fix arg for RAID10 - use sub_stripes, instead of num_stripes
  v3 -> v4:
- Rebased on latest misc-next
  v4 -> v5:
- Rebased on latest misc-next
  v5 -> v6:
- Fix spelling
- Include bench results
  v6 -> v7:
- Fixes based on Nikolay Borisov review:
  * Assume num == 2
  * Remove "for" loop based on that assumption, where possible
  v7 -> v8:
- Add comment about magic '2' num in guess function

Signed-off-by: Timofey Titovets 
Tested-by: Dmitrii Tcvetkov 
Reviewed-by: Dmitrii Tcvetkov 
---
 block/genhd.c  |   1 +
 fs/btrfs/volumes.c | 104 -
 2 files changed, 104 insertions(+), 1 deletion(-)

diff --git a/block/genhd.c b/block/genhd.c
index cff6bdf27226..4ba5ede8969e 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -81,6 +81,7 @@ void part_in_flight(struct request_queue *q, struct hd_struct 
*part,
atomic_read(>in_flight[1]);
}
 }
+EXPORT_SYMBOL_GPL(part_in_flight);
 
 void part_in_flight_rw(struct request_queue *q, struct hd_struct *part,
   unsigned int inflight[2])
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index f435d397019e..d9b5cf31514a 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -13,6 +13,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include "ctree.h"
 #include "extent_map.h"
@@ -28,6 +29,8 @@
 #include "dev-replace.h"
 #include "sysfs.h"
 
+#define BTRFS_RAID_1_10_MAX_MIRRORS 2
+
 const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
[BTRFS_RAID_RAID10] = {
.sub_stripes= 2,
@@ -5166,6 +5169,104 @@ int btrfs_is_parity_mirror(struct btrfs_fs_info 
*fs_info, u64 logical, u64 len)
return ret;
 }
 
+/**
+ * bdev_get_queue_len - return rounded down in flight queue length of bdev
+ *
+ * @bdev: target bdev
+ * @round_down: round factor big for hdd and small for ssd, like 8 and 2
+ */
+static int bdev_get_queue_len(struct block_device *bdev, int round_down)
+{
+   int sum;
+   struct hd_struct *bd_part = bdev->bd_part;
+   struct request_queue *rq = bdev_get_queue(bdev);
+   uint32_t inflight[2] = {0, 0};
+
+   part_in_flight(rq, bd_part, inflight);
+
+   sum = max_t(uint32_t, inflight[0], inflight[1]);
+
+   /*
+* Try prevent switch for 

Re: [PATCH v2 0/6] btrfs: qgroup: Delay subtree scan to reduce overhead

2018-11-13 Thread Qu Wenruo


On 2018/11/14 上午1:58, Filipe Manana wrote:
> On Tue, Nov 13, 2018 at 5:08 PM David Sterba  wrote:
>>
>> On Mon, Nov 12, 2018 at 10:33:33PM +0100, David Sterba wrote:
>>> On Thu, Nov 08, 2018 at 01:49:12PM +0800, Qu Wenruo wrote:
 This patchset can be fetched from github:
 https://github.com/adam900710/linux/tree/qgroup_delayed_subtree_rebased

 Which is based on v4.20-rc1.
>>>
>>> Thanks, I'll add it to for-next soon.
>>
>> During test generic/517, the logs were full of the warning below. The 
>> reference
>> test on current master, effectively misc-4.20 which was used as base of your
>> branch did not get the warning.
>>
>> [11540.167829] BTRFS: end < start 2519039 2519040
>> [11540.170513] WARNING: CPU: 1 PID: 539 at fs/btrfs/extent_io.c:436 
>> insert_state+0xd8/0x100 [btrfs]
>> [11540.174411] Modules linked in: dm_thin_pool dm_persistent_data dm_bufio 
>> dm_bio_prison btrfs libcrc32c xor zstd_decompress zstd_compress xxhash 
>> raid6_pq dm_mod loop [last unloaded: libcrc32c]
>> [11540.178279] CPU: 1 PID: 539 Comm: xfs_io Tainted: G  D W 
>> 4.20.0-rc1-default+ #329
>> [11540.180616] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 
>> rel-1.11.2-0-gf9626cc-prebuilt.qemu-project.org 04/01/2014
>> [11540.183754] RIP: 0010:insert_state+0xd8/0x100 [btrfs]
>> [11540.189173] RSP: 0018:a0d245eafb20 EFLAGS: 00010282
>> [11540.189885] RAX:  RBX: 9f0bb3267320 RCX: 
>> 
>> [11540.191646] RDX: 0002 RSI: 0001 RDI: 
>> a40c400d
>> [11540.192942] RBP: 00266fff R08: 0001 R09: 
>> 
>> [11540.193871] R10:  R11: a629da2d R12: 
>> 9f0ba0281c60
>> [11540.195527] R13: 00267000 R14: a0d245eafb98 R15: 
>> a0d245eafb90
>> [11540.197026] FS:  7fa338eb4b80() GS:9f0bbd60() 
>> knlGS:
>> [11540.198251] CS:  0010 DS:  ES:  CR0: 80050033
>> [11540.199698] CR2: 7fa33873bfb8 CR3: 6fb6e000 CR4: 
>> 06e0
>> [11540.201428] Call Trace:
>> [11540.202164]  __set_extent_bit+0x43b/0x5b0 [btrfs]
>> [11540.203223]  lock_extent_bits+0x5d/0x210 [btrfs]
>> [11540.204346]  ? _raw_spin_unlock+0x24/0x40
>> [11540.205381]  ? test_range_bit+0xdf/0x130 [btrfs]
>> [11540.206573]  lock_extent_range+0xb8/0x150 [btrfs]
>> [11540.207696]  btrfs_double_extent_lock+0x78/0xb0 [btrfs]
>> [11540.208988]  btrfs_extent_same_range+0x131/0x4e0 [btrfs]
>> [11540.210237]  btrfs_remap_file_range+0x337/0x350 [btrfs]
>> [11540.211448]  vfs_dedupe_file_range_one+0x141/0x150
>> [11540.212622]  vfs_dedupe_file_range+0x146/0x1a0
>> [11540.213795]  do_vfs_ioctl+0x520/0x6c0
>> [11540.214711]  ? __fget+0x109/0x1e0
>> [11540.215616]  ksys_ioctl+0x3a/0x70
>> [11540.216233]  __x64_sys_ioctl+0x16/0x20
>> [11540.216860]  do_syscall_64+0x54/0x180
>> [11540.217409]  entry_SYSCALL_64_after_hwframe+0x49/0xbe
>> [11540.218126] RIP: 0033:0x7fa338a4daa7
> 
> That's the infinite loop issue fixed by one of the patches submitted
> for 4.20-rc2:
> 
> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?h=v4.20-rc2=11023d3f5fdf89bba5e1142127701ca6e6014587
> 
> The branch you used for testing doesn't have that fix?

Yep, I tried v4.20-rc1 tag, which hits tons of such warning even without
my patchset.

So it shouldn't be my patches causing the problem.

Thanks,
Qu

> 
>>
> 
> 



signature.asc
Description: OpenPGP digital signature


Re: [PATCH V7] Btrfs: enhance raid1/10 balance heuristic

2018-11-13 Thread Goffredo Baroncelli
On 12/11/2018 12.58, Timofey Titovets wrote:
> From: Timofey Titovets 
> 
> Currently btrfs raid1/10 balancer bаlance requests to mirrors,
> based on pid % num of mirrors.
[...]
>   v6 -> v7:
> - Fixes based on Nikolay Borisov review:
>   * Assume num == 2
>   * Remove "for" loop based on that assumption, where possible
>   * No functional changes
> 
> Signed-off-by: Timofey Titovets 
> Tested-by: Dmitrii Tcvetkov 
> Reviewed-by: Dmitrii Tcvetkov 
> ---
[...]
> +/**
> + * guess_optimal - return guessed optimal mirror
> + *
> + * Optimal expected to be pid % num_stripes
> + *
> + * That's generaly ok for spread load
> + * Add some balancer based on queue length to device
> + *
> + * Basic ideas:
> + *  - Sequential read generate low amount of request
> + *so if load of drives are equal, use pid % num_stripes balancing
> + *  - For mixed rotate/non-rotate mirrors, pick non-rotate as optimal
> + *and repick if other dev have "significant" less queue length
> + *  - Repick optimal if queue length of other mirror are less
> + */
> +static int guess_optimal(struct map_lookup *map, int num, int optimal)
> +{
> + int i;
> + int round_down = 8;
> + /* Init for missing bdevs */
> + int qlen[2] = { INT_MAX, INT_MAX };
> + bool is_nonrot[2] = { false, false };
> + bool all_bdev_nonrot = true;
> + bool all_bdev_rotate = true;
> + struct block_device *bdev;
> +
> + ASSERT(num == 2);
> +
> + /* Check accessible bdevs */> + for (i = 0; i < 2; i++) {

>From your function comment, it is not clear why you are comparing "num" to 
>"2". Pay attention that there are somewhere some patches which implement raid 
>profile with higher redundancy (IIRC up to 4). I suggest to put your 
>assumption also in the comment ("...this function works up to 2 mirrors...") 
>and, better, add a define like 

#define BTRFS_MAX_RAID1_RAID10_MIRRORS 2

And replace "2" with BTRFS_MAX_RAID1_RAID10_MIRRORS



> + bdev = map->stripes[i].dev->bdev;
> + if (bdev) {
> + qlen[i] = 0;
> + is_nonrot[i] = blk_queue_nonrot(bdev_get_queue(bdev));
> + if (is_nonrot[i])
> + all_bdev_rotate = false;
> + else
> + all_bdev_nonrot = false;
> + }
> + }
> +
> + /*
> +  * Don't bother with computation
> +  * if only one of two bdevs are accessible
> +  */
> + if (qlen[0] == INT_MAX)
> + return 1;
> + if (qlen[1] == INT_MAX)
> + return 0;
> +
> + if (all_bdev_nonrot)
> + round_down = 2;
> +
> + for (i = 0; i < 2; i++) {
> + bdev = map->stripes[i].dev->bdev;
> + qlen[i] = bdev_get_queue_len(bdev, round_down);
> + }
> +
> + /* For mixed case, pick non rotational dev as optimal */
> + if (all_bdev_rotate == all_bdev_nonrot) {
> + if (is_nonrot[0])
> + optimal = 0;
> + else
> + optimal = 1;
> + }
> +
> + if (qlen[optimal] > qlen[(optimal + 1) % 2])
> + optimal = i;
> +
> + return optimal;
> +}
> +
>  static int find_live_mirror(struct btrfs_fs_info *fs_info,
>   struct map_lookup *map, int first,
>   int dev_replace_is_ongoing)
> @@ -5177,7 +5274,8 @@ static int find_live_mirror(struct btrfs_fs_info 
> *fs_info,
>   else
>   num_stripes = map->num_stripes;
>  
> - preferred_mirror = first + current->pid % num_stripes;
> + preferred_mirror = first + guess_optimal(map, num_stripes,
> +  current->pid % num_stripes);
>  
>   if (dev_replace_is_ongoing &&
>   fs_info->dev_replace.cont_reading_from_srcdev_mode ==
> 


-- 
gpg @keyserver.linux.it: Goffredo Baroncelli 
Key fingerprint BBF5 1610 0B64 DAC6 5F7D  17B2 0EDA 9B37 8B82 E0B5


Re: [PATCH v2 0/6] btrfs: qgroup: Delay subtree scan to reduce overhead

2018-11-13 Thread Filipe Manana
On Tue, Nov 13, 2018 at 5:08 PM David Sterba  wrote:
>
> On Mon, Nov 12, 2018 at 10:33:33PM +0100, David Sterba wrote:
> > On Thu, Nov 08, 2018 at 01:49:12PM +0800, Qu Wenruo wrote:
> > > This patchset can be fetched from github:
> > > https://github.com/adam900710/linux/tree/qgroup_delayed_subtree_rebased
> > >
> > > Which is based on v4.20-rc1.
> >
> > Thanks, I'll add it to for-next soon.
>
> During test generic/517, the logs were full of the warning below. The 
> reference
> test on current master, effectively misc-4.20 which was used as base of your
> branch did not get the warning.
>
> [11540.167829] BTRFS: end < start 2519039 2519040
> [11540.170513] WARNING: CPU: 1 PID: 539 at fs/btrfs/extent_io.c:436 
> insert_state+0xd8/0x100 [btrfs]
> [11540.174411] Modules linked in: dm_thin_pool dm_persistent_data dm_bufio 
> dm_bio_prison btrfs libcrc32c xor zstd_decompress zstd_compress xxhash 
> raid6_pq dm_mod loop [last unloaded: libcrc32c]
> [11540.178279] CPU: 1 PID: 539 Comm: xfs_io Tainted: G  D W 
> 4.20.0-rc1-default+ #329
> [11540.180616] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 
> rel-1.11.2-0-gf9626cc-prebuilt.qemu-project.org 04/01/2014
> [11540.183754] RIP: 0010:insert_state+0xd8/0x100 [btrfs]
> [11540.189173] RSP: 0018:a0d245eafb20 EFLAGS: 00010282
> [11540.189885] RAX:  RBX: 9f0bb3267320 RCX: 
> 
> [11540.191646] RDX: 0002 RSI: 0001 RDI: 
> a40c400d
> [11540.192942] RBP: 00266fff R08: 0001 R09: 
> 
> [11540.193871] R10:  R11: a629da2d R12: 
> 9f0ba0281c60
> [11540.195527] R13: 00267000 R14: a0d245eafb98 R15: 
> a0d245eafb90
> [11540.197026] FS:  7fa338eb4b80() GS:9f0bbd60() 
> knlGS:
> [11540.198251] CS:  0010 DS:  ES:  CR0: 80050033
> [11540.199698] CR2: 7fa33873bfb8 CR3: 6fb6e000 CR4: 
> 06e0
> [11540.201428] Call Trace:
> [11540.202164]  __set_extent_bit+0x43b/0x5b0 [btrfs]
> [11540.203223]  lock_extent_bits+0x5d/0x210 [btrfs]
> [11540.204346]  ? _raw_spin_unlock+0x24/0x40
> [11540.205381]  ? test_range_bit+0xdf/0x130 [btrfs]
> [11540.206573]  lock_extent_range+0xb8/0x150 [btrfs]
> [11540.207696]  btrfs_double_extent_lock+0x78/0xb0 [btrfs]
> [11540.208988]  btrfs_extent_same_range+0x131/0x4e0 [btrfs]
> [11540.210237]  btrfs_remap_file_range+0x337/0x350 [btrfs]
> [11540.211448]  vfs_dedupe_file_range_one+0x141/0x150
> [11540.212622]  vfs_dedupe_file_range+0x146/0x1a0
> [11540.213795]  do_vfs_ioctl+0x520/0x6c0
> [11540.214711]  ? __fget+0x109/0x1e0
> [11540.215616]  ksys_ioctl+0x3a/0x70
> [11540.216233]  __x64_sys_ioctl+0x16/0x20
> [11540.216860]  do_syscall_64+0x54/0x180
> [11540.217409]  entry_SYSCALL_64_after_hwframe+0x49/0xbe
> [11540.218126] RIP: 0033:0x7fa338a4daa7

That's the infinite loop issue fixed by one of the patches submitted
for 4.20-rc2:

https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?h=v4.20-rc2=11023d3f5fdf89bba5e1142127701ca6e6014587

The branch you used for testing doesn't have that fix?

>


-- 
Filipe David Manana,

“Whether you think you can, or you think you can't — you're right.”


Re: [PATCH 0/9 v2] fix replace-start and replace-cancel racing

2018-11-13 Thread David Sterba
On Sun, Nov 11, 2018 at 10:22:15PM +0800, Anand Jain wrote:
> v1->v2:
>   2/9: Drop writeback required
>   3/9: Drop writeback required
>   7/9: Use the condition within the WARN_ON()
>   6/9: Use the condition within the ASSERT()
> 
> Replace-start and replace-cancel threads can race to create a messy
> situation leading to UAF. We use the scrub code to write
> the blocks on the replace target. So if we haven't have set the
> replace-scrub-running yet, without this patch we just ignore the error
> and free the target device. When this happens the system panics with
> UAF error.
> 
> Its nice to see that btrfs_dev_replace_finishing() already handles
> the ECANCELED (replace canceled) situation, but for an unknown reason
> we aren't using it to cleanup the replace cancel situation, instead
> we just let the replace cancel ioctl thread to cleanup the target
> device and return and out of synchronous with the scrub code.
> 
> This patch 4/9, 5/9 and 6/9 uses the return code of btrfs_scrub_cancel()
> to check if the scrub was really running. And if its not then shall
> return an error to the user (replace not started error) so that user
> can retry replace cancel. And uses btrfs_dev_replace_finishing() code
> to cleanup after successful cancel of the replace scrub.
> 
> Further, a suspended replace, when tries to restart, and if it fails
> (for example target device missing, or excl ops running) it goes to the
> started state, and so the cli 'btrfs replace status /mnt' hangs with no
> progress. So patches 2/9 and 3/9 fixes that.
> 
> As the originals code idea of ECANCELED was limited to the situation of
> the error only and not user requested, there are unnecessary error log
> and warn log which 7/9 and 8/9 patches fixes.

This fixes quite a few problems, namely the crash in scrub_setup_ctx,
thanks. I'm going to add the patchset to for-next, the code looks good
on first glance.


Re: [PATCH 4/9] btrfs: fix UAF due to race between replace start and cancel

2018-11-13 Thread David Sterba
On Sun, Nov 11, 2018 at 10:22:19PM +0800, Anand Jain wrote:
> replace cancel thread can race with the replace start thread and if
> fs_info::scrubs_running is not yet set the btrfs_scrub_cancel() will fail
> to stop the scrub thread, so the scrub thread continue with the scrub for
> replace which then shall try to write to the target device and which is
> already freed by the cancel thread. Please ref to the logs below.
> 
> So scrub_setup_ctx() warns as tgtdev is null.
> 
> static noinline_for_stack
> struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int
> is_dev_replace)
> {
> ::
> if (is_dev_replace) {
> WARN_ON(!fs_info->dev_replace.tgtdev);  <===
> sctx->pages_per_wr_bio = SCRUB_PAGES_PER_WR_BIO;
> sctx->wr_tgtdev = fs_info->dev_replace.tgtdev;
> sctx->flush_all_writes = false;
> }
> 
> [ 6724.497655] BTRFS info (device sdb): dev_replace from /dev/sdb (devid 1) 
> to /dev/sdc started
> [ 6753.945017] BTRFS info (device sdb): dev_replace from /dev/sdb (devid 1) 
> to /dev/sdc canceled
> [ 6852.426700] WARNING: CPU: 0 PID: 4494 at fs/btrfs/scrub.c:622 
> scrub_setup_ctx.isra.19+0x220/0x230 [btrfs]
> ::
> [ 6852.428928] RIP: 0010:scrub_setup_ctx.isra.19+0x220/0x230 [btrfs]
> ::
> [ 6852.432970] Call Trace:
> [ 6852.433202]  btrfs_scrub_dev+0x19b/0x5c0 [btrfs]
> [ 6852.433471]  btrfs_dev_replace_start+0x48c/0x6a0 [btrfs]
> [ 6852.433800]  btrfs_dev_replace_by_ioctl+0x3a/0x60 [btrfs]
> [ 6852.434097]  btrfs_ioctl+0x2476/0x2d20 [btrfs]
> [ 6852.434365]  ? do_sigaction+0x7d/0x1e0
> [ 6852.434623]  do_vfs_ioctl+0xa9/0x6c0
> [ 6852.434865]  ? syscall_trace_enter+0x1c8/0x310
> [ 6852.435124]  ? syscall_trace_enter+0x1c8/0x310
> [ 6852.435387]  ksys_ioctl+0x60/0x90
> [ 6852.435663]  __x64_sys_ioctl+0x16/0x20
> [ 6852.435907]  do_syscall_64+0x50/0x180
> [ 6852.436150]  entry_SYSCALL_64_after_hwframe+0x49/0xbe
> 
> Further, as the replace thread enters the
> scrub_write_page_to_dev_replace() without the target device it panics
> 
> static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
> struct scrub_page *spage)
> {
> ::
>   bio_set_dev(bio, sbio->dev->bdev); <==
> 
> [ 6929.715145] BUG: unable to handle kernel NULL pointer dereference at 
> 00a0
> ::
> [ 6929.717106] Workqueue: btrfs-scrub btrfs_scrub_helper [btrfs]
> [ 6929.717420] RIP: 0010:scrub_write_page_to_dev_replace+0xb4/0x260
> [btrfs]
> ::
> [ 6929.721430] Call Trace:
> [ 6929.721663]  scrub_write_block_to_dev_replace+0x3f/0x60 [btrfs]
> [ 6929.721975]  scrub_bio_end_io_worker+0x1af/0x490 [btrfs]
> [ 6929.722277]  normal_work_helper+0xf0/0x4c0 [btrfs]
> [ 6929.722552]  process_one_work+0x1f4/0x520
> [ 6929.722805]  ? process_one_work+0x16e/0x520
> [ 6929.723063]  worker_thread+0x46/0x3d0
> [ 6929.723313]  kthread+0xf8/0x130
> [ 6929.723544]  ? process_one_work+0x520/0x520
> [ 6929.723800]  ? kthread_delayed_work_timer_fn+0x80/0x80
> [ 6929.724081]  ret_from_fork+0x3a/0x50
> 
> Fix this by letting the btrfs_dev_replace_finishing() to do the job of
> cleaning after the cancel, including freeing of the target device.
> btrfs_dev_replace_finishing() is called when btrfs_scub_dev() returns
> along with the scrub return status.
> 
> Signed-off-by: Anand Jain 
> ---
>  fs/btrfs/dev-replace.c | 61 
> --
>  1 file changed, 39 insertions(+), 22 deletions(-)
> 
> diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
> index 35ce10f18607..d32d696d931c 100644
> --- a/fs/btrfs/dev-replace.c
> +++ b/fs/btrfs/dev-replace.c
> @@ -797,39 +797,56 @@ int btrfs_dev_replace_cancel(struct btrfs_fs_info 
> *fs_info)
>   case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
>   result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED;
>   btrfs_dev_replace_write_unlock(dev_replace);
> - goto leave;
> + break;
>   case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
> + result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
> + tgt_device = dev_replace->tgtdev;
> + src_device = dev_replace->srcdev;
> + btrfs_dev_replace_write_unlock(dev_replace);
> + btrfs_scrub_cancel(fs_info);
> + /*
> +  * btrfs_dev_replace_finishing() will handle the cleanup part
> +  */
> + btrfs_info_in_rcu(fs_info,
> + "dev_replace from %s (devid %llu) to %s canceled",
> + btrfs_dev_name(src_device), src_device->devid,
> + btrfs_dev_name(tgt_device));
> + break;
>   case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
> + /*
> +  * scrub doing the replace isn't running so do the cleanup here
> +  */
>   result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
>   tgt_device = dev_replace->tgtdev;
>   src_device = dev_replace->srcdev;
>  

Re: [PATCH v2 0/6] btrfs: qgroup: Delay subtree scan to reduce overhead

2018-11-13 Thread David Sterba
On Mon, Nov 12, 2018 at 10:33:33PM +0100, David Sterba wrote:
> On Thu, Nov 08, 2018 at 01:49:12PM +0800, Qu Wenruo wrote:
> > This patchset can be fetched from github:
> > https://github.com/adam900710/linux/tree/qgroup_delayed_subtree_rebased
> > 
> > Which is based on v4.20-rc1.
> 
> Thanks, I'll add it to for-next soon.

During test generic/517, the logs were full of the warning below. The reference
test on current master, effectively misc-4.20 which was used as base of your
branch did not get the warning.

[11540.167829] BTRFS: end < start 2519039 2519040
[11540.170513] WARNING: CPU: 1 PID: 539 at fs/btrfs/extent_io.c:436 
insert_state+0xd8/0x100 [btrfs]
[11540.174411] Modules linked in: dm_thin_pool dm_persistent_data dm_bufio 
dm_bio_prison btrfs libcrc32c xor zstd_decompress zstd_compress xxhash raid6_pq 
dm_mod loop [last unloaded: libcrc32c]
[11540.178279] CPU: 1 PID: 539 Comm: xfs_io Tainted: G  D W 
4.20.0-rc1-default+ #329
[11540.180616] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 
rel-1.11.2-0-gf9626cc-prebuilt.qemu-project.org 04/01/2014
[11540.183754] RIP: 0010:insert_state+0xd8/0x100 [btrfs]
[11540.189173] RSP: 0018:a0d245eafb20 EFLAGS: 00010282
[11540.189885] RAX:  RBX: 9f0bb3267320 RCX: 
[11540.191646] RDX: 0002 RSI: 0001 RDI: a40c400d
[11540.192942] RBP: 00266fff R08: 0001 R09: 
[11540.193871] R10:  R11: a629da2d R12: 9f0ba0281c60
[11540.195527] R13: 00267000 R14: a0d245eafb98 R15: a0d245eafb90
[11540.197026] FS:  7fa338eb4b80() GS:9f0bbd60() 
knlGS:
[11540.198251] CS:  0010 DS:  ES:  CR0: 80050033
[11540.199698] CR2: 7fa33873bfb8 CR3: 6fb6e000 CR4: 06e0
[11540.201428] Call Trace:
[11540.202164]  __set_extent_bit+0x43b/0x5b0 [btrfs]
[11540.203223]  lock_extent_bits+0x5d/0x210 [btrfs]
[11540.204346]  ? _raw_spin_unlock+0x24/0x40
[11540.205381]  ? test_range_bit+0xdf/0x130 [btrfs]
[11540.206573]  lock_extent_range+0xb8/0x150 [btrfs]
[11540.207696]  btrfs_double_extent_lock+0x78/0xb0 [btrfs]
[11540.208988]  btrfs_extent_same_range+0x131/0x4e0 [btrfs]
[11540.210237]  btrfs_remap_file_range+0x337/0x350 [btrfs]
[11540.211448]  vfs_dedupe_file_range_one+0x141/0x150
[11540.212622]  vfs_dedupe_file_range+0x146/0x1a0
[11540.213795]  do_vfs_ioctl+0x520/0x6c0
[11540.214711]  ? __fget+0x109/0x1e0
[11540.215616]  ksys_ioctl+0x3a/0x70
[11540.216233]  __x64_sys_ioctl+0x16/0x20
[11540.216860]  do_syscall_64+0x54/0x180
[11540.217409]  entry_SYSCALL_64_after_hwframe+0x49/0xbe
[11540.218126] RIP: 0033:0x7fa338a4daa7



Re: [PATCH 0/6] Chunk allocator DUP fix and cleanups

2018-11-13 Thread Hans van Kranenburg
On 11/13/18 4:03 PM, David Sterba wrote:
> On Thu, Oct 11, 2018 at 07:40:22PM +, Hans van Kranenburg wrote:
>> On 10/11/2018 05:13 PM, David Sterba wrote:
>>> On Thu, Oct 04, 2018 at 11:24:37PM +0200, Hans van Kranenburg wrote:
 This patch set contains an additional fix for a newly exposed bug after
 the previous attempt to fix a chunk allocator bug for new DUP chunks:

 https://lore.kernel.org/linux-btrfs/782f6000-30c0-0085-abd2-74ec5827c...@mendix.com/T/#m609ccb5d32998e8ba5cfa9901c1ab56a38a6f374

 The DUP fix is "fix more DUP stripe size handling". I did that one
 before starting to change more things so it can be applied to earlier
 LTS kernels.

 Besides that patch, which is fixing the bug in a way that is least
 intrusive, I added a bunch of other patches to help getting the chunk
 allocator code in a state that is a bit less error-prone and
 bug-attracting.

 When running this and trying the reproduction scenario, I can now see
 that the created DUP device extent is 827326464 bytes long, which is
 good.

 I wrote and tested this on top of linus 4.19-rc5. I still need to create
 a list of related use cases and test more things to at least walk
 through a bunch of obvious use cases to see if there's nothing exploding
 too quickly with these changes. However, I'm quite confident about it,
 so I'm sharing all of it already.

 Any feedback and review is appreciated. Be gentle and keep in mind that
 I'm still very much in a learning stage regarding kernel development.
>>>
>>> The patches look good, thanks. Problem is explained, preparatory work is
>>> separated from the fix itself.
>>
>> \o/
>>
 The stable patches handling workflow is not 100% clear to me yet. I
 guess I have to add a Fixes: in the DUP patch which points to the
 previous commit 92e222df7b.
>>>
>>> Almost nobody does it right, no worries. If you can identify a single
>>> patch that introduces a bug then it's for Fixes:, otherwise a CC: stable
>>> with version where it makes sense & applies is enough. I do that check
>>> myself regardless of what's in the patch.
>>
>> It's 92e222df7b and the thing I'm not sure about is if it also will
>> catch the same patch which was already backported to LTS kernels since
>> 92e222df7b also has Fixes in it... So by now the new bug is in 4.19,
>> 4.14, 4.9, 4.4, 3.16...
>>
>>> I ran the patches in a VM and hit a division-by-zero in test
>>> fstests/btrfs/011, stacktrace below. First guess is that it's caused by
>>> patch 3/6.
>>
>> Ah interesting, dev replace.
>>
>> I'll play around with replace and find out how to run the tests properly
>> and then reproduce this.
>>
>> The code introduced in patch 3 is removed again in patch 6, so I don't
>> suspect that one. :)
>>
>> But, I'll find out.
>>
>> Thanks for testing.
> 
> I've merged patches 1-5 to misc-next as they seem to be safe and pass
> fstests, please let me know when you have updates to the last one.
> Thanks.

I'll definitely follow up on that. It has not happened because something
about todo lists and ordering work and not doing too many things at the
same time.

Thanks for already confirming it was not patch 3 but 6 that has the
problem, I already suspected that.

For patch 3, the actual fix, how do we get that on top of old stable
kernels where the previous fix (92e222df7b) is in? Because of the Fixes:
line that one was picked again in 4.14, 4.9, 4.4 and even 3.16. How does
this work? Does it need all the other commit ids from those branches in
Fixes lines? Or is there magic somewhere that does this?

From your "development cycle open" mails, I understand that for testing
myself, I would test based on misc-next, for-next or for-x.y in that
order depending on where the first 5 are yet at that moment?

Hans


Re: [PATCH RFC] btrfs: harden agaist duplicate fsid

2018-11-13 Thread Anand Jain




On 11/13/2018 11:31 PM, David Sterba wrote:

On Mon, Oct 01, 2018 at 09:31:04PM +0800, Anand Jain wrote:

+    /*
+ * we are going to replace the device path, make sure its the
+ * same device if the device mounted
+ */
+    if (device->bdev) {
+    struct block_device *path_bdev;
+
+    path_bdev = lookup_bdev(path);
+    if (IS_ERR(path_bdev)) {
+    mutex_unlock(_devices->device_list_mutex);
+    return ERR_CAST(path_bdev);
+    }
+
+    if (device->bdev != path_bdev) {
+    bdput(path_bdev);
+    mutex_unlock(_devices->device_list_mutex);
+    return ERR_PTR(-EEXIST);

It would be _really_ nice to have an informative error message printed
here.  Aside from the possibility of an admin accidentally making a
block-level copy of the volume,



this code triggering could represent an
attempted attack against the system, so it's arguably something that
should be reported as happening.



   Personally, I think a WARN_ON_ONCE for
this would make sense, ideally per-volume if possible.


   Ah. Will add an warn. Thanks, Anand


The requested error message is not in the patch you posted or I have
missed that (https://patchwork.kernel.org/patch/10641041/) .


 No you didn't miss. I missed it. When you integrated this into
 for-next, I should have sent out v2. Sorry. Thanks for taking
 care of it.


Austin, is the following ok for you?

   "BTRFS: duplicate device fsid:devid for %pU:%llu old:%s new:%s\n"

   BTRFS: duplicate device fsid:devid 7c667b96-59eb-43ad-9ae9-c878f6ad51d8:2 
old:/dev/sda6 new:/dev/sdb6

As the UUID and paths are long I tried to squeeeze the rest so it's
still comprehensible but this would be better confirmed. Thanks.


Thanks, Anand




Re: [PATCH RFC RESEND] btrfs: harden agaist duplicate fsid

2018-11-13 Thread Anand Jain




On 11/13/2018 11:21 PM, David Sterba wrote:

On Mon, Oct 15, 2018 at 10:45:17AM +0800, Anand Jain wrote:

(Thanks for the comments on requiring to warn_on if we fail the device change.)
(This fixes an ugly bug, I appreciate if you have any further comments).

Its not that impossible to imagine that a device OR a btrfs image is
been copied just by using the dd or the cp command. Which in case both
the copies of the btrfs will have the same fsid. If on the system with
automount enabled, the copied FS gets scanned.

We have a known bug in btrfs, that we let the device path be changed
after the device has been mounted. So using this loop hole the new
copied device would appears as if its mounted immediately after its
been copied.

For example:

Initially.. /dev/mmcblk0p4 is mounted as /

lsblk
NAMEMAJ:MIN RM  SIZE RO TYPE MOUNTPOINT
mmcblk0 179:00 29.2G  0 disk
|-mmcblk0p4 179:404G  0 part /
|-mmcblk0p2 179:20  500M  0 part /boot
|-mmcblk0p3 179:30  256M  0 part [SWAP]
`-mmcblk0p1 179:10  256M  0 part /boot/efi

btrfs fi show
Label: none  uuid: 07892354-ddaa-4443-90ea-f76a06accaba
Total devices 1 FS bytes used 1.40GiB
devid1 size 4.00GiB used 3.00GiB path /dev/mmcblk0p4

Copy mmcblk0 to sda
dd if=/dev/mmcblk0 of=/dev/sda

And immediately after the copy completes the change in the device
superblock is notified which the automount scans using
btrfs device scan and the new device sda becomes the mounted root
device.

lsblk
NAMEMAJ:MIN RM  SIZE RO TYPE MOUNTPOINT
sda   8:01 14.9G  0 disk
|-sda48:414G  0 part /
|-sda28:21  500M  0 part
|-sda38:31  256M  0 part
`-sda18:11  256M  0 part
mmcblk0 179:00 29.2G  0 disk
|-mmcblk0p4 179:404G  0 part
|-mmcblk0p2 179:20  500M  0 part /boot
|-mmcblk0p3 179:30  256M  0 part [SWAP]
`-mmcblk0p1 179:10  256M  0 part /boot/efi

btrfs fi show /
  Label: none  uuid: 07892354-ddaa-4443-90ea-f76a06accaba
  Total devices 1 FS bytes used 1.40GiB
  devid1 size 4.00GiB used 3.00GiB path /dev/sda4

The bug is quite nasty that you can't either unmount /dev/sda4 or
/dev/mmcblk0p4. And the problem does not get solved until you take
sda out of the system on to another system to change its fsid
using the 'btrfstune -u' command.

Signed-off-by: Anand Jain 


I'm adding the patch to misc-next now, with an update message that
matches the format when a device is scanned.

"BTRFS: device fsid %pU devid %llu moved old:%s new:%s\n",

That way it should be possible to grep for all messages that are related
to the scanning ioctl.


 Right. Looks fine to me. Thanks, Anand


Re: [PATCH RFC] btrfs: harden agaist duplicate fsid

2018-11-13 Thread Austin S. Hemmelgarn

On 11/13/2018 10:31 AM, David Sterba wrote:

On Mon, Oct 01, 2018 at 09:31:04PM +0800, Anand Jain wrote:

+    /*
+ * we are going to replace the device path, make sure its the
+ * same device if the device mounted
+ */
+    if (device->bdev) {
+    struct block_device *path_bdev;
+
+    path_bdev = lookup_bdev(path);
+    if (IS_ERR(path_bdev)) {
+    mutex_unlock(_devices->device_list_mutex);
+    return ERR_CAST(path_bdev);
+    }
+
+    if (device->bdev != path_bdev) {
+    bdput(path_bdev);
+    mutex_unlock(_devices->device_list_mutex);
+    return ERR_PTR(-EEXIST);

It would be _really_ nice to have an informative error message printed
here.  Aside from the possibility of an admin accidentally making a
block-level copy of the volume,



this code triggering could represent an
attempted attack against the system, so it's arguably something that
should be reported as happening.



   Personally, I think a WARN_ON_ONCE for
this would make sense, ideally per-volume if possible.


   Ah. Will add an warn. Thanks, Anand


The requested error message is not in the patch you posted or I have
missed that (https://patchwork.kernel.org/patch/10641041/) .

Austin, is the following ok for you?

   "BTRFS: duplicate device fsid:devid for %pU:%llu old:%s new:%s\n"

   BTRFS: duplicate device fsid:devid 7c667b96-59eb-43ad-9ae9-c878f6ad51d8:2 
old:/dev/sda6 new:/dev/sdb6

As the UUID and paths are long I tried to squeeeze the rest so it's
still comprehensible but this would be better confirmed. Thanks.


Looks perfectly fine to me.


Re: [PATCH RFC] btrfs: harden agaist duplicate fsid

2018-11-13 Thread David Sterba
On Mon, Oct 01, 2018 at 09:31:04PM +0800, Anand Jain wrote:
> >> +    /*
> >> + * we are going to replace the device path, make sure its the
> >> + * same device if the device mounted
> >> + */
> >> +    if (device->bdev) {
> >> +    struct block_device *path_bdev;
> >> +
> >> +    path_bdev = lookup_bdev(path);
> >> +    if (IS_ERR(path_bdev)) {
> >> +    mutex_unlock(_devices->device_list_mutex);
> >> +    return ERR_CAST(path_bdev);
> >> +    }
> >> +
> >> +    if (device->bdev != path_bdev) {
> >> +    bdput(path_bdev);
> >> +    mutex_unlock(_devices->device_list_mutex);
> >> +    return ERR_PTR(-EEXIST);
> > It would be _really_ nice to have an informative error message printed 
> > here.  Aside from the possibility of an admin accidentally making a 
> > block-level copy of the volume, 
> 
> > this code triggering could represent an 
> > attempted attack against the system, so it's arguably something that 
> > should be reported as happening.
> 
> >  Personally, I think a WARN_ON_ONCE for 
> > this would make sense, ideally per-volume if possible.
> 
>   Ah. Will add an warn. Thanks, Anand

The requested error message is not in the patch you posted or I have
missed that (https://patchwork.kernel.org/patch/10641041/) .

Austin, is the following ok for you?

  "BTRFS: duplicate device fsid:devid for %pU:%llu old:%s new:%s\n"

  BTRFS: duplicate device fsid:devid 7c667b96-59eb-43ad-9ae9-c878f6ad51d8:2 
old:/dev/sda6 new:/dev/sdb6

As the UUID and paths are long I tried to squeeeze the rest so it's
still comprehensible but this would be better confirmed. Thanks.


Re: [PATCH RFC RESEND] btrfs: harden agaist duplicate fsid

2018-11-13 Thread David Sterba
On Mon, Oct 15, 2018 at 10:45:17AM +0800, Anand Jain wrote:
> (Thanks for the comments on requiring to warn_on if we fail the device 
> change.)
> (This fixes an ugly bug, I appreciate if you have any further comments).
> 
> Its not that impossible to imagine that a device OR a btrfs image is
> been copied just by using the dd or the cp command. Which in case both
> the copies of the btrfs will have the same fsid. If on the system with
> automount enabled, the copied FS gets scanned.
> 
> We have a known bug in btrfs, that we let the device path be changed
> after the device has been mounted. So using this loop hole the new
> copied device would appears as if its mounted immediately after its
> been copied.
> 
> For example:
> 
> Initially.. /dev/mmcblk0p4 is mounted as /
> 
> lsblk
> NAMEMAJ:MIN RM  SIZE RO TYPE MOUNTPOINT
> mmcblk0 179:00 29.2G  0 disk
> |-mmcblk0p4 179:404G  0 part /
> |-mmcblk0p2 179:20  500M  0 part /boot
> |-mmcblk0p3 179:30  256M  0 part [SWAP]
> `-mmcblk0p1 179:10  256M  0 part /boot/efi
> 
> btrfs fi show
>Label: none  uuid: 07892354-ddaa-4443-90ea-f76a06accaba
>Total devices 1 FS bytes used 1.40GiB
>devid1 size 4.00GiB used 3.00GiB path /dev/mmcblk0p4
> 
> Copy mmcblk0 to sda
>dd if=/dev/mmcblk0 of=/dev/sda
> 
> And immediately after the copy completes the change in the device
> superblock is notified which the automount scans using
> btrfs device scan and the new device sda becomes the mounted root
> device.
> 
> lsblk
> NAMEMAJ:MIN RM  SIZE RO TYPE MOUNTPOINT
> sda   8:01 14.9G  0 disk
> |-sda48:414G  0 part /
> |-sda28:21  500M  0 part
> |-sda38:31  256M  0 part
> `-sda18:11  256M  0 part
> mmcblk0 179:00 29.2G  0 disk
> |-mmcblk0p4 179:404G  0 part
> |-mmcblk0p2 179:20  500M  0 part /boot
> |-mmcblk0p3 179:30  256M  0 part [SWAP]
> `-mmcblk0p1 179:10  256M  0 part /boot/efi
> 
> btrfs fi show /
>  Label: none  uuid: 07892354-ddaa-4443-90ea-f76a06accaba
>  Total devices 1 FS bytes used 1.40GiB
>  devid1 size 4.00GiB used 3.00GiB path /dev/sda4
> 
> The bug is quite nasty that you can't either unmount /dev/sda4 or
> /dev/mmcblk0p4. And the problem does not get solved until you take
> sda out of the system on to another system to change its fsid
> using the 'btrfstune -u' command.
> 
> Signed-off-by: Anand Jain 

I'm adding the patch to misc-next now, with an update message that
matches the format when a device is scanned.

"BTRFS: device fsid %pU devid %llu moved old:%s new:%s\n",

That way it should be possible to grep for all messages that are related
to the scanning ioctl.


Re: [PATCH 0/6] Chunk allocator DUP fix and cleanups

2018-11-13 Thread David Sterba
On Thu, Oct 11, 2018 at 07:40:22PM +, Hans van Kranenburg wrote:
> On 10/11/2018 05:13 PM, David Sterba wrote:
> > On Thu, Oct 04, 2018 at 11:24:37PM +0200, Hans van Kranenburg wrote:
> >> This patch set contains an additional fix for a newly exposed bug after
> >> the previous attempt to fix a chunk allocator bug for new DUP chunks:
> >>
> >> https://lore.kernel.org/linux-btrfs/782f6000-30c0-0085-abd2-74ec5827c...@mendix.com/T/#m609ccb5d32998e8ba5cfa9901c1ab56a38a6f374
> >>
> >> The DUP fix is "fix more DUP stripe size handling". I did that one
> >> before starting to change more things so it can be applied to earlier
> >> LTS kernels.
> >>
> >> Besides that patch, which is fixing the bug in a way that is least
> >> intrusive, I added a bunch of other patches to help getting the chunk
> >> allocator code in a state that is a bit less error-prone and
> >> bug-attracting.
> >>
> >> When running this and trying the reproduction scenario, I can now see
> >> that the created DUP device extent is 827326464 bytes long, which is
> >> good.
> >>
> >> I wrote and tested this on top of linus 4.19-rc5. I still need to create
> >> a list of related use cases and test more things to at least walk
> >> through a bunch of obvious use cases to see if there's nothing exploding
> >> too quickly with these changes. However, I'm quite confident about it,
> >> so I'm sharing all of it already.
> >>
> >> Any feedback and review is appreciated. Be gentle and keep in mind that
> >> I'm still very much in a learning stage regarding kernel development.
> > 
> > The patches look good, thanks. Problem is explained, preparatory work is
> > separated from the fix itself.
> 
> \o/
> 
> >> The stable patches handling workflow is not 100% clear to me yet. I
> >> guess I have to add a Fixes: in the DUP patch which points to the
> >> previous commit 92e222df7b.
> > 
> > Almost nobody does it right, no worries. If you can identify a single
> > patch that introduces a bug then it's for Fixes:, otherwise a CC: stable
> > with version where it makes sense & applies is enough. I do that check
> > myself regardless of what's in the patch.
> 
> It's 92e222df7b and the thing I'm not sure about is if it also will
> catch the same patch which was already backported to LTS kernels since
> 92e222df7b also has Fixes in it... So by now the new bug is in 4.19,
> 4.14, 4.9, 4.4, 3.16...
> 
> > I ran the patches in a VM and hit a division-by-zero in test
> > fstests/btrfs/011, stacktrace below. First guess is that it's caused by
> > patch 3/6.
> 
> Ah interesting, dev replace.
> 
> I'll play around with replace and find out how to run the tests properly
> and then reproduce this.
> 
> The code introduced in patch 3 is removed again in patch 6, so I don't
> suspect that one. :)
> 
> But, I'll find out.
> 
> Thanks for testing.

I've merged patches 1-5 to misc-next as they seem to be safe and pass
fstests, please let me know when you have updates to the last one.
Thanks.


Re: [PATCH] Btrfs: do not set log for full commit when creating non-data block groups

2018-11-13 Thread Filipe Manana
On Tue, Nov 13, 2018 at 2:31 PM David Sterba  wrote:
>
> On Thu, Nov 08, 2018 at 02:48:29PM +, Filipe Manana wrote:
> > On Thu, Nov 8, 2018 at 2:37 PM Filipe Manana  wrote:
> > >
> > > On Thu, Nov 8, 2018 at 2:35 PM Qu Wenruo  wrote:
> > > >
> > > >
> > > >
> > > > On 2018/11/8 下午9:17, fdman...@kernel.org wrote:
> > > > > From: Filipe Manana 
> > > > >
> > > > > When creating a block group we don't need to set the log for full 
> > > > > commit
> > > > > if the new block group is not used for data. Logged items can only 
> > > > > point
> > > > > to logical addresses of data block groups (through file extent items) 
> > > > > so
> > > > > there is no need to for the next fsync to fallback to a transaction 
> > > > > commit
> > > > > if the new block group is for metadata.
> > > >
> > > > Is it possible for the log tree blocks to be allocated in that new block
> > > > group?
> > >
> > > Yes.
> >
> > Now I realize what might be your concern, and this would cause trouble.
>
> Is this patch ok for for-next or does it need more work? Thanks.

Nop, it's no good (despite not triggering problems initially), due to
Qu's first question.
So just drop it and forget it.
Thanks.


Re: [PATCH v4] Btrfs: fix deadlock on tree root leaf when finding free extent

2018-11-13 Thread David Sterba
On Mon, Nov 05, 2018 at 04:36:34PM +, Filipe Manana wrote:
> On Mon, Nov 5, 2018 at 4:34 PM David Sterba  wrote:
> > On Mon, Nov 05, 2018 at 04:30:35PM +, Filipe Manana wrote:
> > > On Mon, Nov 5, 2018 at 4:29 PM David Sterba  wrote:
> > > > On Wed, Oct 24, 2018 at 01:48:40PM +0100, Filipe Manana wrote:
> > > > > > Ah ok makes sense.  Well in that case lets just make 
> > > > > > btrfs_read_locked_inode()
> > > > > > take a path, and allocate it in btrfs_iget, that'll remove the ugly
> > > > > >
> > > > > > if (path != in_path)
> > > > >
> > > > > You mean the following on top of v4:
> > > > >
> > > > > https://friendpaste.com/6XrGXb5p0RSJGixUFYouHg
> > > > >
> > > > > Not much different, just saves one such if statement. I'm ok with 
> > > > > that.
> > > >
> > > > Now in misc-next with v4 and the friendpaste incremental as
> > > >
> > > > https://github.com/kdave/btrfs-devel/commit/efcfd6c87d28b3aa9bcba52d7c1e1fc79a2dad69
> > >
> > > Please don't add the incremental. It's buggy. It was meant to figure
> > > out what Josef was saying. That's why I haven't sent a V5.
> >
> > Ok dropped, I'll will wait for a proper patch.
> 
> It's V4, the last sent version. Just forget the incremental.
> Thanks.

For the record, V4 has been merged to master in 4.20-rc2.


Re: [PATCH] Btrfs: do not set log for full commit when creating non-data block groups

2018-11-13 Thread David Sterba
On Thu, Nov 08, 2018 at 02:48:29PM +, Filipe Manana wrote:
> On Thu, Nov 8, 2018 at 2:37 PM Filipe Manana  wrote:
> >
> > On Thu, Nov 8, 2018 at 2:35 PM Qu Wenruo  wrote:
> > >
> > >
> > >
> > > On 2018/11/8 下午9:17, fdman...@kernel.org wrote:
> > > > From: Filipe Manana 
> > > >
> > > > When creating a block group we don't need to set the log for full commit
> > > > if the new block group is not used for data. Logged items can only point
> > > > to logical addresses of data block groups (through file extent items) so
> > > > there is no need to for the next fsync to fallback to a transaction 
> > > > commit
> > > > if the new block group is for metadata.
> > >
> > > Is it possible for the log tree blocks to be allocated in that new block
> > > group?
> >
> > Yes.
> 
> Now I realize what might be your concern, and this would cause trouble.

Is this patch ok for for-next or does it need more work? Thanks.


Re: [PATCH v15.1 00/13] Btrfs In-band De-duplication

2018-11-13 Thread David Sterba
On Tue, Nov 06, 2018 at 02:41:09PM +0800, Lu Fengqi wrote:
> This patchset can be fetched from github:
> https://github.com/littleroad/linux.git dedupe_latest
> 
> Now the new base is v4.20-rc1.

Before anybody spends more time with this patchset: this is a big
feature and quite intrusive to several btrfs subsystems. Currently it's
on hold as it requires finishing the design phase, it's still only the
in-memory backend and before we claim in-band dedupe, the persistent
hash tree needs to be at least drafted or prototyped.

At this point there are several features that are in a more complete
state so they get preferred when it comes to merging. I would have to
look up what was agreed long time ago as merging plan, but at this point
this series would require a lot of work.


Re: [PATCH v2] btrfs: add zstd compression level support

2018-11-13 Thread Timofey Titovets
вт, 13 нояб. 2018 г. в 04:52, Nick Terrell :
>
>
>
> > On Nov 12, 2018, at 4:33 PM, David Sterba  wrote:
> >
> > On Wed, Oct 31, 2018 at 11:11:08AM -0700, Nick Terrell wrote:
> >> From: Jennifer Liu 
> >>
> >> Adds zstd compression level support to btrfs. Zstd requires
> >> different amounts of memory for each level, so the design had
> >> to be modified to allow set_level() to allocate memory. We
> >> preallocate one workspace of the maximum size to guarantee
> >> forward progress. This feature is expected to be useful for
> >> read-mostly filesystems, or when creating images.
> >>
> >> Benchmarks run in qemu on Intel x86 with a single core.
> >> The benchmark measures the time to copy the Silesia corpus [0] to
> >> a btrfs filesystem 10 times, then read it back.
> >>
> >> The two important things to note are:
> >> - The decompression speed and memory remains constant.
> >>  The memory required to decompress is the same as level 1.
> >> - The compression speed and ratio will vary based on the source.
> >>
> >> LevelRatio   Compression Decompression   Compression Memory
> >> 12.59153 MB/s112 MB/s0.8 MB
> >> 22.67136 MB/s113 MB/s1.0 MB
> >> 32.72106 MB/s115 MB/s1.3 MB
> >> 42.7886  MB/s109 MB/s0.9 MB
> >> 52.8369  MB/s109 MB/s1.4 MB
> >> 62.8953  MB/s110 MB/s1.5 MB
> >> 72.9140  MB/s112 MB/s1.4 MB
> >> 82.9234  MB/s110 MB/s1.8 MB
> >> 92.9327  MB/s109 MB/s1.8 MB
> >> 10   2.9422  MB/s109 MB/s1.8 MB
> >> 11   2.9517  MB/s114 MB/s1.8 MB
> >> 12   2.9513  MB/s113 MB/s1.8 MB
> >> 13   2.9510  MB/s111 MB/s2.3 MB
> >> 14   2.997   MB/s110 MB/s2.6 MB
> >> 15   3.036   MB/s110 MB/s2.6 MB
> >>
> >> [0] 
> >> https://urldefense.proofpoint.com/v2/url?u=http-3A__sun.aei.polsl.pl_-7Esdeor_index.php-3Fpage-3Dsilesia=DwIBAg=5VD0RTtNlTh3ycd41b3MUw=HQM5IQdWOB8WaMoii2dYTw=5LQRTUqZnx_a8dGSa5bGsd0Fm4ejQQOcH50wi7nRewY=gFUm-SA3aeQI7PBe3zmxUuxk4AEEZegB0cRsbjWUToo=
> >>
> >> Signed-off-by: Jennifer Liu 
> >> Signed-off-by: Nick Terrell 
> >> Reviewed-by: Omar Sandoval 
> >> ---
> >> v1 -> v2:
> >> - Don't reflow the unchanged line.
> >>
> >> fs/btrfs/compression.c | 169 +
> >> fs/btrfs/compression.h |  18 +++--
> >> fs/btrfs/lzo.c |   5 +-
> >> fs/btrfs/super.c   |   7 +-
> >> fs/btrfs/zlib.c|  33 
> >> fs/btrfs/zstd.c|  74 +-
> >> 6 files changed, 202 insertions(+), 104 deletions(-)
> >>
> >> diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
> >> index 2955a4ea2fa8..b46652cb653e 100644
> >> --- a/fs/btrfs/compression.c
> >> +++ b/fs/btrfs/compression.c
> >> @@ -822,9 +822,12 @@ void __init btrfs_init_compress(void)
> >>
> >>  /*
> >>   * Preallocate one workspace for each compression type so
> >> - * we can guarantee forward progress in the worst case
> >> + * we can guarantee forward progress in the worst case.
> >> + * Provide the maximum compression level to guarantee large
> >> + * enough workspace.
> >>   */
> >> -workspace = btrfs_compress_op[i]->alloc_workspace();
> >> +workspace = btrfs_compress_op[i]->alloc_workspace(
> >> +btrfs_compress_op[i]->max_level);
>
> We provide the max level here, so we have at least one workspace per
> compression type that is large enough.
>
> >>  if (IS_ERR(workspace)) {
> >>  pr_warn("BTRFS: cannot preallocate compression 
> >> workspace, will try later\n");
> >>  } else {
> >> @@ -835,23 +838,78 @@ void __init btrfs_init_compress(void)
> >>  }
> >> }
> >>
> >> +/*
> >> + * put a workspace struct back on the list or free it if we have enough
> >> + * idle ones sitting around
> >> + */
> >> +static void __free_workspace(int type, struct list_head *workspace,
> >> + bool heuristic)
> >> +{
> >> +int idx = type - 1;
> >> +struct list_head *idle_ws;
> >> +spinlock_t *ws_lock;
> >> +atomic_t *total_ws;
> >> +wait_queue_head_t *ws_wait;
> >> +int *free_ws;
> >> +
> >> +if (heuristic) {
> >> +idle_ws  = _heuristic_ws.idle_ws;
> >> +ws_lock  = _heuristic_ws.ws_lock;
> >> +total_ws = _heuristic_ws.total_ws;
> >> +ws_wait  = _heuristic_ws.ws_wait;
> >> +free_ws  = _heuristic_ws.free_ws;
> >> +} else {
> >> +idle_ws  = _comp_ws[idx].idle_ws;
> >> +ws_lock  = _comp_ws[idx].ws_lock;
> >> +total_ws = 

Re: [PATCH] Btrfs: fix rare chances for data loss when doing a fast fsync

2018-11-13 Thread David Sterba
On Mon, Nov 12, 2018 at 10:23:58AM +, fdman...@kernel.org wrote:
> From: Filipe Manana 
> 
> After the simplification of the fast fsync patch done recently by commit
> b5e6c3e170b7 ("btrfs: always wait on ordered extents at fsync time") and
> commit e7175a692765 ("btrfs: remove the wait ordered logic in the
> log_one_extent path"), we got a very short time window where we can get
> extents logged without writeback completing first or extents logged
> without logging the respective data checksums. Both issues can only happen
> when doing a non-full (fast) fsync.
> 
> As soon as we enter btrfs_sync_file() we trigger writeback, then lock the
> inode and then wait for the writeback to complete before starting to log
> the inode. However before we acquire the inode's lock and after we started
> writeback, it's possible that more writes happened and dirtied more pages.
> If that happened and those pages get writeback triggered while we are
> logging the inode (for example, the VM subsystem triggering it due to
> memory pressure, or another concurrent fsync), we end up seeing the
> respective extent maps in the inode's list of modified extents and will
> log matching file extent items without waiting for the respective
> ordered extents to complete, meaning that either of the following will
> happen:
> 
> 1) We log an extent after its writeback finishes but before its checksums
>are added to the csum tree, leading to -EIO errors when attempting to
>read the extent after a log replay.
> 
> 2) We log an extent before its writeback finishes.
>Therefore after the log replay we will have a file extent item pointing
>to an unwritten extent (and without the respective data checksums as
>well).
> 
> This could not happen before the fast fsync patch simplification, because
> for any extent we found in the list of modified extents, we would wait for
> its respective ordered extent to finish writeback or collect its checksums
> for logging if it did not complete yet.
> 
> Fix this by triggering writeback again after acquiring the inode's lock
> and before waiting for ordered extents to complete.
> 
> Fixes: e7175a692765 ("btrfs: remove the wait ordered logic in the 
> log_one_extent path")
> Fixes: b5e6c3e170b7 ("btrfs: always wait on ordered extents at fsync time")
> CC: sta...@vger.kernel.org # 4.19+
> Signed-off-by: Filipe Manana 

Added to misc-next and queued for 4.20.

Thank you for such well written changelogs, it really helps to understand
what the patch does even if I'm not familiar with the code and the fsync
intricacies. A+


Re: [PATCH 1/3] bitops: Fix big endian compilation

2018-11-13 Thread David Sterba
On Mon, Nov 05, 2018 at 11:06:41AM -0800, Rosen Penev wrote:
> Replaced bswap with _ variants. While it's a glibc extension, all of the
> popular libc implementations (glibc, uClibc, musl, BIONIC) seem to support
> it.
> 
> Added static inline to two functions to match little endian variants. This
> fixes a linking error experienced when compiling.
> 
> Signed-off-by: Rosen Penev 

1-3 applied, thanks.


Re: [RFC] BTRFS_DEV_REPLACE_ITEM_STATE_* doesn't match with on disk

2018-11-13 Thread Anand Jain



David, Gentle ping.

Thanks, Anand

On 11/12/2018 03:50 PM, Nikolay Borisov wrote:



On 12.11.18 г. 6:58 ч., Anand Jain wrote:


The dev_replace_state defines are miss matched between the
BTRFS_IOCTL_DEV_REPLACE_STATE_* and BTRFS_DEV_REPLACE_ITEM_STATE_* [1].

[1]
-
btrfs.h:#define BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED    2
btrfs.h:#define BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED    3
btrfs.h:#define BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED    4

btrfs_tree.h:#define BTRFS_DEV_REPLACE_ITEM_STATE_SUSPENDED    2
btrfs_tree.h:#define BTRFS_DEV_REPLACE_ITEM_STATE_FINISHED    3
btrfs_tree.h:#define BTRFS_DEV_REPLACE_ITEM_STATE_CANCELED    4
-

The BTRFS_DEV_REPLACE_ITEM_STATE_* series is unused in both btrfs.ko and
btrfs-progs, the on-disk also follows BTRFS_IOCTL_DEV_REPLACE_STATE_*
(we set dev_replace->replace_state using the
BTRFS_IOCTL_DEV_REPLACE_STATE_* defines and write to the on-disk).

  359 btrfs_set_dev_replace_replace_state(eb, ptr,
  360 dev_replace->replace_state);

IMO it should be ok to delete the BTRFS_DEV_REPLACE_ITEM_STATE_*
altogether? But how about the userland progs other than btrfs-progs?
If not at least fix the miss match as in [2], any comments?


Unfortunately you are right. This seems to stem from sloppy job back in
the days of initial dev-replace support. BTRFS_DEV_REPLACE_ITEM_STATE_*
were added in e922e087a35c ("Btrfs: enhance btrfs structures for device
replace support"), yet they were never used. And the
IOCTL_DEV_REPLACE_STATE* were added in e93c89c1 ("Btrfs: add new
sources for device replace code").

It looks like the ITEM_STATE* definitions were stillborn so to speak and
personally I'm in favor of removing them. They shouldn't have been
merged in the first place and indeed the patch doesn't even have a
Reviewed-by tag. So it originated from the, I'd say, spartan days of
btrfs development...

David,  any code which is using BTRFS_DEV_REPLACE_ITEM_STATE_SUSPENDED
is inherently broken, so how about we remove those definitions, then
when it's compilation is broken in the future the author will actually
have a chance to fix it, though it's highly unlikely anyone is relying
on those definitions.




[2]
--
diff --git a/include/uapi/linux/btrfs_tree.h
b/include/uapi/linux/btrfs_tree.h
index aff1356c2bb8..9ffa7534cadf 100644
--- a/include/uapi/linux/btrfs_tree.h
+++ b/include/uapi/linux/btrfs_tree.h
@@ -805,9 +805,9 @@ struct btrfs_dev_stats_item {
  #define BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID 1
  #define BTRFS_DEV_REPLACE_ITEM_STATE_NEVER_STARTED 0
  #define BTRFS_DEV_REPLACE_ITEM_STATE_STARTED   1
-#define BTRFS_DEV_REPLACE_ITEM_STATE_SUSPENDED 2
-#define BTRFS_DEV_REPLACE_ITEM_STATE_FINISHED  3
-#define BTRFS_DEV_REPLACE_ITEM_STATE_CANCELED  4
+#define BTRFS_DEV_REPLACE_ITEM_STATE_FINISHED  2
+#define BTRFS_DEV_REPLACE_ITEM_STATE_CANCELED  3
+#define BTRFS_DEV_REPLACE_ITEM_STATE_SUSPENDED 4

  struct btrfs_dev_replace_item {
     /*
--


Thanks, Anand



Re: [PATCH] btrfs: fix computation of max fs size for multiple device fs tests

2018-11-13 Thread Anand Jain




On 11/06/2018 11:34 PM, fdman...@kernel.org wrote:

From: Filipe Manana 

We were sorting numerical values with the 'sort' tool without telling it
that we are sorting numbers, giving us unexpected ordering. So just pass
the '-n' option to the 'sort' tool.

Example:

$ echo -e "11\n9\n20" | sort
11
20
9

$ echo -e "11\n9\n20" | sort -n
9
11
20

Signed-off-by: Filipe Manana 


Thanks for the fix.

Reviewed-by: Anand Jain 




---
  tests/btrfs/124 | 2 +-
  tests/btrfs/125 | 2 +-
  tests/btrfs/154 | 2 +-
  3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/btrfs/124 b/tests/btrfs/124
index ce3ad6aa..a52c65f6 100755
--- a/tests/btrfs/124
+++ b/tests/btrfs/124
@@ -61,7 +61,7 @@ dev2=`echo $SCRATCH_DEV_POOL | awk '{print $2}'`
  dev1_sz=`blockdev --getsize64 $dev1`
  dev2_sz=`blockdev --getsize64 $dev2`
  # get min of both
-max_fs_sz=`echo -e "$dev1_sz\n$dev2_sz" | sort | head -1`
+max_fs_sz=`echo -e "$dev1_sz\n$dev2_sz" | sort -n | head -1`
  # Need disks with more than 2G.
  if [ $max_fs_sz -lt 20 ]; then
_scratch_dev_pool_put
diff --git a/tests/btrfs/125 b/tests/btrfs/125
index e38de264..5ac68b67 100755
--- a/tests/btrfs/125
+++ b/tests/btrfs/125
@@ -68,7 +68,7 @@ dev2_sz=`blockdev --getsize64 $dev2`
  dev3_sz=`blockdev --getsize64 $dev3`
  
  # get min of both.

-max_fs_sz=`echo -e "$dev1_sz\n$dev2_sz\n$dev3_sz" | sort | head -1`
+max_fs_sz=`echo -e "$dev1_sz\n$dev2_sz\n$dev3_sz" | sort -n | head -1`
  # Need disks with more than 2G
  if [ $max_fs_sz -lt 20 ]; then
_scratch_dev_pool_put
diff --git a/tests/btrfs/154 b/tests/btrfs/154
index 99ea232a..cd6c688f 100755
--- a/tests/btrfs/154
+++ b/tests/btrfs/154
@@ -51,7 +51,7 @@ DEV1_SZ=`blockdev --getsize64 $DEV1`
  DEV2_SZ=`blockdev --getsize64 $DEV2`
  
  # get min

-MAX_FS_SZ=`echo -e "$DEV1_SZ\n$DEV2_SZ" | sort | head -1`
+MAX_FS_SZ=`echo -e "$DEV1_SZ\n$DEV2_SZ" | sort -n | head -1`
  # Need disks with more than 2G
  if [ $MAX_FS_SZ -lt 20 ]; then
_scratch_dev_pool_put



Re: [PATCH] btrfs: qgroups: Prepare to deprecate BTRFS_QGROUP_LIMIT_RSV_RFER/EXCL flags

2018-11-13 Thread Qu Wenruo



On 2018/11/13 下午5:48, Nikolay Borisov wrote:
> 
> 
> On 13.11.18 г. 8:09 ч., Qu Wenruo wrote:
>> These two flags are only used to set btrfs_qgroup::rsv_rfer/excl number,
>> but then are never used.
>>
>> Nor these flags are really used by btrfs-progs, so it's pretty safe just
>> to deprecate them in next kernel release.
>>
>> This patch will mark these two flags deprecated and output warning
>> messages, but still handle them.
>>
>> These two flags will be completely removed in next kernel release.
>>
>> Signed-off-by: Qu Wenruo 
>> ---
>>  fs/btrfs/qgroup.c  | 16 ++--
>>  include/uapi/linux/btrfs.h | 12 ++--
>>  2 files changed, 16 insertions(+), 12 deletions(-)
>>
>> diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
>> index 9afad8c14220..d4a52dbad044 100644
>> --- a/fs/btrfs/qgroup.c
>> +++ b/fs/btrfs/qgroup.c
>> @@ -1385,21 +1385,25 @@ int btrfs_limit_qgroup(struct btrfs_trans_handle 
>> *trans,
>>  qgroup->max_excl = limit->max_excl;
>>  }
>>  }
>> -if (limit->flags & BTRFS_QGROUP_LIMIT_RSV_RFER) {
>> +if (limit->flags & __BTRFS_QGROUP_LIMIT_RSV_RFER) {
>>  if (limit->rsv_rfer == CLEAR_VALUE) {
>> -qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_RSV_RFER;
>> -limit->flags &= ~BTRFS_QGROUP_LIMIT_RSV_RFER;
>> +qgroup->lim_flags &= ~__BTRFS_QGROUP_LIMIT_RSV_RFER;
>> +limit->flags &= ~__BTRFS_QGROUP_LIMIT_RSV_RFER;
>>  qgroup->rsv_rfer = 0;
>>  } else {
>> +btrfs_warn_rl(fs_info,
>> +"BTRFS_QGROUP_LIMIT_RSV_RFER flag is deprecated, will not be supported in 
>> v5.1");
>>  qgroup->rsv_rfer = limit->rsv_rfer;
>>  }
>>  }
>> -if (limit->flags & BTRFS_QGROUP_LIMIT_RSV_EXCL) {
>> +if (limit->flags & __BTRFS_QGROUP_LIMIT_RSV_EXCL) {
>>  if (limit->rsv_excl == CLEAR_VALUE) {
>> -qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_RSV_EXCL;
>> -limit->flags &= ~BTRFS_QGROUP_LIMIT_RSV_EXCL;
>> +qgroup->lim_flags &= ~__BTRFS_QGROUP_LIMIT_RSV_EXCL;
>> +limit->flags &= ~__BTRFS_QGROUP_LIMIT_RSV_EXCL;
>>  qgroup->rsv_excl = 0;
>>  } else {
>> +btrfs_warn_rl(fs_info,
>> +"BTRFS_QGROUP_LIMIT_RSV_EXCL flag is deprecated, will not be supported in 
>> v5.1");
>>  qgroup->rsv_excl = limit->rsv_excl;
>>  }
>>  }
>> diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h
>> index db4c253f8011..ec79ab781ee0 100644
>> --- a/include/uapi/linux/btrfs.h
>> +++ b/include/uapi/linux/btrfs.h
>> @@ -53,12 +53,12 @@ struct btrfs_ioctl_vol_args {
>>   * struct btrfs_qgroup_limit.flags
>>   * struct btrfs_qgroup_limit_item.flags
>>   */
>> -#define BTRFS_QGROUP_LIMIT_MAX_RFER (1ULL << 0)
>> -#define BTRFS_QGROUP_LIMIT_MAX_EXCL (1ULL << 1)
>> -#define BTRFS_QGROUP_LIMIT_RSV_RFER (1ULL << 2)
>> -#define BTRFS_QGROUP_LIMIT_RSV_EXCL (1ULL << 3)
>> -#define BTRFS_QGROUP_LIMIT_RFER_CMPR(1ULL << 4)
>> -#define BTRFS_QGROUP_LIMIT_EXCL_CMPR(1ULL << 5)
>> +#define BTRFS_QGROUP_LIMIT_MAX_RFER (1ULL << 0) /* reference (rfer) limit */
>> +#define BTRFS_QGROUP_LIMIT_MAX_EXCL (1ULL << 1) /* exclusive (excl) limit */
>> +#define __BTRFS_QGROUP_LIMIT_RSV_RFER   (1ULL << 2) /* deprecated */
>> +#define __BTRFS_QGROUP_LIMIT_RSV_EXCL   (1ULL << 3) /* deprecated */
> 
> We gain absolutely nothing by prepending the __, drop it. What's
> sufficient for this patch is to introduce only the warnings, nothing else.

IMHO, there are 2 problems if we don't rename it:

1) Later user can still use it without checking the header.
   This is particularly common for anyone using completion tool, like
   ctags or clang completion.
   For these case, without checking the header careless user won't
   really know these flags are deprecated.

2) To show that there are really no existing users of these 2 flags
   It's much more clearly shown in the patch, than without modifying it.

3) To let us know whether there is any real user of these flags
   After such renaming, if we really have some users of these flags,
   they will mostly come and complain to the mail list.
   Such noise would help us to know the usage (if there is any).

So, I still prefer to rename them.
This also applies to user space counter part.

Thanks,
Qu

> 
>> +#define BTRFS_QGROUP_LIMIT_RFER_CMPR(1ULL << 4) /* compressed rfer 
>> limit */
>> +#define BTRFS_QGROUP_LIMIT_EXCL_CMPR(1ULL << 5) /* compressed excl 
>> limit */
>>  
>>  struct btrfs_qgroup_limit {
>>  __u64   flags;
>>


Re: [PATCH 1/2] btrfs-progs: ctree.h: Deprecate unused limit flags

2018-11-13 Thread Nikolay Borisov



On 13.11.18 г. 8:09 ч., Qu Wenruo wrote:
> BTRFS_QGROUP_LIMIT_RSV_RFER/EXCL are never used by btrfs-progs.
> And kernel only do the basic assignment but don't really make use of
> them.
> 
> So deprecate these two flags in btrfs-progs.
> Also, add some comment for all the used flags.
> 
> Signed-off-by: Qu Wenruo 
> ---
>  ctree.h  | 12 ++--
>  libbtrfsutil/btrfs.h | 12 ++--
>  2 files changed, 12 insertions(+), 12 deletions(-)
> 
> diff --git a/ctree.h b/ctree.h
> index 5d9ba3c555f1..1bbc5a2e9dee 100644
> --- a/ctree.h
> +++ b/ctree.h
> @@ -1038,12 +1038,12 @@ struct btrfs_qgroup_info_item {
>  } __attribute__ ((__packed__));
>  
>  /* flags definition for qgroup limits */
> -#define BTRFS_QGROUP_LIMIT_MAX_RFER  (1ULL << 0)
> -#define BTRFS_QGROUP_LIMIT_MAX_EXCL  (1ULL << 1)
> -#define BTRFS_QGROUP_LIMIT_RSV_RFER  (1ULL << 2)
> -#define BTRFS_QGROUP_LIMIT_RSV_EXCL  (1ULL << 3)
> -#define BTRFS_QGROUP_LIMIT_RFER_CMPR (1ULL << 4)
> -#define BTRFS_QGROUP_LIMIT_EXCL_CMPR (1ULL << 5)
> +#define BTRFS_QGROUP_LIMIT_MAX_RFER  (1ULL << 0) /* reference (rfer) limit */
> +#define BTRFS_QGROUP_LIMIT_MAX_EXCL  (1ULL << 1) /* exclusive (excl) limit */
> +#define __BTRFS_QGROUP_LIMIT_RSV_RFER(1ULL << 2) /* deprecated */
> +#define __BTRFS_QGROUP_LIMIT_RSV_EXCL(1ULL << 3) /* deprecated */
> +#define BTRFS_QGROUP_LIMIT_RFER_CMPR (1ULL << 4) /* compressed rfer limit */
> +#define BTRFS_QGROUP_LIMIT_EXCL_CMPR (1ULL << 5) /* compressed excl limit */
>  
>  struct btrfs_qgroup_limit_item {
>   __le64 flags;
> diff --git a/libbtrfsutil/btrfs.h b/libbtrfsutil/btrfs.h
> index 1893c5a2172e..4e58a3cd167b 100644
> --- a/libbtrfsutil/btrfs.h
> +++ b/libbtrfsutil/btrfs.h
> @@ -54,12 +54,12 @@ struct btrfs_ioctl_vol_args {
>   * struct btrfs_qgroup_limit.flags
>   * struct btrfs_qgroup_limit_item.flags
>   */
> -#define BTRFS_QGROUP_LIMIT_MAX_RFER  (1ULL << 0)
> -#define BTRFS_QGROUP_LIMIT_MAX_EXCL  (1ULL << 1)
> -#define BTRFS_QGROUP_LIMIT_RSV_RFER  (1ULL << 2)
> -#define BTRFS_QGROUP_LIMIT_RSV_EXCL  (1ULL << 3)
> -#define BTRFS_QGROUP_LIMIT_RFER_CMPR (1ULL << 4)
> -#define BTRFS_QGROUP_LIMIT_EXCL_CMPR (1ULL << 5)
> +#define BTRFS_QGROUP_LIMIT_MAX_RFER  (1ULL << 0) /* reference (rfer) limit */
> +#define BTRFS_QGROUP_LIMIT_MAX_EXCL  (1ULL << 1) /* exclusive (excl) limit */
> +#define __BTRFS_QGROUP_LIMIT_RSV_RFER(1ULL << 2) /* deprecated */
> +#define __BTRFS_QGROUP_LIMIT_RSV_EXCL(1ULL << 3) /* deprecated */

Same feedback as the kernel counterpart - just put the /* deprecated */
comment, let the kernel warning settle for a few releases (removing it
in the next one might be a bit optimistic) and then just remove it.

> +#define BTRFS_QGROUP_LIMIT_RFER_CMPR (1ULL << 4) /* compressed rfer limit */
> +#define BTRFS_QGROUP_LIMIT_EXCL_CMPR (1ULL << 5) /* compressed excl limit */
>  
>  struct btrfs_qgroup_limit {
>   __u64   flags;
> 


Re: [PATCH] btrfs: qgroup: Remove duplicated trace points for qgroup_rsv_add/release()

2018-11-13 Thread Nikolay Borisov



On 13.11.18 г. 9:05 ч., Qu Wenruo wrote:
> Inside qgroup_rsv_add/release(), we have trace events
> trace_qgroup_update_reserve() to catch reserved space update.
> 
> However we still have two manual trace_qgroup_update_reserve() calls
> just outside these functions.
> 
> Remove these duplicated calls.
> 
> Fixes: 64ee4e751a1c ("btrfs: qgroup: Update trace events to use new separate 
> rsv types")
> Signed-off-by: Qu Wenruo 

Reviewed-by: Nikolay Borisov 

> ---
>  fs/btrfs/qgroup.c | 2 --
>  1 file changed, 2 deletions(-)
> 
> diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
> index 45868fd76209..c1fbd8a30fc4 100644
> --- a/fs/btrfs/qgroup.c
> +++ b/fs/btrfs/qgroup.c
> @@ -2952,7 +2952,6 @@ static int qgroup_reserve(struct btrfs_root *root, u64 
> num_bytes, bool enforce,
>  
>   qg = unode_aux_to_qgroup(unode);
>  
> - trace_qgroup_update_reserve(fs_info, qg, num_bytes, type);
>   qgroup_rsv_add(fs_info, qg, num_bytes, type);
>   }
>  
> @@ -3019,7 +3018,6 @@ void btrfs_qgroup_free_refroot(struct btrfs_fs_info 
> *fs_info,
>  
>   qg = unode_aux_to_qgroup(unode);
>  
> - trace_qgroup_update_reserve(fs_info, qg, -(s64)num_bytes, type);
>   qgroup_rsv_release(fs_info, qg, num_bytes, type);
>  
>   list_for_each_entry(glist, >groups, next_group) {
> 


Re: [PATCH] btrfs: qgroups: Prepare to deprecate BTRFS_QGROUP_LIMIT_RSV_RFER/EXCL flags

2018-11-13 Thread Nikolay Borisov



On 13.11.18 г. 8:09 ч., Qu Wenruo wrote:
> These two flags are only used to set btrfs_qgroup::rsv_rfer/excl number,
> but then are never used.
> 
> Nor these flags are really used by btrfs-progs, so it's pretty safe just
> to deprecate them in next kernel release.
> 
> This patch will mark these two flags deprecated and output warning
> messages, but still handle them.
> 
> These two flags will be completely removed in next kernel release.
> 
> Signed-off-by: Qu Wenruo 
> ---
>  fs/btrfs/qgroup.c  | 16 ++--
>  include/uapi/linux/btrfs.h | 12 ++--
>  2 files changed, 16 insertions(+), 12 deletions(-)
> 
> diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
> index 9afad8c14220..d4a52dbad044 100644
> --- a/fs/btrfs/qgroup.c
> +++ b/fs/btrfs/qgroup.c
> @@ -1385,21 +1385,25 @@ int btrfs_limit_qgroup(struct btrfs_trans_handle 
> *trans,
>   qgroup->max_excl = limit->max_excl;
>   }
>   }
> - if (limit->flags & BTRFS_QGROUP_LIMIT_RSV_RFER) {
> + if (limit->flags & __BTRFS_QGROUP_LIMIT_RSV_RFER) {
>   if (limit->rsv_rfer == CLEAR_VALUE) {
> - qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_RSV_RFER;
> - limit->flags &= ~BTRFS_QGROUP_LIMIT_RSV_RFER;
> + qgroup->lim_flags &= ~__BTRFS_QGROUP_LIMIT_RSV_RFER;
> + limit->flags &= ~__BTRFS_QGROUP_LIMIT_RSV_RFER;
>   qgroup->rsv_rfer = 0;
>   } else {
> + btrfs_warn_rl(fs_info,
> +"BTRFS_QGROUP_LIMIT_RSV_RFER flag is deprecated, will not be supported in 
> v5.1");
>   qgroup->rsv_rfer = limit->rsv_rfer;
>   }
>   }
> - if (limit->flags & BTRFS_QGROUP_LIMIT_RSV_EXCL) {
> + if (limit->flags & __BTRFS_QGROUP_LIMIT_RSV_EXCL) {
>   if (limit->rsv_excl == CLEAR_VALUE) {
> - qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_RSV_EXCL;
> - limit->flags &= ~BTRFS_QGROUP_LIMIT_RSV_EXCL;
> + qgroup->lim_flags &= ~__BTRFS_QGROUP_LIMIT_RSV_EXCL;
> + limit->flags &= ~__BTRFS_QGROUP_LIMIT_RSV_EXCL;
>   qgroup->rsv_excl = 0;
>   } else {
> + btrfs_warn_rl(fs_info,
> +"BTRFS_QGROUP_LIMIT_RSV_EXCL flag is deprecated, will not be supported in 
> v5.1");
>   qgroup->rsv_excl = limit->rsv_excl;
>   }
>   }
> diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h
> index db4c253f8011..ec79ab781ee0 100644
> --- a/include/uapi/linux/btrfs.h
> +++ b/include/uapi/linux/btrfs.h
> @@ -53,12 +53,12 @@ struct btrfs_ioctl_vol_args {
>   * struct btrfs_qgroup_limit.flags
>   * struct btrfs_qgroup_limit_item.flags
>   */
> -#define BTRFS_QGROUP_LIMIT_MAX_RFER  (1ULL << 0)
> -#define BTRFS_QGROUP_LIMIT_MAX_EXCL  (1ULL << 1)
> -#define BTRFS_QGROUP_LIMIT_RSV_RFER  (1ULL << 2)
> -#define BTRFS_QGROUP_LIMIT_RSV_EXCL  (1ULL << 3)
> -#define BTRFS_QGROUP_LIMIT_RFER_CMPR (1ULL << 4)
> -#define BTRFS_QGROUP_LIMIT_EXCL_CMPR (1ULL << 5)
> +#define BTRFS_QGROUP_LIMIT_MAX_RFER  (1ULL << 0) /* reference (rfer) limit */
> +#define BTRFS_QGROUP_LIMIT_MAX_EXCL  (1ULL << 1) /* exclusive (excl) limit */
> +#define __BTRFS_QGROUP_LIMIT_RSV_RFER(1ULL << 2) /* deprecated */
> +#define __BTRFS_QGROUP_LIMIT_RSV_EXCL(1ULL << 3) /* deprecated */

We gain absolutely nothing by prepending the __, drop it. What's
sufficient for this patch is to introduce only the warnings, nothing else.

> +#define BTRFS_QGROUP_LIMIT_RFER_CMPR (1ULL << 4) /* compressed rfer limit */
> +#define BTRFS_QGROUP_LIMIT_EXCL_CMPR (1ULL << 5) /* compressed excl limit */
>  
>  struct btrfs_qgroup_limit {
>   __u64   flags;
>