[PATCH] D141389: [DFSAN] Add support for strnlen, strncat, strsep, sscanf and _tolower

2023-04-24 Thread Andrew via Phabricator via cfe-commits
This revision was automatically updated to reflect the committed changes.
Closed by commit rG74f00516e5ce: [DFSAN] Add support for strsep. (authored by 
tkuchta, committed by browneee).
Herald added a subscriber: Sanitizers.

Changed prior to commit:
  https://reviews.llvm.org/D141389?vs=51=516515#toc

Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D141389/new/

https://reviews.llvm.org/D141389

Files:
  compiler-rt/lib/dfsan/dfsan_custom.cpp
  compiler-rt/lib/dfsan/done_abilist.txt
  compiler-rt/test/dfsan/custom.cpp

Index: compiler-rt/test/dfsan/custom.cpp
===
--- compiler-rt/test/dfsan/custom.cpp
+++ compiler-rt/test/dfsan/custom.cpp
@@ -1630,6 +1630,51 @@
 #endif
 }
 
+void test_strsep() {
+  char *s = strdup("Hello world/");
+  char *delim = strdup(" /");
+
+  char *p_s = s;
+  char *base = s;
+  char *p_delim = delim;
+
+  // taint delim bytes
+  dfsan_set_label(i_label, p_delim, strlen(p_delim));
+  // taint delim pointer
+  dfsan_set_label(j_label, _delim, sizeof(p_delim));
+  // taint the string data bytes
+  dfsan_set_label(k_label, s, 5);
+  // taint the string pointer
+  dfsan_set_label(m_label, _s, sizeof(p_s));
+
+  char *rv = strsep(_s, p_delim);
+  assert(rv == [0]);
+#ifdef STRICT_DATA_DEPENDENCIES
+  ASSERT_LABEL(rv, m_label);
+  ASSERT_READ_LABEL(rv, strlen(rv), k_label);
+#else
+  ASSERT_LABEL(rv, dfsan_union(dfsan_union(i_label, j_label),
+   dfsan_union(k_label, m_label)));
+  ASSERT_INIT_ORIGIN_EQ_ORIGIN(, p_s);
+#endif
+
+  // taint the remaining string's pointer
+  char **pp_s = _s;
+  char **pp_s_base = pp_s;
+  dfsan_set_label(n_label, pp_s, sizeof(pp_s));
+
+  rv = strsep(pp_s, p_delim);
+
+  assert(rv == [6]);
+#ifdef STRICT_DATA_DEPENDENCIES
+  ASSERT_LABEL(rv, n_label);
+  ASSERT_INIT_ORIGIN_EQ_ORIGIN(, *pp_s);
+#else
+  ASSERT_LABEL(rv, dfsan_union(dfsan_union(i_label, j_label), n_label));
+  ASSERT_INIT_ORIGIN_EQ_ORIGIN(, *pp_s);
+#endif
+}
+
 void test_memchr() {
   char str1[] = "str1";
   dfsan_set_label(i_label, [3], 1);
@@ -2044,6 +2089,7 @@
   test_strncmp();
   test_strncpy();
   test_strpbrk();
+  test_strsep();
   test_strrchr();
   test_strstr();
   test_strtod();
Index: compiler-rt/lib/dfsan/done_abilist.txt
===
--- compiler-rt/lib/dfsan/done_abilist.txt
+++ compiler-rt/lib/dfsan/done_abilist.txt
@@ -283,6 +283,7 @@
 fun:strpbrk=custom
 fun:strrchr=custom
 fun:strstr=custom
+fun:strsep=custom
 
 # Functions which take action based on global state, such as running a callback
 # set by a separate function.
Index: compiler-rt/lib/dfsan/dfsan_custom.cpp
===
--- compiler-rt/lib/dfsan/dfsan_custom.cpp
+++ compiler-rt/lib/dfsan/dfsan_custom.cpp
@@ -204,6 +204,57 @@
   return const_cast(ret);
 }
 
+SANITIZER_INTERFACE_ATTRIBUTE char *__dfsw_strsep(char **s, const char *delim,
+  dfsan_label s_label,
+  dfsan_label delim_label,
+  dfsan_label *ret_label) {
+  dfsan_label base_label = dfsan_read_label(s, sizeof(*s));
+  char *base = *s;
+  char *res = strsep(s, delim);
+  if (res != *s) {
+char *token_start = res;
+int token_length = strlen(res);
+// the delimiter byte has been set to NULL
+dfsan_set_label(0, token_start + token_length, 1);
+  }
+
+  if (flags().strict_data_dependencies) {
+*ret_label = res ? base_label : 0;
+  } else {
+size_t s_bytes_read = (res ? strlen(res) : strlen(base)) + 1;
+*ret_label = dfsan_union(
+dfsan_union(base_label, dfsan_read_label(base, sizeof(s_bytes_read))),
+dfsan_union(dfsan_read_label(delim, strlen(delim) + 1),
+dfsan_union(s_label, delim_label)));
+  }
+
+  return res;
+}
+
+SANITIZER_INTERFACE_ATTRIBUTE char *__dfso_strsep(
+char **s, const char *delim, dfsan_label s_label, dfsan_label delim_label,
+dfsan_label *ret_label, dfsan_origin s_origin, dfsan_origin delim_origin,
+dfsan_origin *ret_origin) {
+  dfsan_origin base_origin = dfsan_read_origin_of_first_taint(s, sizeof(*s));
+  char *res = __dfsw_strsep(s, delim, s_label, delim_label, ret_label);
+  if (flags().strict_data_dependencies) {
+if (res)
+  *ret_origin = base_origin;
+  } else {
+if (*ret_label) {
+  if (base_origin) {
+*ret_origin = base_origin;
+  } else {
+dfsan_origin o =
+dfsan_read_origin_of_first_taint(delim, strlen(delim) + 1);
+*ret_origin = o ? o : (s_label ? s_origin : delim_origin);
+  }
+}
+  }
+
+  return res;
+}
+
 static int dfsan_memcmp_bcmp(const void *s1, const void *s2, size_t n,
  size_t *bytes_read) {
   const char *cs1 = (const char *) s1, *cs2 = (const char *) s2;

[PATCH] D141389: [DFSAN] Add support for strnlen, strncat, strsep, sscanf and _tolower

2023-04-17 Thread Tomasz Kuchta via Phabricator via cfe-commits
tkuchta added a comment.

Got it, thank you.
Please submit the patch for e-mail t.kuc...@samsung.com
I will prepare next reviews for the other functions.
Thank you!


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D141389/new/

https://reviews.llvm.org/D141389

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D141389: [DFSAN] Add support for strnlen, strncat, strsep, sscanf and _tolower

2023-04-17 Thread Andrew via Phabricator via cfe-commits
browneee added a comment.

Separate review.

Keep patches as small as possible, do one review for each patch.

If you'd like me to submit it for you, please provide your email to credit the 
patch to.


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D141389/new/

https://reviews.llvm.org/D141389

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D141389: [DFSAN] Add support for strnlen, strncat, strsep, sscanf and _tolower

2023-04-13 Thread Tomasz Kuchta via Phabricator via cfe-commits
tkuchta updated this revision to Diff 51.
tkuchta added a comment.

Thank you, I have uploaded a new diff with "&" fixed in the test function.

How should be proceed with the other functions once strsep is done? Should I 
open another review or add the code to the current patch?


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D141389/new/

https://reviews.llvm.org/D141389

Files:
  compiler-rt/lib/dfsan/dfsan_custom.cpp
  compiler-rt/lib/dfsan/done_abilist.txt
  compiler-rt/test/dfsan/custom.cpp

Index: compiler-rt/test/dfsan/custom.cpp
===
--- compiler-rt/test/dfsan/custom.cpp
+++ compiler-rt/test/dfsan/custom.cpp
@@ -1627,6 +1627,51 @@
 #endif
 }
 
+void test_strsep() {
+  char *s = strdup("Hello world/");
+  char *delim = strdup(" /");
+
+  char *p_s = s;
+  char *base = s;
+  char *p_delim = delim;
+
+  // taint delim bytes
+  dfsan_set_label(i_label, p_delim, strlen(p_delim));
+  // taint delim pointer
+  dfsan_set_label(j_label, _delim, sizeof(p_delim));
+  // taint the string data bytes
+  dfsan_set_label(k_label, s, 5);
+  // taint the string pointer
+  dfsan_set_label(m_label, _s, sizeof(p_s));
+
+  char *rv = strsep(_s, p_delim);
+  assert(rv == [0]);
+#ifdef STRICT_DATA_DEPENDENCIES
+  ASSERT_LABEL(rv, m_label);
+  ASSERT_READ_LABEL(rv, strlen(rv), k_label);
+#else
+  ASSERT_LABEL(rv, dfsan_union(dfsan_union(i_label, j_label),
+   dfsan_union(k_label, m_label)));
+  ASSERT_INIT_ORIGIN_EQ_ORIGIN(, p_s);
+#endif
+
+  // taint the remaining string's pointer
+  char **pp_s = _s;
+  char **pp_s_base = pp_s;
+  dfsan_set_label(n_label, pp_s, sizeof(pp_s));
+
+  rv = strsep(pp_s, p_delim);
+
+  assert(rv == [6]);
+#ifdef STRICT_DATA_DEPENDENCIES
+  ASSERT_LABEL(rv, n_label);
+  ASSERT_INIT_ORIGIN_EQ_ORIGIN(, *pp_s);
+#else
+  ASSERT_LABEL(rv, dfsan_union(dfsan_union(i_label, j_label), n_label));
+  ASSERT_INIT_ORIGIN_EQ_ORIGIN(, *pp_s);
+#endif
+}
+
 void test_memchr() {
   char str1[] = "str1";
   dfsan_set_label(i_label, [3], 1);
@@ -2041,6 +2086,7 @@
   test_strncmp();
   test_strncpy();
   test_strpbrk();
+  test_strsep();
   test_strrchr();
   test_strstr();
   test_strtod();
Index: compiler-rt/lib/dfsan/done_abilist.txt
===
--- compiler-rt/lib/dfsan/done_abilist.txt
+++ compiler-rt/lib/dfsan/done_abilist.txt
@@ -283,6 +283,7 @@
 fun:strpbrk=custom
 fun:strrchr=custom
 fun:strstr=custom
+fun:strsep=custom
 
 # Functions which take action based on global state, such as running a callback
 # set by a separate function.
Index: compiler-rt/lib/dfsan/dfsan_custom.cpp
===
--- compiler-rt/lib/dfsan/dfsan_custom.cpp
+++ compiler-rt/lib/dfsan/dfsan_custom.cpp
@@ -204,6 +204,58 @@
   return const_cast(ret);
 }
 
+SANITIZER_INTERFACE_ATTRIBUTE char *__dfsw_strsep(char **s, const char *delim,
+  dfsan_label s_label,
+  dfsan_label delim_label,
+  dfsan_label *ret_label) {
+  dfsan_label base_label = dfsan_read_label(s, sizeof(*s));
+  char *base = *s;
+  char *res = strsep(s, delim);
+  if (res != *s) {
+char *token_start = res;
+int token_length = strlen(res);
+// the delimiter byte has been set to NULL
+dfsan_set_label(0, token_start + token_length, 1);
+  }
+
+  if (flags().strict_data_dependencies) {
+*ret_label = res ? base_label : 0;
+  } else {
+size_t s_bytes_read = (res ? strlen(res) : strlen(base)) + 1;
+*ret_label = dfsan_union(
+dfsan_union(base_label, dfsan_read_label(base, sizeof(s_bytes_read))),
+dfsan_union(dfsan_read_label(delim, strlen(delim) + 1),
+dfsan_union(s_label, delim_label)));
+  }
+
+  return res;
+}
+
+SANITIZER_INTERFACE_ATTRIBUTE char *__dfso_strsep(
+char **s, const char *delim, dfsan_label s_label, dfsan_label delim_label,
+dfsan_label *ret_label, dfsan_origin s_origin, dfsan_origin delim_origin,
+dfsan_origin *ret_origin) {
+  dfsan_origin base_origin = dfsan_read_origin_of_first_taint(s, sizeof(*s));
+  char *base = *s;
+  char *res = __dfsw_strsep(s, delim, s_label, delim_label, ret_label);
+  if (flags().strict_data_dependencies) {
+if (res)
+  *ret_origin = base_origin;
+  } else {
+if (*ret_label) {
+  if (base_origin) {
+*ret_origin = base_origin;
+  } else {
+dfsan_origin o =
+dfsan_read_origin_of_first_taint(delim, strlen(delim) + 1);
+*ret_origin = o ? o : (s_label ? s_origin : delim_origin);
+  }
+}
+  }
+
+  return res;
+}
+
 static int dfsan_memcmp_bcmp(const void *s1, const void *s2, size_t n,
  size_t *bytes_read) {
   const char *cs1 = (const char *) s1, *cs2 = (const char *) s2;
___

[PATCH] D141389: [DFSAN] Add support for strnlen, strncat, strsep, sscanf and _tolower

2023-04-13 Thread Andrew via Phabricator via cfe-commits
browneee accepted this revision.
browneee added a comment.
This revision is now accepted and ready to land.

Please fix the additional &.

Do you need me to submit it for you?




Comment at: compiler-rt/test/dfsan/custom.cpp:1645
+  // taint the string pointer
+  dfsan_set_label(m_label, _s, sizeof(_s));
+

Remove & in sizeof.



Comment at: compiler-rt/test/dfsan/custom.cpp:1641
+  // taint delim pointer
+  dfsan_set_label(j_label, _delim, sizeof(_delim));
+  // taint the string data bytes

browneee wrote:
> remove the &
> 
> It still works because `sizeof(pointer_var) == sizeof(_var)`, but it 
> doesn't logically match up like it should.
> 
> (Sorry, this one is my fault - I made this mistake giving an example in an 
> earlier comment.)
Remove & in sizeof.


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D141389/new/

https://reviews.llvm.org/D141389

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D141389: [DFSAN] Add support for strnlen, strncat, strsep, sscanf and _tolower

2023-04-03 Thread Tomasz Kuchta via Phabricator via cfe-commits
tkuchta updated this revision to Diff 510442.
tkuchta added a comment.

Applied the latest review comments for strsep


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D141389/new/

https://reviews.llvm.org/D141389

Files:
  compiler-rt/lib/dfsan/dfsan_custom.cpp
  compiler-rt/lib/dfsan/done_abilist.txt
  compiler-rt/test/dfsan/custom.cpp

Index: compiler-rt/test/dfsan/custom.cpp
===
--- compiler-rt/test/dfsan/custom.cpp
+++ compiler-rt/test/dfsan/custom.cpp
@@ -1627,6 +1627,51 @@
 #endif
 }
 
+void test_strsep() {
+  char *s = strdup("Hello world/");
+  char *delim = strdup(" /");
+
+  char *p_s = s;
+  char *base = s;
+  char *p_delim = delim;
+
+  // taint delim bytes
+  dfsan_set_label(i_label, p_delim, strlen(p_delim));
+  // taint delim pointer
+  dfsan_set_label(j_label, _delim, sizeof(_delim));
+  // taint the string data bytes
+  dfsan_set_label(k_label, s, 5);
+  // taint the string pointer
+  dfsan_set_label(m_label, _s, sizeof(_s));
+
+  char *rv = strsep(_s, p_delim);
+  assert(rv == [0]);
+#ifdef STRICT_DATA_DEPENDENCIES
+  ASSERT_LABEL(rv, m_label);
+  ASSERT_READ_LABEL(rv, strlen(rv), k_label);
+#else
+  ASSERT_LABEL(rv, dfsan_union(dfsan_union(i_label, j_label),
+   dfsan_union(k_label, m_label)));
+  ASSERT_INIT_ORIGIN_EQ_ORIGIN(, p_s);
+#endif
+
+  // taint the remaining string's pointer
+  char **pp_s = _s;
+  char **pp_s_base = pp_s;
+  dfsan_set_label(n_label, pp_s, sizeof(pp_s));
+
+  rv = strsep(pp_s, p_delim);
+
+  assert(rv == [6]);
+#ifdef STRICT_DATA_DEPENDENCIES
+  ASSERT_LABEL(rv, n_label);
+  ASSERT_INIT_ORIGIN_EQ_ORIGIN(, *pp_s);
+#else
+  ASSERT_LABEL(rv, dfsan_union(dfsan_union(i_label, j_label), n_label));
+  ASSERT_INIT_ORIGIN_EQ_ORIGIN(, *pp_s);
+#endif
+}
+
 void test_memchr() {
   char str1[] = "str1";
   dfsan_set_label(i_label, [3], 1);
@@ -2041,6 +2086,7 @@
   test_strncmp();
   test_strncpy();
   test_strpbrk();
+  test_strsep();
   test_strrchr();
   test_strstr();
   test_strtod();
Index: compiler-rt/lib/dfsan/done_abilist.txt
===
--- compiler-rt/lib/dfsan/done_abilist.txt
+++ compiler-rt/lib/dfsan/done_abilist.txt
@@ -283,6 +283,7 @@
 fun:strpbrk=custom
 fun:strrchr=custom
 fun:strstr=custom
+fun:strsep=custom
 
 # Functions which take action based on global state, such as running a callback
 # set by a separate function.
Index: compiler-rt/lib/dfsan/dfsan_custom.cpp
===
--- compiler-rt/lib/dfsan/dfsan_custom.cpp
+++ compiler-rt/lib/dfsan/dfsan_custom.cpp
@@ -204,6 +204,58 @@
   return const_cast(ret);
 }
 
+SANITIZER_INTERFACE_ATTRIBUTE char *__dfsw_strsep(char **s, const char *delim,
+  dfsan_label s_label,
+  dfsan_label delim_label,
+  dfsan_label *ret_label) {
+  dfsan_label base_label = dfsan_read_label(s, sizeof(*s));
+  char *base = *s;
+  char *res = strsep(s, delim);
+  if (res != *s) {
+char *token_start = res;
+int token_length = strlen(res);
+// the delimiter byte has been set to NULL
+dfsan_set_label(0, token_start + token_length, 1);
+  }
+
+  if (flags().strict_data_dependencies) {
+*ret_label = res ? base_label : 0;
+  } else {
+size_t s_bytes_read = (res ? strlen(res) : strlen(base)) + 1;
+*ret_label = dfsan_union(
+dfsan_union(base_label, dfsan_read_label(base, sizeof(s_bytes_read))),
+dfsan_union(dfsan_read_label(delim, strlen(delim) + 1),
+dfsan_union(s_label, delim_label)));
+  }
+
+  return res;
+}
+
+SANITIZER_INTERFACE_ATTRIBUTE char *__dfso_strsep(
+char **s, const char *delim, dfsan_label s_label, dfsan_label delim_label,
+dfsan_label *ret_label, dfsan_origin s_origin, dfsan_origin delim_origin,
+dfsan_origin *ret_origin) {
+  dfsan_origin base_origin = dfsan_read_origin_of_first_taint(s, sizeof(*s));
+  char *base = *s;
+  char *res = __dfsw_strsep(s, delim, s_label, delim_label, ret_label);
+  if (flags().strict_data_dependencies) {
+if (res)
+  *ret_origin = base_origin;
+  } else {
+if (*ret_label) {
+  if (base_origin) {
+*ret_origin = base_origin;
+  } else {
+dfsan_origin o =
+dfsan_read_origin_of_first_taint(delim, strlen(delim) + 1);
+*ret_origin = o ? o : (s_label ? s_origin : delim_origin);
+  }
+}
+  }
+
+  return res;
+}
+
 static int dfsan_memcmp_bcmp(const void *s1, const void *s2, size_t n,
  size_t *bytes_read) {
   const char *cs1 = (const char *) s1, *cs2 = (const char *) s2;
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D141389: [DFSAN] Add support for strnlen, strncat, strsep, sscanf and _tolower

2023-03-31 Thread Andrew via Phabricator via cfe-commits
browneee added inline comments.



Comment at: compiler-rt/lib/dfsan/dfsan_custom.cpp:221
+  if (flags().strict_data_dependencies) {
+*ret_label = res ? dfsan_read_label(base, sizeof(base)) : 0;
+  } else {

tkuchta wrote:
> browneee wrote:
> > `base, sizeof(base)` does not make sense - the size does not correspond to 
> > the pointer used.
> > 
> > Either
> > ```
> > dfsan_read_label(base, sizeof(*base))   // first byte pointed to by base
> > dfsan_read_label(base, strlen(base)) // whole string pointed to by base
> > dfsan_read_label(, sizeof(base))  // the base pointer
> > ```
> > 
> > In this case I think we want the base pointer.
> > 
> > `dfsan_read_label(, sizeof(base))`  // the base pointer
> > should be equivalent to doing
> > `dfsan_label base_label = dfsan_read_label(s, sizeof(*s))`  up at the start 
> > of the function, just after we declare base then use `base_label` here.
> > 
> > Lets go with the second option to avoid taking the address of a variable.
> > 
> > This is semantically equivalent to my first suggestion: 
> > `dfsan_get_label(base) == dfsan_read_label(, sizeof(base)) == 
> > base_label`.
> > Sorry I didn't consider the other constraints (no dfsan_get_label in this 
> > file because the pass is not run on this code; avoid taking address of 
> > variable) and suggest this in the first place.
> Thank you for your comments! Just to clarify:
> If we have strict data dependencies we would like *ret_label to be related to 
> the taints of the contents of the string, is that correct? Should we use the 
> base_label there too? My understanding is that base_label represents a taint 
> applied to the pointer, not to the data? 
> 
> In other words, would that be correct to:
> 1) use taints of the string data in the first case (strict dependencies) -> 
> therefore no base_label there as it represents the taint of the pointer
> 2) use the taints of the string data + the taints of the pointer in the 
> second case -> therefore using base_label there
> If we have strict data dependencies we would like *ret_label to be related to 
> the taints of the contents of the string, is that correct?

No, `*ret_label` holds the taint label for the return value, which is a 
pointer. The return value pointer is derived from `*s`, aka `base`. This is the 
pointer to the string contents, not the string contents themselves.

> Should we use the base_label there too?

I think we should only use `base_label` here.

> My understanding is that base_label represents a taint applied to the 
> pointer, not to the data?

Correct.


> In other words, would that be correct to:
> 1. use taints of the string data in the first case (strict dependencies) -> 
> therefore no base_label there as it represents the taint of the pointer

No, other way around. We want the base_label, but not the string data.

> 2. use the taints of the string data + the taints of the pointer in the 
> second case -> therefore using base_label there

Yes, we could have both base_label and the taints of the string data in the 
else case.


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D141389/new/

https://reviews.llvm.org/D141389

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D141389: [DFSAN] Add support for strnlen, strncat, strsep, sscanf and _tolower

2023-03-31 Thread Tomasz Kuchta via Phabricator via cfe-commits
tkuchta added inline comments.



Comment at: compiler-rt/lib/dfsan/dfsan_custom.cpp:221
+  if (flags().strict_data_dependencies) {
+*ret_label = res ? dfsan_read_label(base, sizeof(base)) : 0;
+  } else {

browneee wrote:
> `base, sizeof(base)` does not make sense - the size does not correspond to 
> the pointer used.
> 
> Either
> ```
> dfsan_read_label(base, sizeof(*base))   // first byte pointed to by base
> dfsan_read_label(base, strlen(base)) // whole string pointed to by base
> dfsan_read_label(, sizeof(base))  // the base pointer
> ```
> 
> In this case I think we want the base pointer.
> 
> `dfsan_read_label(, sizeof(base))`  // the base pointer
> should be equivalent to doing
> `dfsan_label base_label = dfsan_read_label(s, sizeof(*s))`  up at the start 
> of the function, just after we declare base then use `base_label` here.
> 
> Lets go with the second option to avoid taking the address of a variable.
> 
> This is semantically equivalent to my first suggestion: 
> `dfsan_get_label(base) == dfsan_read_label(, sizeof(base)) == 
> base_label`.
> Sorry I didn't consider the other constraints (no dfsan_get_label in this 
> file because the pass is not run on this code; avoid taking address of 
> variable) and suggest this in the first place.
Thank you for your comments! Just to clarify:
If we have strict data dependencies we would like *ret_label to be related to 
the taints of the contents of the string, is that correct? Should we use the 
base_label there too? My understanding is that base_label represents a taint 
applied to the pointer, not to the data? 

In other words, would that be correct to:
1) use taints of the string data in the first case (strict dependencies) -> 
therefore no base_label there as it represents the taint of the pointer
2) use the taints of the string data + the taints of the pointer in the second 
case -> therefore using base_label there


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D141389/new/

https://reviews.llvm.org/D141389

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D141389: [DFSAN] Add support for strnlen, strncat, strsep, sscanf and _tolower

2023-03-31 Thread Andrew via Phabricator via cfe-commits
browneee added a comment.

We're getting really close!

Yes, that build error looks unrelated. Someone should fix it soon.




Comment at: compiler-rt/lib/dfsan/dfsan_custom.cpp:221
+  if (flags().strict_data_dependencies) {
+*ret_label = res ? dfsan_read_label(base, sizeof(base)) : 0;
+  } else {

`base, sizeof(base)` does not make sense - the size does not correspond to the 
pointer used.

Either
```
dfsan_read_label(base, sizeof(*base))   // first byte pointed to by base
dfsan_read_label(base, strlen(base)) // whole string pointed to by base
dfsan_read_label(, sizeof(base))  // the base pointer
```

In this case I think we want the base pointer.

`dfsan_read_label(, sizeof(base))`  // the base pointer
should be equivalent to doing
`dfsan_label base_label = dfsan_read_label(s, sizeof(*s))`  up at the start of 
the function, just after we declare base then use `base_label` here.

Lets go with the second option to avoid taking the address of a variable.

This is semantically equivalent to my first suggestion: `dfsan_get_label(base) 
== dfsan_read_label(, sizeof(base)) == base_label`.
Sorry I didn't consider the other constraints (no dfsan_get_label in this file 
because the pass is not run on this code; avoid taking address of variable) and 
suggest this in the first place.



Comment at: compiler-rt/lib/dfsan/dfsan_custom.cpp:224
+*ret_label =
+dfsan_union(dfsan_read_label(base, sizeof(base)),
+dfsan_union(dfsan_read_label(delim, strlen(delim) + 1),

Also use `base_label` here.



Comment at: compiler-rt/lib/dfsan/dfsan_custom.cpp:240
+if (res)
+  *ret_origin = dfsan_read_origin_of_first_taint(base, strlen(base));
+  } else {

`dfsan_get_origin(base) == dfsan_read_origin_of_first_taint(, 
sizeof(base))`

As noted above `base, strlen(base)` is a meaningfully valid pointer, length - 
but it is not the level of indirection we want here.

I think we want the same solution as above.

`dfsan_origin base_origin = dfsan_read_origin_of_first_taint(s, sizeof(*s))` at 
the start of the function, just after declaring and assigning base.



Comment at: compiler-rt/lib/dfsan/dfsan_custom.cpp:243
+if (*ret_label) {
+  dfsan_origin o = dfsan_read_origin_of_first_taint(base, strlen(base));
+  if (o) {

Also use `base_origin` here.



Comment at: compiler-rt/test/dfsan/custom.cpp:1641
+  // taint delim pointer
+  dfsan_set_label(j_label, _delim, sizeof(_delim));
+  // taint the string data bytes

remove the &

It still works because `sizeof(pointer_var) == sizeof(_var)`, but it 
doesn't logically match up like it should.

(Sorry, this one is my fault - I made this mistake giving an example in an 
earlier comment.)



Comment at: compiler-rt/test/dfsan/custom.cpp:1645
+  // taint the string pointer
+  dfsan_set_label(m_label, _s, sizeof(_s));
+

remove & in sizeof



Comment at: compiler-rt/test/dfsan/custom.cpp:1650
+#ifdef STRICT_DATA_DEPENDENCIES
+  ASSERT_LABEL(rv, k_label);
+  ASSERT_READ_LABEL(rv, strlen(rv), k_label);

The value `rv` has a data flow from the the string pointer, not the string 
bytes.
It should have `m_label` not `k_label`.

This is related to line 221 (using `base` instead of ``) in the other file.



Comment at: compiler-rt/test/dfsan/custom.cpp:1654
+  ASSERT_LABEL(rv, dfsan_union(dfsan_union(i_label, j_label), k_label));
+  ASSERT_INIT_ORIGIN_EQ_ORIGIN(, *s);
+#endif

rv is a pointer, and I think it's origin should match the pointer s, not the 
bytes in the string.

I think this should be:

`ASSERT_INIT_ORIGIN_EQ_ORIGIN(, s);`

This is related to line 240 (`*ret_origin = 
dfsan_read_origin_of_first_taint(base, strlen(base));`) in the other file being 
the wrong level of indirection.


(Side note: the existing ASSERT_INIT_ORIGIN_EQ_ORIGIN macro feels a bit odd in 
that the arguments are at different levels of indirection - but not something 
to fix as part of this change)



Comment at: compiler-rt/test/dfsan/custom.cpp:1660
+  char **pp_s_base = pp_s;
+  dfsan_set_label(n_label, _s, sizeof(_s));
+

remove & in sizeof


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D141389/new/

https://reviews.llvm.org/D141389

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D141389: [DFSAN] Add support for strnlen, strncat, strsep, sscanf and _tolower

2023-03-31 Thread Tomasz Kuchta via Phabricator via cfe-commits
tkuchta added a comment.

there is a strange build error which seems unrelated to my change - please let 
me know if that's an issue, I will try to rebase to newest master then


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D141389/new/

https://reviews.llvm.org/D141389

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D141389: [DFSAN] Add support for strnlen, strncat, strsep, sscanf and _tolower

2023-03-31 Thread Tomasz Kuchta via Phabricator via cfe-commits
tkuchta updated this revision to Diff 510047.
tkuchta added a comment.

Updates after the review of strsep.


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D141389/new/

https://reviews.llvm.org/D141389

Files:
  compiler-rt/lib/dfsan/dfsan_custom.cpp
  compiler-rt/lib/dfsan/done_abilist.txt
  compiler-rt/test/dfsan/custom.cpp

Index: compiler-rt/test/dfsan/custom.cpp
===
--- compiler-rt/test/dfsan/custom.cpp
+++ compiler-rt/test/dfsan/custom.cpp
@@ -1627,6 +1627,50 @@
 #endif
 }
 
+void test_strsep() {
+  char *s = strdup("Hello world/");
+  char *delim = strdup(" /");
+
+  char *p_s = s;
+  char *base = s;
+  char *p_delim = delim;
+
+  // taint delim bytes
+  dfsan_set_label(i_label, p_delim, strlen(p_delim));
+  // taint delim pointer
+  dfsan_set_label(j_label, _delim, sizeof(_delim));
+  // taint the string data bytes
+  dfsan_set_label(k_label, s, 5);
+  // taint the string pointer
+  dfsan_set_label(m_label, _s, sizeof(_s));
+
+  char *rv = strsep(_s, p_delim);
+  assert(rv == [0]);
+#ifdef STRICT_DATA_DEPENDENCIES
+  ASSERT_LABEL(rv, k_label);
+  ASSERT_READ_LABEL(rv, strlen(rv), k_label);
+#else
+  ASSERT_LABEL(rv, dfsan_union(dfsan_union(i_label, j_label), k_label));
+  ASSERT_INIT_ORIGIN_EQ_ORIGIN(, *s);
+#endif
+
+  // taint the remaining string's pointer
+  char **pp_s = _s;
+  char **pp_s_base = pp_s;
+  dfsan_set_label(n_label, _s, sizeof(_s));
+
+  rv = strsep(pp_s, p_delim);
+
+  assert(rv == [6]);
+#ifdef STRICT_DATA_DEPENDENCIES
+  ASSERT_ZERO_LABEL(rv);
+  ASSERT_ZERO_ORIGIN(rv);
+#else
+  ASSERT_LABEL(rv, dfsan_union(i_label, dfsan_union(j_label, n_label)));
+  ASSERT_INIT_ORIGIN_EQ_ORIGIN(, *p_delim);
+#endif
+}
+
 void test_memchr() {
   char str1[] = "str1";
   dfsan_set_label(i_label, [3], 1);
@@ -2041,6 +2085,7 @@
   test_strncmp();
   test_strncpy();
   test_strpbrk();
+  test_strsep();
   test_strrchr();
   test_strstr();
   test_strtod();
Index: compiler-rt/lib/dfsan/done_abilist.txt
===
--- compiler-rt/lib/dfsan/done_abilist.txt
+++ compiler-rt/lib/dfsan/done_abilist.txt
@@ -283,6 +283,7 @@
 fun:strpbrk=custom
 fun:strrchr=custom
 fun:strstr=custom
+fun:strsep=custom
 
 # Functions which take action based on global state, such as running a callback
 # set by a separate function.
Index: compiler-rt/lib/dfsan/dfsan_custom.cpp
===
--- compiler-rt/lib/dfsan/dfsan_custom.cpp
+++ compiler-rt/lib/dfsan/dfsan_custom.cpp
@@ -204,6 +204,55 @@
   return const_cast(ret);
 }
 
+SANITIZER_INTERFACE_ATTRIBUTE char *__dfsw_strsep(char **s, const char *delim,
+  dfsan_label s_label,
+  dfsan_label delim_label,
+  dfsan_label *ret_label) {
+  char *base = *s;
+  char *res = strsep(s, delim);
+  if (res != *s) {
+char *token_start = res;
+int token_length = strlen(res);
+// the delimiter byte has been set to NULL
+dfsan_set_label(0, token_start + token_length, 1);
+  }
+
+  if (flags().strict_data_dependencies) {
+*ret_label = res ? dfsan_read_label(base, sizeof(base)) : 0;
+  } else {
+*ret_label =
+dfsan_union(dfsan_read_label(base, sizeof(base)),
+dfsan_union(dfsan_read_label(delim, strlen(delim) + 1),
+dfsan_union(s_label, delim_label)));
+  }
+
+  return res;
+}
+
+SANITIZER_INTERFACE_ATTRIBUTE char *__dfso_strsep(
+char **s, const char *delim, dfsan_label s_label, dfsan_label delim_label,
+dfsan_label *ret_label, dfsan_origin s_origin, dfsan_origin delim_origin,
+dfsan_origin *ret_origin) {
+  char *base = *s;
+  char *res = __dfsw_strsep(s, delim, s_label, delim_label, ret_label);
+  if (flags().strict_data_dependencies) {
+if (res)
+  *ret_origin = dfsan_read_origin_of_first_taint(base, strlen(base));
+  } else {
+if (*ret_label) {
+  dfsan_origin o = dfsan_read_origin_of_first_taint(base, strlen(base));
+  if (o) {
+*ret_origin = o;
+  } else {
+o = dfsan_read_origin_of_first_taint(delim, strlen(delim) + 1);
+*ret_origin = o ? o : (s_label ? s_origin : delim_origin);
+  }
+}
+  }
+
+  return res;
+}
+
 static int dfsan_memcmp_bcmp(const void *s1, const void *s2, size_t n,
  size_t *bytes_read) {
   const char *cs1 = (const char *) s1, *cs2 = (const char *) s2;
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D141389: [DFSAN] Add support for strnlen, strncat, strsep, sscanf and _tolower

2023-03-31 Thread Tomasz Kuchta via Phabricator via cfe-commits
tkuchta updated this revision to Diff 510041.
tkuchta marked an inline comment as done.
tkuchta added a comment.

Hello, I applied the review comments for strsep.


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D141389/new/

https://reviews.llvm.org/D141389

Files:
  compiler-rt/lib/dfsan/dfsan_custom.cpp
  compiler-rt/test/dfsan/custom.cpp

Index: compiler-rt/test/dfsan/custom.cpp
===
--- compiler-rt/test/dfsan/custom.cpp
+++ compiler-rt/test/dfsan/custom.cpp
@@ -1636,80 +1636,38 @@
   char *p_delim = delim;
 
   // taint delim bytes
-  dfsan_set_label(n_label, p_delim, strlen(p_delim));
+  dfsan_set_label(i_label, p_delim, strlen(p_delim));
   // taint delim pointer
-  dfsan_set_label(i_label, _delim, sizeof(_delim));
+  dfsan_set_label(j_label, _delim, sizeof(_delim));
+  // taint the string data bytes
+  dfsan_set_label(k_label, s, 5);
+  // taint the string pointer
+  dfsan_set_label(m_label, _s, sizeof(_s));
 
   char *rv = strsep(_s, p_delim);
   assert(rv == [0]);
 #ifdef STRICT_DATA_DEPENDENCIES
-  ASSERT_ZERO_LABEL(rv);
-  ASSERT_READ_ZERO_LABEL(rv, strlen(rv));
+  ASSERT_LABEL(rv, k_label);
+  ASSERT_READ_LABEL(rv, strlen(rv), k_label);
 #else
-  ASSERT_LABEL(rv, dfsan_union(i_label, n_label));
-  ASSERT_INIT_ORIGIN_EQ_ORIGIN(, *p_delim);
+  ASSERT_LABEL(rv, dfsan_union(dfsan_union(i_label, j_label), k_label));
+  ASSERT_INIT_ORIGIN_EQ_ORIGIN(, *s);
 #endif
 
-  // taint the remaining string's bytes
-  dfsan_set_label(m_label, p_s, strlen(p_s));
   // taint the remaining string's pointer
   char **pp_s = _s;
-  dfsan_set_label(j_label, _s, sizeof(_s));
+  char **pp_s_base = pp_s;
+  dfsan_set_label(n_label, _s, sizeof(_s));
 
   rv = strsep(pp_s, p_delim);
 
   assert(rv == [6]);
-#ifdef STRICT_DATA_DEPENDENCIES
-  ASSERT_READ_LABEL(rv, strlen(rv), m_label);
-  ASSERT_LABEL(rv, j_label);
-  ASSERT_INIT_ORIGIN_EQ_ORIGIN(, pp_s);
-#else
-  ASSERT_LABEL(
-  rv, dfsan_union(j_label,
-  dfsan_union(i_label, dfsan_union(m_label, n_label;
-  ASSERT_INIT_ORIGIN_EQ_ORIGIN(, base[6]);
-#endif
-
-  free(s);
-  s = strdup("Hello world/");
-  base = s;
-  free(delim);
-  delim = strdup(" /");
-  p_delim = delim;
-
-  dfsan_set_label(j_label, [0], 1);
-
-  rv = strsep(, delim);
-  assert(rv == [0]);
 #ifdef STRICT_DATA_DEPENDENCIES
   ASSERT_ZERO_LABEL(rv);
+  ASSERT_ZERO_ORIGIN(rv);
 #else
-  ASSERT_LABEL(rv, j_label);
-  ASSERT_INIT_ORIGIN_EQ_ORIGIN(, delim[1]);
-#endif
-
-  char *ps = s;
-  pp_s = 
-  dfsan_set_label(i_label, _s, sizeof(_s));
-  dfsan_set_label(i_label, ps, strlen(ps));
-  dfsan_set_label(dfsan_union(j_label, dfsan_read_label(ps, strlen(ps))), ps,
-  strlen(ps));
-  rv = strsep(pp_s, " /");
-  assert(rv == [6]);
-#ifdef STRICT_DATA_DEPENDENCIES
-  ASSERT_LABEL(rv, i_label);
-#else
-  ASSERT_LABEL(rv, i_j_label);
-  ASSERT_INIT_ORIGIN_EQ_ORIGIN(, base[6]);
-#endif
-  rv = strsep(, " /");
-  assert(strlen(rv) == 0);
-#ifdef STRICT_DATA_DEPENDENCIES
-  ASSERT_ZERO_LABEL(ps);
-#else
-  ASSERT_ZERO_LABEL(rv);
-  ASSERT_INIT_ORIGIN_EQ_ORIGIN(, 0);
-
+  ASSERT_LABEL(rv, dfsan_union(i_label, dfsan_union(j_label, n_label)));
+  ASSERT_INIT_ORIGIN_EQ_ORIGIN(, *p_delim);
 #endif
 }
 
Index: compiler-rt/lib/dfsan/dfsan_custom.cpp
===
--- compiler-rt/lib/dfsan/dfsan_custom.cpp
+++ compiler-rt/lib/dfsan/dfsan_custom.cpp
@@ -218,11 +218,10 @@
   }
 
   if (flags().strict_data_dependencies) {
-*ret_label = res ? s_label : 0;
+*ret_label = res ? dfsan_read_label(base, sizeof(base)) : 0;
   } else {
-size_t s_bytes_read = (res ? strlen(res) : strlen(base)) + 1;
 *ret_label =
-dfsan_union(dfsan_read_label(base, s_bytes_read),
+dfsan_union(dfsan_read_label(base, sizeof(base)),
 dfsan_union(dfsan_read_label(delim, strlen(delim) + 1),
 dfsan_union(s_label, delim_label)));
   }
@@ -238,11 +237,10 @@
   char *res = __dfsw_strsep(s, delim, s_label, delim_label, ret_label);
   if (flags().strict_data_dependencies) {
 if (res)
-  *ret_origin = s_origin;
+  *ret_origin = dfsan_read_origin_of_first_taint(base, strlen(base));
   } else {
 if (*ret_label) {
-  size_t s_bytes_read = (res ? strlen(res) : strlen(base)) + 1;
-  dfsan_origin o = dfsan_read_origin_of_first_taint(base, s_bytes_read);
+  dfsan_origin o = dfsan_read_origin_of_first_taint(base, strlen(base));
   if (o) {
 *ret_origin = o;
   } else {
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D141389: [DFSAN] Add support for strnlen, strncat, strsep, sscanf and _tolower

2023-03-02 Thread Andrew via Phabricator via cfe-commits
browneee added inline comments.



Comment at: compiler-rt/lib/dfsan/dfsan_custom.cpp:221
+  if (flags().strict_data_dependencies) {
+*ret_label = res ? s_label : 0;
+  } else {

tkuchta wrote:
> browneee wrote:
> > When `res != NULL`, then `res` is derived from `*s`, not from `s`.
> > 
> > e.g.
> > 
> > ```
> > *ret_label = res ? dfsan_get_label(base) : 0;
> > ```
> Apologies for a delay.
> 
> I came across some difficulty with using dfsan_get_label inside the 
> dfsan_custom.cpp file.
> It seems that including the dfsan_interface.h header there, which would be 
> needed for dfsan_get_label, causes other conflicts and build errors.
> Would there be another way to use that function inside dfsan_custom.cpp? 
```
char *base = *s;
```

Here base is loaded, so we can also load the shadow.

`dfsan_get_label(base) == dfsan_read_label(s, sizeof(*s))`

If base was an argument, then it would have a corresponding label argument as 
part of the wrapper function, so you also wouldn't need `dfsan_get_label`.


Repository:
  rCRT Compiler Runtime

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D141389/new/

https://reviews.llvm.org/D141389

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D141389: [DFSAN] Add support for strnlen, strncat, strsep, sscanf and _tolower

2023-03-01 Thread Tomasz Kuchta via Phabricator via cfe-commits
tkuchta added inline comments.



Comment at: compiler-rt/lib/dfsan/dfsan_custom.cpp:221
+  if (flags().strict_data_dependencies) {
+*ret_label = res ? s_label : 0;
+  } else {

browneee wrote:
> When `res != NULL`, then `res` is derived from `*s`, not from `s`.
> 
> e.g.
> 
> ```
> *ret_label = res ? dfsan_get_label(base) : 0;
> ```
Apologies for a delay.

I came across some difficulty with using dfsan_get_label inside the 
dfsan_custom.cpp file.
It seems that including the dfsan_interface.h header there, which would be 
needed for dfsan_get_label, causes other conflicts and build errors.
Would there be another way to use that function inside dfsan_custom.cpp? 


Repository:
  rCRT Compiler Runtime

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D141389/new/

https://reviews.llvm.org/D141389

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D141389: [DFSAN] Add support for strnlen, strncat, strsep, sscanf and _tolower

2023-02-07 Thread Andrew via Phabricator via cfe-commits
browneee added inline comments.



Comment at: compiler-rt/lib/dfsan/dfsan_custom.cpp:221
+  if (flags().strict_data_dependencies) {
+*ret_label = res ? s_label : 0;
+  } else {

When `res != NULL`, then `res` is derived from `*s`, not from `s`.

e.g.

```
*ret_label = res ? dfsan_get_label(base) : 0;
```



Comment at: compiler-rt/lib/dfsan/dfsan_custom.cpp:224
+size_t s_bytes_read = (res ? strlen(res) : strlen(base)) + 1;
+*ret_label =
+dfsan_union(dfsan_read_label(base, s_bytes_read),

I think `dfsan_get_label(base)` should also be a part of this output.



Comment at: compiler-rt/lib/dfsan/dfsan_custom.cpp:241
+if (res)
+  *ret_origin = s_origin;
+  } else {

As above, when res != NULL, then res is derived from *s, not from s.

```
if (res)
*ret_origin = dfsan_get_origin(base);
```



Comment at: compiler-rt/lib/dfsan/dfsan_custom.cpp:243
+  } else {
+if (*ret_label) {
+  size_t s_bytes_read = (res ? strlen(res) : strlen(base)) + 1;

As above, I think `dfsan_get_origin(base)` should be the first priority, as it 
is the origin with the most direct value flow.

Note that `dfsan_get_origin(base)` gives the origin id for `base` whereas 
`dfsan_read_origin_of_first_taint(base, ...)` gives origin id for `*base`.



Comment at: compiler-rt/test/dfsan/custom.cpp:1659
+
+  rv = strsep(pp_s, p_delim);
+

I think this test can call strsep fewer times. It just needs to set all the 
different labels first.

Different inputs to taint (each one should be a distinct taint label):
* `*p_delim`   (with `dfsan_set_label(..., p_delim, strlen(p_delim));`)
* `p_delim`   (with `dfsan_set_label(..., _delim, sizeof(_delim));`)
* `*s`   (with `dfsan_set_label(..., s, strlen(s));`)
* `s` aka `p_s`   (with `dfsan_set_label(..., , sizeof());` or 
`dfsan_set_label(..., _s, sizeof(_s));`)
* `pp_s` (with `dfsan_set_label(..., _s, sizeof(_s));`)

This is less than 8, so we can do it all at once (unless you want to test 
different taint labels on different bytes of the input strings).


Repository:
  rCRT Compiler Runtime

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D141389/new/

https://reviews.llvm.org/D141389

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D141389: [DFSAN] Add support for strnlen, strncat, strsep, sscanf and _tolower

2023-02-06 Thread Tomasz Kuchta via Phabricator via cfe-commits
tkuchta updated this revision to Diff 495268.
tkuchta added a comment.

Please find the patch for strsep only attached


Repository:
  rCRT Compiler Runtime

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D141389/new/

https://reviews.llvm.org/D141389

Files:
  compiler-rt/lib/dfsan/dfsan_custom.cpp
  compiler-rt/lib/dfsan/done_abilist.txt
  compiler-rt/test/dfsan/custom.cpp

Index: compiler-rt/test/dfsan/custom.cpp
===
--- compiler-rt/test/dfsan/custom.cpp
+++ compiler-rt/test/dfsan/custom.cpp
@@ -1627,6 +1627,92 @@
 #endif
 }
 
+void test_strsep() {
+  char *s = strdup("Hello world/");
+  char *delim = strdup(" /");
+
+  char *p_s = s;
+  char *base = s;
+  char *p_delim = delim;
+
+  // taint delim bytes
+  dfsan_set_label(n_label, p_delim, strlen(p_delim));
+  // taint delim pointer
+  dfsan_set_label(i_label, _delim, sizeof(_delim));
+
+  char *rv = strsep(_s, p_delim);
+  assert(rv == [0]);
+#ifdef STRICT_DATA_DEPENDENCIES
+  ASSERT_ZERO_LABEL(rv);
+  ASSERT_READ_ZERO_LABEL(rv, strlen(rv));
+#else
+  ASSERT_LABEL(rv, dfsan_union(i_label, n_label));
+  ASSERT_INIT_ORIGIN_EQ_ORIGIN(, *p_delim);
+#endif
+
+  // taint the remaining string's bytes
+  dfsan_set_label(m_label, p_s, strlen(p_s));
+  // taint the remaining string's pointer
+  char **pp_s = _s;
+  dfsan_set_label(j_label, _s, sizeof(_s));
+
+  rv = strsep(pp_s, p_delim);
+
+  assert(rv == [6]);
+#ifdef STRICT_DATA_DEPENDENCIES
+  ASSERT_READ_LABEL(rv, strlen(rv), m_label);
+  ASSERT_LABEL(rv, j_label);
+  ASSERT_INIT_ORIGIN_EQ_ORIGIN(, pp_s);
+#else
+  ASSERT_LABEL(
+  rv, dfsan_union(j_label,
+  dfsan_union(i_label, dfsan_union(m_label, n_label;
+  ASSERT_INIT_ORIGIN_EQ_ORIGIN(, base[6]);
+#endif
+
+  free(s);
+  s = strdup("Hello world/");
+  base = s;
+  free(delim);
+  delim = strdup(" /");
+  p_delim = delim;
+
+  dfsan_set_label(j_label, [0], 1);
+
+  rv = strsep(, delim);
+  assert(rv == [0]);
+#ifdef STRICT_DATA_DEPENDENCIES
+  ASSERT_ZERO_LABEL(rv);
+#else
+  ASSERT_LABEL(rv, j_label);
+  ASSERT_INIT_ORIGIN_EQ_ORIGIN(, delim[1]);
+#endif
+
+  char *ps = s;
+  pp_s = 
+  dfsan_set_label(i_label, _s, sizeof(_s));
+  dfsan_set_label(i_label, ps, strlen(ps));
+  dfsan_set_label(dfsan_union(j_label, dfsan_read_label(ps, strlen(ps))), ps,
+  strlen(ps));
+  rv = strsep(pp_s, " /");
+  assert(rv == [6]);
+#ifdef STRICT_DATA_DEPENDENCIES
+  ASSERT_LABEL(rv, i_label);
+#else
+  ASSERT_LABEL(rv, i_j_label);
+  ASSERT_INIT_ORIGIN_EQ_ORIGIN(, base[6]);
+#endif
+  rv = strsep(, " /");
+  assert(strlen(rv) == 0);
+#ifdef STRICT_DATA_DEPENDENCIES
+  ASSERT_ZERO_LABEL(ps);
+#else
+  ASSERT_ZERO_LABEL(rv);
+  ASSERT_INIT_ORIGIN_EQ_ORIGIN(, 0);
+
+#endif
+}
+
 void test_memchr() {
   char str1[] = "str1";
   dfsan_set_label(i_label, [3], 1);
@@ -2041,6 +2127,7 @@
   test_strncmp();
   test_strncpy();
   test_strpbrk();
+  test_strsep();
   test_strrchr();
   test_strstr();
   test_strtod();
Index: compiler-rt/lib/dfsan/done_abilist.txt
===
--- compiler-rt/lib/dfsan/done_abilist.txt
+++ compiler-rt/lib/dfsan/done_abilist.txt
@@ -283,6 +283,7 @@
 fun:strpbrk=custom
 fun:strrchr=custom
 fun:strstr=custom
+fun:strsep=custom
 
 # Functions which take action based on global state, such as running a callback
 # set by a separate function.
Index: compiler-rt/lib/dfsan/dfsan_custom.cpp
===
--- compiler-rt/lib/dfsan/dfsan_custom.cpp
+++ compiler-rt/lib/dfsan/dfsan_custom.cpp
@@ -204,6 +204,57 @@
   return const_cast(ret);
 }
 
+SANITIZER_INTERFACE_ATTRIBUTE char *__dfsw_strsep(char **s, const char *delim,
+  dfsan_label s_label,
+  dfsan_label delim_label,
+  dfsan_label *ret_label) {
+  char *base = *s;
+  char *res = strsep(s, delim);
+  if (res != *s) {
+char *token_start = res;
+int token_length = strlen(res);
+// the delimiter byte has been set to NULL
+dfsan_set_label(0, token_start + token_length, 1);
+  }
+
+  if (flags().strict_data_dependencies) {
+*ret_label = res ? s_label : 0;
+  } else {
+size_t s_bytes_read = (res ? strlen(res) : strlen(base)) + 1;
+*ret_label =
+dfsan_union(dfsan_read_label(base, s_bytes_read),
+dfsan_union(dfsan_read_label(delim, strlen(delim) + 1),
+dfsan_union(s_label, delim_label)));
+  }
+
+  return res;
+}
+
+SANITIZER_INTERFACE_ATTRIBUTE char *__dfso_strsep(
+char **s, const char *delim, dfsan_label s_label, dfsan_label delim_label,
+dfsan_label *ret_label, dfsan_origin s_origin, dfsan_origin delim_origin,
+dfsan_origin *ret_origin) {
+  char *base = *s;
+  char *res = __dfsw_strsep(s, delim, s_label, delim_label, ret_label);
+  if 

[PATCH] D141389: [DFSAN] Add support for strnlen, strncat, strsep, sscanf and _tolower

2023-02-04 Thread Tomasz Kuchta via Phabricator via cfe-commits
tkuchta added inline comments.



Comment at: compiler-rt/lib/dfsan/dfsan_custom.cpp:213
+  char *res = strsep(s, delim);
+  s_label = dfsan_read_label(base, strlen(base));
+  if (res && (res != base)) {

browneee wrote:
> tkuchta wrote:
> > browneee wrote:
> > > tkuchta wrote:
> > > > browneee wrote:
> > > > > tkuchta wrote:
> > > > > > browneee wrote:
> > > > > > > The `s_label` represents the taint label for `s` (the pointer).
> > > > > > > 
> > > > > > > This line would clobber the taint label of the pointer (`s`) with 
> > > > > > > a taint label from `s[0][0..n]`.
> > > > > > > 
> > > > > > > I think this line should be deleted.
> > > > > > Agree, s_label represents the taint associated with the **s 
> > > > > > pointer. However I am now wondering if that is the taint wich we 
> > > > > > would like to return.
> > > > > > For example, if we have
> > > > > > if (flags().strict_data_dependencies) {
> > > > > > *ret_label = res ? s_label : 0;
> > > > > > 
> > > > > > We would taint the return value with the value of the pointer, not 
> > > > > > the data. It means that if we operate on a string for which the 
> > > > > > characters are tainted, but the pointer itself isn't, we are likely 
> > > > > > going to return label 0. My understanding was that we are more 
> > > > > > concerned with the taint of the data, not the pointer, am I missing 
> > > > > > something?
> > > > > > 
> > > > > Yes, we are usually more concerned with the taint of the data, not 
> > > > > the pointer.
> > > > > 
> > > > > With strict dependencies:
> > > > > // If the input pointer is tainted, the output pointer would be 
> > > > > tainted (because it is derived from the input pointer - maybe the 
> > > > > same value).
> > > > > taint(s[0]) == dfsan_read_label(s, sizeof(s)) > taint(ret) == 
> > > > > ret_label[0]
> > > > > 
> > > > > // If the input data is tainted, the output data would be tainted 
> > > > > (because it is derived from the input data).
> > > > > taint(s[0][0]) == MEM_TO_SHADOW(s[0])[0] > taint(ret[0]) == 
> > > > > MEM_TO_SHADOW(ret)[0]
> > > > > 
> > > > > Because s[0] == ret  (or ret==null), (for the non-null case) the 
> > > > > output shadow bytes are the same bytes as input shadow bytes and so 
> > > > > these taint labels for the string data in shadow memory do not need 
> > > > > to be explicitly propagated in this function. 
> > > > > 
> > > > > I think the only case actually changing/copying string data is 
> > > > > writing a delimiter byte to NULL, which you handled.
> > > > I am working on the changes and I came across a strange behavior that I 
> > > > would like to ask about.
> > > > 
> > > > It turned out that if we do
> > > > 
> > > > char *s = " ... ";
> > > > dfsan_set_label(label, _s, sizeof(_s));
> > > > 
> > > > Then, the s_label within the handler is 0, not "label". This is 
> > > > unexpected, as we would like the pointer itself to be labeled here.
> > > > I believe this has something to do with the fact that the input string 
> > > > in strsep is a double pointer. For example this works as expected for 
> > > > the delimiter string, which is a single pointer. 
> > > > It's either I'm doing something incorrectly or there is some issue with 
> > > > propagating labels for double pointers.
> > > > Have you perhaps come across this behavior before?
> > > I'm not sure what p_s is in your example. Could you provide a more 
> > > complete example?
> > > (and maybe all in one function, not half in __dfsw_strsep and half in 
> > > another function)
> > > 
> > > Here is an example showing taint labels at different levels of 
> > > indirection:
> > > 
> > > ```
> > > #include 
> > > #include 
> > > #include 
> > > 
> > > int main(void) {
> > >   char *s = " ... ";
> > >   char **p_s = 
> > >   char ***pp_s = _s;
> > > 
> > >   dfsan_label i_label = 1 << 0;
> > >   dfsan_label j_label = 1 << 1;
> > >   dfsan_label k_label = 1 << 2;
> > >   dfsan_label m_label = 1 << 3;
> > > 
> > >   // string data
> > >   dfsan_set_label(i_label, s, strlen(s));
> > >   // pointer s
> > >   dfsan_set_label(j_label, , sizeof(s));
> > >   // pointer to pointer s
> > >   dfsan_set_label(k_label, _s, sizeof(p_s));
> > >   // pointer to pointer to pointer s
> > >   dfsan_set_label(m_label, _s, sizeof(pp_s));
> > > 
> > >   assert(pp_s[0][0] == s);
> > > 
> > >   // string data
> > >   assert(dfsan_read_label(s, strlen(s)) == i_label);
> > >   // pointer s
> > >   assert(dfsan_read_label(, sizeof(s)) == j_label);
> > >   // pointer to pointer s
> > >   assert(dfsan_read_label(_s, sizeof(p_s)) == k_label);
> > >   // pointer to pointer to pointer s
> > >   assert(dfsan_read_label(_s, sizeof(pp_s)) == m_label);
> > > 
> > >   return 0;
> > > }
> > > ```
> > Hello,
> > 
> > Thank you for the comment.
> > 
> > I should have provided a more complete example.
> > What I meant is a behavior I've found while working on the tests.
> > In the test file I have something like that:
> > 
> 

[PATCH] D141389: [DFSAN] Add support for strnlen, strncat, strsep, sscanf and _tolower

2023-02-03 Thread Andrew via Phabricator via cfe-commits
browneee added inline comments.



Comment at: compiler-rt/lib/dfsan/dfsan_custom.cpp:213
+  char *res = strsep(s, delim);
+  s_label = dfsan_read_label(base, strlen(base));
+  if (res && (res != base)) {

tkuchta wrote:
> browneee wrote:
> > tkuchta wrote:
> > > browneee wrote:
> > > > tkuchta wrote:
> > > > > browneee wrote:
> > > > > > The `s_label` represents the taint label for `s` (the pointer).
> > > > > > 
> > > > > > This line would clobber the taint label of the pointer (`s`) with a 
> > > > > > taint label from `s[0][0..n]`.
> > > > > > 
> > > > > > I think this line should be deleted.
> > > > > Agree, s_label represents the taint associated with the **s pointer. 
> > > > > However I am now wondering if that is the taint wich we would like to 
> > > > > return.
> > > > > For example, if we have
> > > > > if (flags().strict_data_dependencies) {
> > > > > *ret_label = res ? s_label : 0;
> > > > > 
> > > > > We would taint the return value with the value of the pointer, not 
> > > > > the data. It means that if we operate on a string for which the 
> > > > > characters are tainted, but the pointer itself isn't, we are likely 
> > > > > going to return label 0. My understanding was that we are more 
> > > > > concerned with the taint of the data, not the pointer, am I missing 
> > > > > something?
> > > > > 
> > > > Yes, we are usually more concerned with the taint of the data, not the 
> > > > pointer.
> > > > 
> > > > With strict dependencies:
> > > > // If the input pointer is tainted, the output pointer would be tainted 
> > > > (because it is derived from the input pointer - maybe the same value).
> > > > taint(s[0]) == dfsan_read_label(s, sizeof(s)) > taint(ret) == 
> > > > ret_label[0]
> > > > 
> > > > // If the input data is tainted, the output data would be tainted 
> > > > (because it is derived from the input data).
> > > > taint(s[0][0]) == MEM_TO_SHADOW(s[0])[0] > taint(ret[0]) == 
> > > > MEM_TO_SHADOW(ret)[0]
> > > > 
> > > > Because s[0] == ret  (or ret==null), (for the non-null case) the output 
> > > > shadow bytes are the same bytes as input shadow bytes and so these 
> > > > taint labels for the string data in shadow memory do not need to be 
> > > > explicitly propagated in this function. 
> > > > 
> > > > I think the only case actually changing/copying string data is writing 
> > > > a delimiter byte to NULL, which you handled.
> > > I am working on the changes and I came across a strange behavior that I 
> > > would like to ask about.
> > > 
> > > It turned out that if we do
> > > 
> > > char *s = " ... ";
> > > dfsan_set_label(label, _s, sizeof(_s));
> > > 
> > > Then, the s_label within the handler is 0, not "label". This is 
> > > unexpected, as we would like the pointer itself to be labeled here.
> > > I believe this has something to do with the fact that the input string in 
> > > strsep is a double pointer. For example this works as expected for the 
> > > delimiter string, which is a single pointer. 
> > > It's either I'm doing something incorrectly or there is some issue with 
> > > propagating labels for double pointers.
> > > Have you perhaps come across this behavior before?
> > I'm not sure what p_s is in your example. Could you provide a more complete 
> > example?
> > (and maybe all in one function, not half in __dfsw_strsep and half in 
> > another function)
> > 
> > Here is an example showing taint labels at different levels of indirection:
> > 
> > ```
> > #include 
> > #include 
> > #include 
> > 
> > int main(void) {
> >   char *s = " ... ";
> >   char **p_s = 
> >   char ***pp_s = _s;
> > 
> >   dfsan_label i_label = 1 << 0;
> >   dfsan_label j_label = 1 << 1;
> >   dfsan_label k_label = 1 << 2;
> >   dfsan_label m_label = 1 << 3;
> > 
> >   // string data
> >   dfsan_set_label(i_label, s, strlen(s));
> >   // pointer s
> >   dfsan_set_label(j_label, , sizeof(s));
> >   // pointer to pointer s
> >   dfsan_set_label(k_label, _s, sizeof(p_s));
> >   // pointer to pointer to pointer s
> >   dfsan_set_label(m_label, _s, sizeof(pp_s));
> > 
> >   assert(pp_s[0][0] == s);
> > 
> >   // string data
> >   assert(dfsan_read_label(s, strlen(s)) == i_label);
> >   // pointer s
> >   assert(dfsan_read_label(, sizeof(s)) == j_label);
> >   // pointer to pointer s
> >   assert(dfsan_read_label(_s, sizeof(p_s)) == k_label);
> >   // pointer to pointer to pointer s
> >   assert(dfsan_read_label(_s, sizeof(pp_s)) == m_label);
> > 
> >   return 0;
> > }
> > ```
> Hello,
> 
> Thank you for the comment.
> 
> I should have provided a more complete example.
> What I meant is a behavior I've found while working on the tests.
> In the test file I have something like that:
> 
> ```
> char *s = "Hello world/";
> char *delim = " /";
> dfsan_set_label(label, , sizeof());
> char *rv = strep(, delim);
>  ```
> 
> If I get this right, the line 
> ```
> dfsan_set_label(label, , sizeof());
> 
> ``` 
> should have applied a taint to the `s` 

[PATCH] D141389: [DFSAN] Add support for strnlen, strncat, strsep, sscanf and _tolower

2023-02-02 Thread Tomasz Kuchta via Phabricator via cfe-commits
tkuchta added inline comments.



Comment at: compiler-rt/lib/dfsan/dfsan_custom.cpp:213
+  char *res = strsep(s, delim);
+  s_label = dfsan_read_label(base, strlen(base));
+  if (res && (res != base)) {

browneee wrote:
> tkuchta wrote:
> > browneee wrote:
> > > tkuchta wrote:
> > > > browneee wrote:
> > > > > The `s_label` represents the taint label for `s` (the pointer).
> > > > > 
> > > > > This line would clobber the taint label of the pointer (`s`) with a 
> > > > > taint label from `s[0][0..n]`.
> > > > > 
> > > > > I think this line should be deleted.
> > > > Agree, s_label represents the taint associated with the **s pointer. 
> > > > However I am now wondering if that is the taint wich we would like to 
> > > > return.
> > > > For example, if we have
> > > > if (flags().strict_data_dependencies) {
> > > > *ret_label = res ? s_label : 0;
> > > > 
> > > > We would taint the return value with the value of the pointer, not the 
> > > > data. It means that if we operate on a string for which the characters 
> > > > are tainted, but the pointer itself isn't, we are likely going to 
> > > > return label 0. My understanding was that we are more concerned with 
> > > > the taint of the data, not the pointer, am I missing something?
> > > > 
> > > Yes, we are usually more concerned with the taint of the data, not the 
> > > pointer.
> > > 
> > > With strict dependencies:
> > > // If the input pointer is tainted, the output pointer would be tainted 
> > > (because it is derived from the input pointer - maybe the same value).
> > > taint(s[0]) == dfsan_read_label(s, sizeof(s)) > taint(ret) == 
> > > ret_label[0]
> > > 
> > > // If the input data is tainted, the output data would be tainted 
> > > (because it is derived from the input data).
> > > taint(s[0][0]) == MEM_TO_SHADOW(s[0])[0] > taint(ret[0]) == 
> > > MEM_TO_SHADOW(ret)[0]
> > > 
> > > Because s[0] == ret  (or ret==null), (for the non-null case) the output 
> > > shadow bytes are the same bytes as input shadow bytes and so these taint 
> > > labels for the string data in shadow memory do not need to be explicitly 
> > > propagated in this function. 
> > > 
> > > I think the only case actually changing/copying string data is writing a 
> > > delimiter byte to NULL, which you handled.
> > I am working on the changes and I came across a strange behavior that I 
> > would like to ask about.
> > 
> > It turned out that if we do
> > 
> > char *s = " ... ";
> > dfsan_set_label(label, _s, sizeof(_s));
> > 
> > Then, the s_label within the handler is 0, not "label". This is unexpected, 
> > as we would like the pointer itself to be labeled here.
> > I believe this has something to do with the fact that the input string in 
> > strsep is a double pointer. For example this works as expected for the 
> > delimiter string, which is a single pointer. 
> > It's either I'm doing something incorrectly or there is some issue with 
> > propagating labels for double pointers.
> > Have you perhaps come across this behavior before?
> I'm not sure what p_s is in your example. Could you provide a more complete 
> example?
> (and maybe all in one function, not half in __dfsw_strsep and half in another 
> function)
> 
> Here is an example showing taint labels at different levels of indirection:
> 
> ```
> #include 
> #include 
> #include 
> 
> int main(void) {
>   char *s = " ... ";
>   char **p_s = 
>   char ***pp_s = _s;
> 
>   dfsan_label i_label = 1 << 0;
>   dfsan_label j_label = 1 << 1;
>   dfsan_label k_label = 1 << 2;
>   dfsan_label m_label = 1 << 3;
> 
>   // string data
>   dfsan_set_label(i_label, s, strlen(s));
>   // pointer s
>   dfsan_set_label(j_label, , sizeof(s));
>   // pointer to pointer s
>   dfsan_set_label(k_label, _s, sizeof(p_s));
>   // pointer to pointer to pointer s
>   dfsan_set_label(m_label, _s, sizeof(pp_s));
> 
>   assert(pp_s[0][0] == s);
> 
>   // string data
>   assert(dfsan_read_label(s, strlen(s)) == i_label);
>   // pointer s
>   assert(dfsan_read_label(, sizeof(s)) == j_label);
>   // pointer to pointer s
>   assert(dfsan_read_label(_s, sizeof(p_s)) == k_label);
>   // pointer to pointer to pointer s
>   assert(dfsan_read_label(_s, sizeof(pp_s)) == m_label);
> 
>   return 0;
> }
> ```
Hello,

Thank you for the comment.

I should have provided a more complete example.
What I meant is a behavior I've found while working on the tests.
In the test file I have something like that:

```
char *s = "Hello world/";
char *delim = " /";
dfsan_set_label(label, , sizeof());
char *rv = strep(, delim);
 ```

If I get this right, the line 
```
dfsan_set_label(label, , sizeof());

``` 
should have applied a taint to the `s` pointer.
However, when inside `strsep`, if we check the `s_label`, it's `0`, not `label`

```
SANITIZER_INTERFACE_ATTRIBUTE char *__dfsw_strsep(char **s, const char *delim,
  dfsan_label s_label,
   

[PATCH] D141389: [DFSAN] Add support for strnlen, strncat, strsep, sscanf and _tolower

2023-02-02 Thread Andrew via Phabricator via cfe-commits
browneee added inline comments.



Comment at: compiler-rt/lib/dfsan/dfsan_custom.cpp:213
+  char *res = strsep(s, delim);
+  s_label = dfsan_read_label(base, strlen(base));
+  if (res && (res != base)) {

tkuchta wrote:
> browneee wrote:
> > tkuchta wrote:
> > > browneee wrote:
> > > > The `s_label` represents the taint label for `s` (the pointer).
> > > > 
> > > > This line would clobber the taint label of the pointer (`s`) with a 
> > > > taint label from `s[0][0..n]`.
> > > > 
> > > > I think this line should be deleted.
> > > Agree, s_label represents the taint associated with the **s pointer. 
> > > However I am now wondering if that is the taint wich we would like to 
> > > return.
> > > For example, if we have
> > > if (flags().strict_data_dependencies) {
> > > *ret_label = res ? s_label : 0;
> > > 
> > > We would taint the return value with the value of the pointer, not the 
> > > data. It means that if we operate on a string for which the characters 
> > > are tainted, but the pointer itself isn't, we are likely going to return 
> > > label 0. My understanding was that we are more concerned with the taint 
> > > of the data, not the pointer, am I missing something?
> > > 
> > Yes, we are usually more concerned with the taint of the data, not the 
> > pointer.
> > 
> > With strict dependencies:
> > // If the input pointer is tainted, the output pointer would be tainted 
> > (because it is derived from the input pointer - maybe the same value).
> > taint(s[0]) == dfsan_read_label(s, sizeof(s)) > taint(ret) == 
> > ret_label[0]
> > 
> > // If the input data is tainted, the output data would be tainted (because 
> > it is derived from the input data).
> > taint(s[0][0]) == MEM_TO_SHADOW(s[0])[0] > taint(ret[0]) == 
> > MEM_TO_SHADOW(ret)[0]
> > 
> > Because s[0] == ret  (or ret==null), (for the non-null case) the output 
> > shadow bytes are the same bytes as input shadow bytes and so these taint 
> > labels for the string data in shadow memory do not need to be explicitly 
> > propagated in this function. 
> > 
> > I think the only case actually changing/copying string data is writing a 
> > delimiter byte to NULL, which you handled.
> I am working on the changes and I came across a strange behavior that I would 
> like to ask about.
> 
> It turned out that if we do
> 
> char *s = " ... ";
> dfsan_set_label(label, _s, sizeof(_s));
> 
> Then, the s_label within the handler is 0, not "label". This is unexpected, 
> as we would like the pointer itself to be labeled here.
> I believe this has something to do with the fact that the input string in 
> strsep is a double pointer. For example this works as expected for the 
> delimiter string, which is a single pointer. 
> It's either I'm doing something incorrectly or there is some issue with 
> propagating labels for double pointers.
> Have you perhaps come across this behavior before?
I'm not sure what p_s is in your example. Could you provide a more complete 
example?
(and maybe all in one function, not half in __dfsw_strsep and half in another 
function)

Here is an example showing taint labels at different levels of indirection:

```
#include 
#include 
#include 

int main(void) {
  char *s = " ... ";
  char **p_s = 
  char ***pp_s = _s;

  dfsan_label i_label = 1 << 0;
  dfsan_label j_label = 1 << 1;
  dfsan_label k_label = 1 << 2;
  dfsan_label m_label = 1 << 3;

  // string data
  dfsan_set_label(i_label, s, strlen(s));
  // pointer s
  dfsan_set_label(j_label, , sizeof(s));
  // pointer to pointer s
  dfsan_set_label(k_label, _s, sizeof(p_s));
  // pointer to pointer to pointer s
  dfsan_set_label(m_label, _s, sizeof(pp_s));

  assert(pp_s[0][0] == s);

  // string data
  assert(dfsan_read_label(s, strlen(s)) == i_label);
  // pointer s
  assert(dfsan_read_label(, sizeof(s)) == j_label);
  // pointer to pointer s
  assert(dfsan_read_label(_s, sizeof(p_s)) == k_label);
  // pointer to pointer to pointer s
  assert(dfsan_read_label(_s, sizeof(pp_s)) == m_label);

  return 0;
}
```


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D141389/new/

https://reviews.llvm.org/D141389

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D141389: [DFSAN] Add support for strnlen, strncat, strsep, sscanf and _tolower

2023-01-30 Thread Tomasz Kuchta via Phabricator via cfe-commits
tkuchta added inline comments.



Comment at: compiler-rt/lib/dfsan/dfsan_custom.cpp:213
+  char *res = strsep(s, delim);
+  s_label = dfsan_read_label(base, strlen(base));
+  if (res && (res != base)) {

browneee wrote:
> tkuchta wrote:
> > browneee wrote:
> > > The `s_label` represents the taint label for `s` (the pointer).
> > > 
> > > This line would clobber the taint label of the pointer (`s`) with a taint 
> > > label from `s[0][0..n]`.
> > > 
> > > I think this line should be deleted.
> > Agree, s_label represents the taint associated with the **s pointer. 
> > However I am now wondering if that is the taint wich we would like to 
> > return.
> > For example, if we have
> > if (flags().strict_data_dependencies) {
> > *ret_label = res ? s_label : 0;
> > 
> > We would taint the return value with the value of the pointer, not the 
> > data. It means that if we operate on a string for which the characters are 
> > tainted, but the pointer itself isn't, we are likely going to return label 
> > 0. My understanding was that we are more concerned with the taint of the 
> > data, not the pointer, am I missing something?
> > 
> Yes, we are usually more concerned with the taint of the data, not the 
> pointer.
> 
> With strict dependencies:
> // If the input pointer is tainted, the output pointer would be tainted 
> (because it is derived from the input pointer - maybe the same value).
> taint(s[0]) == dfsan_read_label(s, sizeof(s)) > taint(ret) == ret_label[0]
> 
> // If the input data is tainted, the output data would be tainted (because it 
> is derived from the input data).
> taint(s[0][0]) == MEM_TO_SHADOW(s[0])[0] > taint(ret[0]) == 
> MEM_TO_SHADOW(ret)[0]
> 
> Because s[0] == ret  (or ret==null), (for the non-null case) the output 
> shadow bytes are the same bytes as input shadow bytes and so these taint 
> labels for the string data in shadow memory do not need to be explicitly 
> propagated in this function. 
> 
> I think the only case actually changing/copying string data is writing a 
> delimiter byte to NULL, which you handled.
I am working on the changes and I came across a strange behavior that I would 
like to ask about.

It turned out that if we do

char *s = " ... ";
dfsan_set_label(label, _s, sizeof(_s));

Then, the s_label within the handler is 0, not "label". This is unexpected, as 
we would like the pointer itself to be labeled here.
I believe this has something to do with the fact that the input string in 
strsep is a double pointer. For example this works as expected for the 
delimiter string, which is a single pointer. 
It's either I'm doing something incorrectly or there is some issue with 
propagating labels for double pointers.
Have you perhaps come across this behavior before?


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D141389/new/

https://reviews.llvm.org/D141389

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D141389: [DFSAN] Add support for strnlen, strncat, strsep, sscanf and _tolower

2023-01-17 Thread Andrew via Phabricator via cfe-commits
browneee added inline comments.



Comment at: compiler-rt/lib/dfsan/dfsan_custom.cpp:213
+  char *res = strsep(s, delim);
+  s_label = dfsan_read_label(base, strlen(base));
+  if (res && (res != base)) {

tkuchta wrote:
> browneee wrote:
> > The `s_label` represents the taint label for `s` (the pointer).
> > 
> > This line would clobber the taint label of the pointer (`s`) with a taint 
> > label from `s[0][0..n]`.
> > 
> > I think this line should be deleted.
> Agree, s_label represents the taint associated with the **s pointer. However 
> I am now wondering if that is the taint wich we would like to return.
> For example, if we have
> if (flags().strict_data_dependencies) {
> *ret_label = res ? s_label : 0;
> 
> We would taint the return value with the value of the pointer, not the data. 
> It means that if we operate on a string for which the characters are tainted, 
> but the pointer itself isn't, we are likely going to return label 0. My 
> understanding was that we are more concerned with the taint of the data, not 
> the pointer, am I missing something?
> 
Yes, we are usually more concerned with the taint of the data, not the pointer.

With strict dependencies:
// If the input pointer is tainted, the output pointer would be tainted 
(because it is derived from the input pointer - maybe the same value).
taint(s[0]) == dfsan_read_label(s, sizeof(s)) > taint(ret) == ret_label[0]

// If the input data is tainted, the output data would be tainted (because it 
is derived from the input data).
taint(s[0][0]) == MEM_TO_SHADOW(s[0])[0] > taint(ret[0]) == 
MEM_TO_SHADOW(ret)[0]

Because s[0] == ret  (or ret==null), (for the non-null case) the output shadow 
bytes are the same bytes as input shadow bytes and so these taint labels for 
the string data in shadow memory do not need to be explicitly propagated in 
this function. 

I think the only case actually changing/copying string data is writing a 
delimiter byte to NULL, which you handled.


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D141389/new/

https://reviews.llvm.org/D141389

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D141389: [DFSAN] Add support for strnlen, strncat, strsep, sscanf and _tolower

2023-01-16 Thread Tomasz Kuchta via Phabricator via cfe-commits
tkuchta added inline comments.



Comment at: compiler-rt/lib/dfsan/dfsan_custom.cpp:213
+  char *res = strsep(s, delim);
+  s_label = dfsan_read_label(base, strlen(base));
+  if (res && (res != base)) {

browneee wrote:
> The `s_label` represents the taint label for `s` (the pointer).
> 
> This line would clobber the taint label of the pointer (`s`) with a taint 
> label from `s[0][0..n]`.
> 
> I think this line should be deleted.
Agree, s_label represents the taint associated with the **s pointer. However I 
am now wondering if that is the taint wich we would like to return.
For example, if we have
if (flags().strict_data_dependencies) {
*ret_label = res ? s_label : 0;

We would taint the return value with the value of the pointer, not the data. It 
means that if we operate on a string for which the characters are tainted, but 
the pointer itself isn't, we are likely going to return label 0. My 
understanding was that we are more concerned with the taint of the data, not 
the pointer, am I missing something?




Comment at: compiler-rt/lib/dfsan/dfsan_custom.cpp:214
+  s_label = dfsan_read_label(base, strlen(base));
+  if (res && (res != base)) {
+char *token_start = res;

browneee wrote:
> I think `res && (res != base)` is never true.
> 
> Checking an implementation of strsep 
> (http://git.musl-libc.org/cgit/musl/tree/src/string/strsep.c)
> `res` can either be `NULL` or the same value as `base`.
> 
> I think this line should be `(res != *s)`. This is different because `*s` may 
> be changed by the call to `strsep` just above.
good point, my bad


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D141389/new/

https://reviews.llvm.org/D141389

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D141389: [DFSAN] Add support for strnlen, strncat, strsep, sscanf and _tolower

2023-01-11 Thread Andrew via Phabricator via cfe-commits
browneee added a comment.

Please send separate changes one by one (e.g. first patch would be for `strsep` 
alone and would include several functions of implementation `__dfsw_strsep` + 
`__dfso_strsep` + test code for this).


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D141389/new/

https://reviews.llvm.org/D141389

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D141389: [DFSAN] Add support for strnlen, strncat, strsep, sscanf and _tolower

2023-01-11 Thread Tomasz Kuchta via Phabricator via cfe-commits
tkuchta added a comment.

Hello browneee,
Thank you very much for the comments! I will take a closer look at the issue 
you pointed out.
Do you prefer me to send a couple of diffs (one per each function) at once or 
one by one?

Kind regards,


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D141389/new/

https://reviews.llvm.org/D141389

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D141389: [DFSAN] Add support for strnlen, strncat, strsep, sscanf and _tolower

2023-01-10 Thread Andrew via Phabricator via cfe-commits
browneee requested changes to this revision.
browneee added a comment.
This revision now requires changes to proceed.

Hello! Thank you for the patch.

I think your changes for `__dfsw_strsep` are conflating the taint value of a 
pointer with the taint value of the value(s) it points to.
This is subtle - I believe I made the same mistake when I first worked with the 
DFSan code :)

With this in mind, please check the other functions.

Since this is a large patch and string manipulations need careful review, 
please send separate changes for each group of functions (e.g. first patch is 
`__dfsw_strsep` + `__dfso_strsep` + tests for this code).




Comment at: compiler-rt/lib/dfsan/dfsan_custom.cpp:213
+  char *res = strsep(s, delim);
+  s_label = dfsan_read_label(base, strlen(base));
+  if (res && (res != base)) {

The `s_label` represents the taint label for `s` (the pointer).

This line would clobber the taint label of the pointer (`s`) with a taint label 
from `s[0][0..n]`.

I think this line should be deleted.



Comment at: compiler-rt/lib/dfsan/dfsan_custom.cpp:214
+  s_label = dfsan_read_label(base, strlen(base));
+  if (res && (res != base)) {
+char *token_start = res;

I think `res && (res != base)` is never true.

Checking an implementation of strsep 
(http://git.musl-libc.org/cgit/musl/tree/src/string/strsep.c)
`res` can either be `NULL` or the same value as `base`.

I think this line should be `(res != *s)`. This is different because `*s` may 
be changed by the call to `strsep` just above.



Comment at: compiler-rt/lib/dfsan/dfsan_custom.cpp:239
+  char *base = *s;
+  s_origin = dfsan_read_origin_of_first_taint(base, strlen(base) + 1);
+  char *res = __dfsw_strsep(s, delim, s_label, delim_label, ret_label);

Delete this line.

Like above, `s_origin` represents the origin value for the pointer `s`, not the 
origin value for for data in `base[0..n]`.



Comment at: compiler-rt/test/dfsan/custom.cpp:1693
+
+  dfsan_set_label(n_label, p_delim, sizeof(p_delim));
+

Also
dfsan_set_label(, _delim, sizeof(_delim));



Comment at: compiler-rt/test/dfsan/custom.cpp:1698
+#ifdef STRICT_DATA_DEPENDENCIES
+  ASSERT_ZERO_LABEL(rv);
+#else

Also
ASSERT_READ_ZERO_LABEL(rv, strlen(rv))



Comment at: compiler-rt/test/dfsan/custom.cpp:1700
+#else
+  ASSERT_LABEL(rv, n_label);
+  ASSERT_INIT_ORIGIN_EQ_ORIGIN(, *p_delim);

ASSERT_LABEL(rv, dfsan_union(some_different_label_x, n_label));



Comment at: compiler-rt/test/dfsan/custom.cpp:1704
+
+  dfsan_set_label(m_label, p_s, sizeof(p_s));
+  rv = strsep(_s, p_delim);

Also
dfsan_set_label(, _s, sizeof(_s));



Comment at: compiler-rt/test/dfsan/custom.cpp:1709
+#ifdef STRICT_DATA_DEPENDENCIES
+  ASSERT_LABEL(rv, m_label);
+  ASSERT_INIT_ORIGIN_EQ_ORIGIN(, base[6]);

Change this to

ASSERT_READ_LABEL(rv, strlen(rv), m_label);
ASSERT_LABEL(rv, some_different_label_y);



Comment at: compiler-rt/test/dfsan/custom.cpp:1712
+#else
+  ASSERT_LABEL(rv, dfsan_union(m_label, n_label));
+  ASSERT_INIT_ORIGIN_EQ_ORIGIN(, base[6]);

some_different_label_y would also be set on rv.


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D141389/new/

https://reviews.llvm.org/D141389

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D141389: [DFSAN] Add support for strnlen, strncat, strsep, sscanf and _tolower

2023-01-10 Thread Tomasz Kuchta via Phabricator via cfe-commits
tkuchta updated this revision to Diff 487936.
tkuchta added a comment.

applied newer version of clang-format


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D141389/new/

https://reviews.llvm.org/D141389

Files:
  compiler-rt/lib/dfsan/dfsan_custom.cpp
  compiler-rt/lib/dfsan/done_abilist.txt
  compiler-rt/test/dfsan/custom.cpp

Index: compiler-rt/test/dfsan/custom.cpp
===
--- compiler-rt/test/dfsan/custom.cpp
+++ compiler-rt/test/dfsan/custom.cpp
@@ -364,6 +364,47 @@
   ASSERT_LABEL(dst[11], j_label);
 }
 
+void test_strncat() {
+  char src[] = "world";
+  int volatile x = 0; // buffer to ensure src and dst do not share origins
+  (void)x;
+  char dst[] = "hello \0";
+  int volatile y = 0; // buffer to ensure dst and p do not share origins
+  (void)y;
+  char *p = dst;
+  dfsan_set_label(k_label, , sizeof(p));
+  dfsan_set_label(i_label, src, sizeof(src));
+  dfsan_set_label(j_label, dst, sizeof(dst));
+  dfsan_origin dst_o = dfsan_get_origin((long)dst[0]);
+  (void)dst_o;
+  char *ret = strncat(p, src, strlen(src));
+  ASSERT_LABEL(ret, k_label);
+  ASSERT_EQ_ORIGIN(ret, p);
+  assert(ret == dst);
+  assert(strcmp(src, dst + 6) == 0);
+  // Origins are assigned for every 4 contiguous 4-aligned bytes. After
+  // appending src to dst, origins of src can overwrite origins of dst if their
+  // application adddresses are within [start_aligned_down, end_aligned_up).
+  // Other origins are not changed.
+  char *start_aligned_down = (char *)(((size_t)(dst + 6)) & ~3UL);
+  char *end_aligned_up = (char *)(((size_t)(dst + 11 + 4)) & ~3UL);
+  for (int i = 0; i < 12; ++i) {
+if (dst + i < start_aligned_down || dst + i >= end_aligned_up) {
+  ASSERT_INIT_ORIGIN([i], dst_o);
+} else {
+  ASSERT_INIT_ORIGIN_EQ_ORIGIN([i], src[0]);
+}
+  }
+  for (int i = 0; i < 6; ++i) {
+ASSERT_LABEL(dst[i], j_label);
+  }
+  for (int i = 6; i < strlen(dst); ++i) {
+ASSERT_LABEL(dst[i], i_label);
+assert(dfsan_get_label(dst[i]) == dfsan_get_label(src[i - 6]));
+  }
+  ASSERT_LABEL(dst[11], j_label);
+}
+
 void test_strlen() {
   char str1[] = "str1";
   dfsan_set_label(i_label, [3], 1);
@@ -378,6 +419,20 @@
 #endif
 }
 
+void test_strnlen() {
+  char str1[] = "str1";
+  dfsan_set_label(i_label, [3], 1);
+
+  int rv = strnlen(str1, 4);
+  assert(rv == 4);
+#ifdef STRICT_DATA_DEPENDENCIES
+  ASSERT_ZERO_LABEL(rv);
+#else
+  ASSERT_LABEL(rv, i_label);
+  ASSERT_EQ_ORIGIN(rv, str1[3]);
+#endif
+}
+
 void test_strdup() {
   char str1[] = "str1";
   dfsan_set_label(i_label, [3], 1);
@@ -1627,6 +1682,77 @@
 #endif
 }
 
+void test_strsep() {
+  char *s = strdup("Hello world/");
+  char *delim = strdup(" /");
+
+  char *p_s = s;
+  char *base = s;
+  char *p_delim = delim;
+
+  dfsan_set_label(n_label, p_delim, sizeof(p_delim));
+
+  char *rv = strsep(_s, p_delim);
+  assert(rv == [0]);
+#ifdef STRICT_DATA_DEPENDENCIES
+  ASSERT_ZERO_LABEL(rv);
+#else
+  ASSERT_LABEL(rv, n_label);
+  ASSERT_INIT_ORIGIN_EQ_ORIGIN(, *p_delim);
+#endif
+
+  dfsan_set_label(m_label, p_s, sizeof(p_s));
+  rv = strsep(_s, p_delim);
+
+  assert(rv == [6]);
+#ifdef STRICT_DATA_DEPENDENCIES
+  ASSERT_LABEL(rv, m_label);
+  ASSERT_INIT_ORIGIN_EQ_ORIGIN(, base[6]);
+#else
+  ASSERT_LABEL(rv, dfsan_union(m_label, n_label));
+  ASSERT_INIT_ORIGIN_EQ_ORIGIN(, base[6]);
+#endif
+
+  free(s);
+  s = strdup("Hello world/");
+  base = s;
+  free(delim);
+  delim = strdup(" /");
+  p_delim = delim;
+  dfsan_set_label(i_label, [7], 1);
+  dfsan_set_label(j_label, [0], 1);
+
+  rv = strsep(, delim);
+  assert(rv == [0]);
+#ifdef STRICT_DATA_DEPENDENCIES
+  ASSERT_ZERO_LABEL(rv);
+#else
+  ASSERT_LABEL(rv, j_label);
+  ASSERT_INIT_ORIGIN_EQ_ORIGIN(, delim[1]);
+#endif
+
+  char *ps = s;
+  dfsan_set_label(dfsan_union(j_label, dfsan_read_label(ps, strlen(ps))), ps,
+  strlen(ps));
+  rv = strsep(, " /");
+  assert(rv == [6]);
+#ifdef STRICT_DATA_DEPENDENCIES
+  ASSERT_LABEL(rv, i_j_label);
+#else
+  ASSERT_LABEL(rv, i_j_label);
+  ASSERT_INIT_ORIGIN_EQ_ORIGIN(, base[6]);
+#endif
+  rv = strsep(, " /");
+  assert(strlen(rv) == 0);
+#ifdef STRICT_DATA_DEPENDENCIES
+  ASSERT_ZERO_LABEL(ps);
+#else
+  ASSERT_ZERO_LABEL(rv);
+  ASSERT_INIT_ORIGIN_EQ_ORIGIN(, 0);
+
+#endif
+}
+
 void test_memchr() {
   char str1[] = "str1";
   dfsan_set_label(i_label, [3], 1);
@@ -1968,6 +2094,138 @@
   ASSERT_LABEL(r, 0);
 }
 
+template 
+void test_sscanf_chunk(T expected, const char *format, const char *input) {
+  char padded_input[512];
+  strcpy(padded_input, "foo ");
+  strcat(padded_input, input);
+  strcat(padded_input, " bar");
+
+  char padded_format[512];
+  strcpy(padded_format, "foo ");
+  strcat(padded_format, format);
+  strcat(padded_format, " bar");
+
+  // Non labelled arg.
+  T arg;
+  memset(, 0, sizeof(arg));
+  sscanf(padded_input, padded_format, );
+  assert(arg == expected);
+  ASSERT_READ_LABEL(, sizeof(arg), 0);
+
+  // Labelled arg.
+  memset(, 0, 

[PATCH] D141389: [DFSAN] Add support for strnlen, strncat, strsep, sscanf and _tolower

2023-01-10 Thread Tomasz Kuchta via Phabricator via cfe-commits
tkuchta created this revision.
tkuchta added reviewers: vitalybuka, glider.
tkuchta added a project: Sanitizers.
Herald added a subscriber: Enna1.
Herald added a project: All.
tkuchta requested review of this revision.

This patch adds support for the following libc functions in DFSAN: strnlen, 
strncat, strsep and sscanf. It further enables _tolower via the options file.


Repository:
  rCRT Compiler Runtime

https://reviews.llvm.org/D141389

Files:
  compiler-rt/lib/dfsan/dfsan_custom.cpp
  compiler-rt/lib/dfsan/done_abilist.txt
  compiler-rt/test/dfsan/custom.cpp

Index: compiler-rt/test/dfsan/custom.cpp
===
--- compiler-rt/test/dfsan/custom.cpp
+++ compiler-rt/test/dfsan/custom.cpp
@@ -364,6 +364,47 @@
   ASSERT_LABEL(dst[11], j_label);
 }
 
+void test_strncat() {
+  char src[] = "world";
+  int volatile x = 0; // buffer to ensure src and dst do not share origins
+  (void)x;
+  char dst[] = "hello \0";
+  int volatile y = 0; // buffer to ensure dst and p do not share origins
+  (void)y;
+  char *p = dst;
+  dfsan_set_label(k_label, , sizeof(p));
+  dfsan_set_label(i_label, src, sizeof(src));
+  dfsan_set_label(j_label, dst, sizeof(dst));
+  dfsan_origin dst_o = dfsan_get_origin((long)dst[0]);
+  (void)dst_o;
+  char *ret = strncat(p, src, strlen(src));
+  ASSERT_LABEL(ret, k_label);
+  ASSERT_EQ_ORIGIN(ret, p);
+  assert(ret == dst);
+  assert(strcmp(src, dst + 6) == 0);
+  // Origins are assigned for every 4 contiguous 4-aligned bytes. After
+  // appending src to dst, origins of src can overwrite origins of dst if their
+  // application adddresses are within [start_aligned_down, end_aligned_up).
+  // Other origins are not changed.
+  char *start_aligned_down = (char *)(((size_t)(dst + 6)) & ~3UL);
+  char *end_aligned_up = (char *)(((size_t)(dst + 11 + 4)) & ~3UL);
+  for (int i = 0; i < 12; ++i) {
+if (dst + i < start_aligned_down || dst + i >= end_aligned_up) {
+  ASSERT_INIT_ORIGIN([i], dst_o);
+} else {
+  ASSERT_INIT_ORIGIN_EQ_ORIGIN([i], src[0]);
+}
+  }
+  for (int i = 0; i < 6; ++i) {
+ASSERT_LABEL(dst[i], j_label);
+  }
+  for (int i = 6; i < strlen(dst); ++i) {
+ASSERT_LABEL(dst[i], i_label);
+assert(dfsan_get_label(dst[i]) == dfsan_get_label(src[i - 6]));
+  }
+  ASSERT_LABEL(dst[11], j_label);
+}
+
 void test_strlen() {
   char str1[] = "str1";
   dfsan_set_label(i_label, [3], 1);
@@ -378,6 +419,20 @@
 #endif
 }
 
+void test_strnlen() {
+  char str1[] = "str1";
+  dfsan_set_label(i_label, [3], 1);
+
+  int rv = strnlen(str1, 4);
+  assert(rv == 4);
+#ifdef STRICT_DATA_DEPENDENCIES
+  ASSERT_ZERO_LABEL(rv);
+#else
+  ASSERT_LABEL(rv, i_label);
+  ASSERT_EQ_ORIGIN(rv, str1[3]);
+#endif
+}
+
 void test_strdup() {
   char str1[] = "str1";
   dfsan_set_label(i_label, [3], 1);
@@ -1627,6 +1682,77 @@
 #endif
 }
 
+void test_strsep() {
+  char *s = strdup("Hello world/");
+  char *delim = strdup(" /");
+
+  char *p_s = s;
+  char *base = s;
+  char *p_delim = delim;
+
+  dfsan_set_label(n_label, p_delim, sizeof(p_delim));
+
+  char *rv = strsep(_s, p_delim);
+  assert(rv == [0]);
+#ifdef STRICT_DATA_DEPENDENCIES
+  ASSERT_ZERO_LABEL(rv);
+#else
+  ASSERT_LABEL(rv, n_label);
+  ASSERT_INIT_ORIGIN_EQ_ORIGIN(, *p_delim);
+#endif
+
+  dfsan_set_label(m_label, p_s, sizeof(p_s));
+  rv = strsep(_s, p_delim);
+
+  assert(rv == [6]);
+#ifdef STRICT_DATA_DEPENDENCIES
+  ASSERT_LABEL(rv, m_label);
+  ASSERT_INIT_ORIGIN_EQ_ORIGIN(, base[6]);
+#else
+  ASSERT_LABEL(rv, dfsan_union(m_label, n_label));
+  ASSERT_INIT_ORIGIN_EQ_ORIGIN(, base[6]);
+#endif
+
+  free(s);
+  s = strdup("Hello world/");
+  base = s;
+  free(delim);
+  delim = strdup(" /");
+  p_delim = delim;
+  dfsan_set_label(i_label, [7], 1);
+  dfsan_set_label(j_label, [0], 1);
+
+  rv = strsep(, delim);
+  assert(rv == [0]);
+#ifdef STRICT_DATA_DEPENDENCIES
+  ASSERT_ZERO_LABEL(rv);
+#else
+  ASSERT_LABEL(rv, j_label);
+  ASSERT_INIT_ORIGIN_EQ_ORIGIN(, delim[1]);
+#endif
+
+  char *ps = s;
+  dfsan_set_label(dfsan_union(j_label, dfsan_read_label(ps, strlen(ps))), ps,
+  strlen(ps));
+  rv = strsep(, " /");
+  assert(rv == [6]);
+#ifdef STRICT_DATA_DEPENDENCIES
+  ASSERT_LABEL(rv, i_j_label);
+#else
+  ASSERT_LABEL(rv, i_j_label);
+  ASSERT_INIT_ORIGIN_EQ_ORIGIN(, base[6]);
+#endif
+  rv = strsep(, " /");
+  assert(strlen(rv) == 0);
+#ifdef STRICT_DATA_DEPENDENCIES
+  ASSERT_ZERO_LABEL(ps);
+#else
+  ASSERT_ZERO_LABEL(rv);
+  ASSERT_INIT_ORIGIN_EQ_ORIGIN(, 0);
+
+#endif
+}
+
 void test_memchr() {
   char str1[] = "str1";
   dfsan_set_label(i_label, [3], 1);
@@ -1968,6 +2094,138 @@
   ASSERT_LABEL(r, 0);
 }
 
+template 
+void test_sscanf_chunk(T expected, const char *format, const char *input) {
+  char padded_input[512];
+  strcpy(padded_input, "foo ");
+  strcat(padded_input, input);
+  strcat(padded_input, " bar");
+
+  char padded_format[512];
+  strcpy(padded_format, "foo ");
+  strcat(padded_format, format);
+