pitrou commented on code in PR #47034: URL: https://github.com/apache/arrow/pull/47034#discussion_r2197171556
########## dev/archery/archery/integration/datagen.py: ########## @@ -202,25 +202,32 @@ def __init__(self, name, bit_width, *, metadata=None): super().__init__(name, is_signed=True, bit_width=bit_width, nullable=False, metadata=metadata, min_value=1) - def generate_range(self, size, lower, upper, name=None, - include_extremes=False): + def generate_range(self, size, lower, upper, name=None, include_extremes=False): + if size > (1 << self.bit_width) - 1: + raise ValueError( + f"Size {size} exceeds the maximum for bit width {self.bit_width}." + ) rng = np.random.default_rng() - # generate values that are strictly increasing with a min-value of - # 1, but don't go higher than the max signed value for the given - # bit width. We sort the values to ensure they are strictly increasing - # and set replace to False to avoid duplicates, ensuring a valid - # run-ends array. - values = rng.choice(2 ** (self.bit_width - 1) - 1, size=size, replace=False) - values += 1 - values = sorted(values) + if size == 0: + values = [] + runs_count = 0 + elif size == 1: + values = [size] + runs_count = 1 + else: + runs_count = int(size / 2) + values = rng.choice(range(1, size), size=runs_count - 1, replace=False) + values = sorted(values) + values.append(size) + values = list(map(int if self.bit_width < 64 else str, values)) # RunEnds cannot be null, as such self.nullable == False and this # will generate a validity map of all ones. - is_valid = self._make_is_valid(size) + is_valid = self._make_is_valid(runs_count) if name is None: name = self.name - return PrimitiveColumn(name, size, is_valid, values) + return PrimitiveColumn(name, int(runs_count), is_valid, values) Review Comment: This seems ok on the face of it, but needs an adaptation on the C++ side if we want the decoded REE array to have a `length` equal to `runs_count`. 
The alternative would be to pass `runs_count` here, which might also work better for Go and nanoarrow (the two other Arrow implementations that support REE currently). @paleolimbot @zeroshade What do you think? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org