nfsantos commented on code in PR #1247:
URL: https://github.com/apache/jackrabbit-oak/pull/1247#discussion_r1432879428


##########
oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/StatsBuilder.java:
##########
@@ -0,0 +1,129 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis;
+
+import java.io.IOException;
+
+import org.apache.jackrabbit.oak.commons.Profiler;
+import 
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.modules.BinarySize;
+import 
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.modules.BinarySizeEmbedded;
+import 
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.modules.BinarySizeHistogram;
+import 
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.modules.DistinctBinarySizeHistogram;
+import 
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.modules.ListCollector;
+import 
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.modules.NodeCount;
+import 
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.modules.NodeTypeCount;
+import 
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.modules.NodeNameFilter;
+import 
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.modules.PropertyStats;
+import 
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.modules.StatsCollector;
+import 
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.modules.TopLargestBinaries;
+import 
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.stream.NodeData;
+import 
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.stream.NodeDataReader;
+import 
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.stream.NodeLineReader;
+
+/**
+ * Builder for commonly used statistics for flat file stores.
+ */
+public class StatsBuilder {
+
+    private static final boolean ONLY_READ = false;
+
+    /**
+     * Read a flat file store and build statistics.
+     *
+     * @param args the file name
+     */
+    public static void main(String... args) throws Exception {
+
+        String fileName = args[0];

Review Comment:
   Add a check for the number of arguments and print a message if no args are 
provided.



##########
oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/stream/NodeLineReader.java:
##########
@@ -0,0 +1,220 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package 
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.stream;
+
+import java.io.BufferedInputStream;
+import java.io.Closeable;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.LineNumberReader;
+import java.nio.charset.StandardCharsets;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map.Entry;
+
+import javax.jcr.PropertyType;
+
+import org.apache.jackrabbit.oak.commons.PathUtils;
+import org.apache.jackrabbit.oak.commons.json.JsonObject;
+import org.apache.jackrabbit.oak.commons.json.JsopReader;
+import org.apache.jackrabbit.oak.commons.json.JsopTokenizer;
+import 
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.stream.NodeProperty.PropertyValue;
+import 
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.stream.NodeProperty.ValueType;
+
+import net.jpountz.lz4.LZ4FrameInputStream;
+
+/**
+ * A reader for flat file stores.
+ */
+public class NodeLineReader implements NodeDataReader, Closeable {
+
+    private final LineNumberReader reader;
+    private final long fileSize;
+    private long lineCount;
+
+    private NodeLineReader(LineNumberReader reader, long fileSize) {
+        this.reader = reader;
+        this.fileSize = fileSize;
+    }
+
+    public static NodeLineReader open(String fileName) throws IOException {
+        long fileSize = new File(fileName).length();
+        InputStream fileIn = new BufferedInputStream(new 
FileInputStream(fileName));
+        try {
+            InputStream in;
+            if (fileName.endsWith(".lz4")) {
+                in = new LZ4FrameInputStream(fileIn);
+            } else {
+                in = fileIn;
+            }
+            LineNumberReader reader = new LineNumberReader(new 
InputStreamReader(in, StandardCharsets.UTF_8));
+            return new NodeLineReader(reader, fileSize);
+        } catch (IOException e) {
+            fileIn.close();
+            throw e;
+        }
+    }
+
+    public NodeData readNode() throws IOException {
+        String line = reader.readLine();
+        if (line == null) {
+            close();
+            return null;
+        }
+        if (++lineCount % 1000000 == 0) {
+            System.out.println(lineCount + " lines");

Review Comment:
   Could this also print a progress report on the number of bytes read and/or 
the percentage of the file processed so far?



##########
oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/StatsBuilder.java:
##########
@@ -0,0 +1,129 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis;
+
+import java.io.IOException;
+
+import org.apache.jackrabbit.oak.commons.Profiler;
+import 
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.modules.BinarySize;
+import 
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.modules.BinarySizeEmbedded;
+import 
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.modules.BinarySizeHistogram;
+import 
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.modules.DistinctBinarySizeHistogram;
+import 
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.modules.ListCollector;
+import 
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.modules.NodeCount;
+import 
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.modules.NodeTypeCount;
+import 
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.modules.NodeNameFilter;
+import 
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.modules.PropertyStats;
+import 
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.modules.StatsCollector;
+import 
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.modules.TopLargestBinaries;
+import 
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.stream.NodeData;
+import 
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.stream.NodeDataReader;
+import 
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.stream.NodeLineReader;
+
+/**
+ * Builder for commonly used statistics for flat file stores.
+ */
+public class StatsBuilder {
+
+    private static final boolean ONLY_READ = false;
+
+    /**
+     * Read a flat file store and build statistics.
+     *
+     * @param args the file name
+     */
+    public static void main(String... args) throws Exception {
+
+        String fileName = args[0];
+        String filter = null;
+        if (args.length > 1) {
+            filter = args[1];
+        }

Review Comment:
   Are you planning on creating a command-line tool somewhere to use this, or 
on integrating it into oak-run? It would be nice to have an easy-to-use entry 
point, with documentation and embedded help (`--help`).



##########
oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/StatsBuilder.java:
##########
@@ -0,0 +1,129 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis;
+
+import java.io.IOException;
+
+import org.apache.jackrabbit.oak.commons.Profiler;
+import 
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.modules.BinarySize;
+import 
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.modules.BinarySizeEmbedded;
+import 
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.modules.BinarySizeHistogram;
+import 
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.modules.DistinctBinarySizeHistogram;
+import 
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.modules.ListCollector;
+import 
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.modules.NodeCount;
+import 
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.modules.NodeTypeCount;
+import 
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.modules.NodeNameFilter;
+import 
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.modules.PropertyStats;
+import 
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.modules.StatsCollector;
+import 
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.modules.TopLargestBinaries;
+import 
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.stream.NodeData;
+import 
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.stream.NodeDataReader;
+import 
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.stream.NodeLineReader;
+
+/**
+ * Builder for commonly used statistics for flat file stores.
+ */
+public class StatsBuilder {
+
+    private static final boolean ONLY_READ = false;
+
+    /**
+     * Read a flat file store and build statistics.
+     *
+     * @param args the file name
+     */
+    public static void main(String... args) throws Exception {
+
+        String fileName = args[0];
+        String filter = null;
+        if (args.length > 1) {
+            filter = args[1];
+        }
+
+        ListCollector collectors = new ListCollector();
+        collectors.add(new NodeCount(1000, 1));
+        collectors.add(new BinarySize(100_000_000, 1));
+        collectors.add(new BinarySizeEmbedded(100_000, 1));
+        PropertyStats ps = new PropertyStats(true, 1);
+        collectors.add(ps);
+        collectors.add(new NodeTypeCount());
+        collectors.add(new BinarySizeHistogram(1));
+        collectors.add(new DistinctBinarySizeHistogram(1));
+        collectors.add(new TopLargestBinaries(10));
+        if (filter != null) {
+            collectors.add(new NodeNameFilter(filter, new 
BinarySize(100_000_000, 1)));
+            collectors.add(new NodeNameFilter(filter, new 
BinarySizeEmbedded(100_000, 1)));
+            collectors.add(new NodeNameFilter(filter, new 
BinarySizeHistogram(1)));
+            collectors.add(new NodeNameFilter(filter, new 
TopLargestBinaries(10)));
+        }
+
+        Profiler prof = new Profiler().startCollecting();
+
+        NodeLineReader reader = NodeLineReader.open(fileName);

Review Comment:
   As lines are processed individually, maybe this can be parallelized? I think 
the bottleneck will be CPU, not I/O, so 2 threads should boost speed 
significantly. Or we make the logic so efficient that it becomes I/O-bound 
even with a single thread.



##########
oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/BinarySizeEmbedded.java:
##########
@@ -0,0 +1,108 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package 
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.modules;
+
+import java.util.Map.Entry;
+import java.util.stream.Collectors;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Random;
+
+import 
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.stream.NodeData;
+import 
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.stream.NodeProperty;
+import 
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.stream.NodeProperty.ValueType;
+
+/**
+ * Collects the total binary size (embedded binaries) per path.
+ */
+public class BinarySizeEmbedded implements StatsCollector {
+
+    private final Storage storage = new Storage();
+    private final int resolution;
+    private final Random random;
+
+    public BinarySizeEmbedded(int resolution, long seed) {
+        this.resolution = resolution;
+        this.random = new Random(seed); //NOSONAR
+    }
+
+    public void add(NodeData node) {
+        long size = 0;
+        for(NodeProperty p : node.getProperties()) {
+            if (p.getType() == ValueType.BINARY) {
+                for (String v : p.getValues()) {
+                    if (!v.startsWith(":blobId:")) {
+                        continue;
+                    }
+                    v = v.substring(":blobId:".length());
+                    if (v.startsWith("0x")) {
+                        // embedded
+                        size = (v.length() - 2) / 2;

Review Comment:
   I found a blob with the following:
   ```blobId:0x89504e470d0a1....e426082#8658```
   So it's embedded and it has the size at the end. The formula above will 
produce the wrong result because it counts the `#8658` size suffix as part of 
the hex-encoded data instead of excluding it.



##########
oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/BinarySize.java:
##########
@@ -0,0 +1,111 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package 
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.modules;
+
+import java.util.Map.Entry;
+import java.util.stream.Collectors;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Random;
+
+import 
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.stream.NodeData;
+import 
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.stream.NodeProperty;
+import 
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.stream.NodeProperty.ValueType;
+
+/**
+ * Collects the total binary size (references to the datastore) per path.
+ */
+public class BinarySize implements StatsCollector {
+
+    private final Storage storage = new Storage();
+    private final int resolution;
+    private final Random random;
+
+    public BinarySize(int resolution, long seed) {
+        this.resolution = resolution;
+        this.random = new Random(seed); //NOSONAR
+    }
+
+    public void add(NodeData node) {
+        long size = 0;
+        for(NodeProperty p : node.getProperties()) {
+            if (p.getType() == ValueType.BINARY) {
+                for (String v : p.getValues()) {
+                    if (!v.startsWith(":blobId:")) {
+                        continue;
+                    }
+                    v = v.substring(":blobId:".length());
+                    if (v.startsWith("0x")) {
+                        // embedded
+                    } else {
+                        // reference
+                        int hashIndex = v.indexOf('#');

Review Comment:
   `lastIndexOf()` should be a bit faster. 



##########
oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/BinarySize.java:
##########
@@ -0,0 +1,111 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package 
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.modules;
+
+import java.util.Map.Entry;
+import java.util.stream.Collectors;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Random;
+
+import 
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.stream.NodeData;
+import 
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.stream.NodeProperty;
+import 
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.stream.NodeProperty.ValueType;
+
+/**
+ * Collects the total binary size (references to the datastore) per path.
+ */
+public class BinarySize implements StatsCollector {
+
+    private final Storage storage = new Storage();
+    private final int resolution;
+    private final Random random;
+
+    public BinarySize(int resolution, long seed) {
+        this.resolution = resolution;
+        this.random = new Random(seed); //NOSONAR
+    }
+
+    public void add(NodeData node) {
+        long size = 0;
+        for(NodeProperty p : node.getProperties()) {
+            if (p.getType() == ValueType.BINARY) {
+                for (String v : p.getValues()) {
+                    if (!v.startsWith(":blobId:")) {
+                        continue;
+                    }
+                    v = v.substring(":blobId:".length());
+                    if (v.startsWith("0x")) {
+                        // embedded

Review Comment:
   I have seen in some FFS that the embedded binaries also have the size at the 
end. But I don't know if this is always the case.



##########
oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/StatsBuilder.java:
##########
@@ -0,0 +1,129 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis;
+
+import java.io.IOException;
+
+import org.apache.jackrabbit.oak.commons.Profiler;
+import 
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.modules.BinarySize;
+import 
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.modules.BinarySizeEmbedded;
+import 
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.modules.BinarySizeHistogram;
+import 
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.modules.DistinctBinarySizeHistogram;
+import 
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.modules.ListCollector;
+import 
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.modules.NodeCount;
+import 
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.modules.NodeTypeCount;
+import 
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.modules.NodeNameFilter;
+import 
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.modules.PropertyStats;
+import 
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.modules.StatsCollector;
+import 
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.modules.TopLargestBinaries;
+import 
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.stream.NodeData;
+import 
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.stream.NodeDataReader;
+import 
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.stream.NodeLineReader;
+
+/**
+ * Builder for commonly used statistics for flat file stores.
+ */
+public class StatsBuilder {
+
+    private static final boolean ONLY_READ = false;
+
+    /**
+     * Read a flat file store and build statistics.
+     *
+     * @param args the file name
+     */
+    public static void main(String... args) throws Exception {
+
+        String fileName = args[0];
+        String filter = null;
+        if (args.length > 1) {
+            filter = args[1];
+        }
+
+        ListCollector collectors = new ListCollector();
+        collectors.add(new NodeCount(1000, 1));
+        collectors.add(new BinarySize(100_000_000, 1));
+        collectors.add(new BinarySizeEmbedded(100_000, 1));
+        PropertyStats ps = new PropertyStats(true, 1);
+        collectors.add(ps);
+        collectors.add(new NodeTypeCount());
+        collectors.add(new BinarySizeHistogram(1));
+        collectors.add(new DistinctBinarySizeHistogram(1));
+        collectors.add(new TopLargestBinaries(10));
+        if (filter != null) {
+            collectors.add(new NodeNameFilter(filter, new 
BinarySize(100_000_000, 1)));
+            collectors.add(new NodeNameFilter(filter, new 
BinarySizeEmbedded(100_000, 1)));
+            collectors.add(new NodeNameFilter(filter, new 
BinarySizeHistogram(1)));
+            collectors.add(new NodeNameFilter(filter, new 
TopLargestBinaries(10)));
+        }
+
+        Profiler prof = new Profiler().startCollecting();
+

Review Comment:
   Print message saying which file is going to be processed.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: dev-unsubscr...@jackrabbit.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org

Reply via email to