nfsantos commented on code in PR #1247: URL: https://github.com/apache/jackrabbit-oak/pull/1247#discussion_r1432879428
########## oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/StatsBuilder.java: ########## @@ -0,0 +1,129 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis; + +import java.io.IOException; + +import org.apache.jackrabbit.oak.commons.Profiler; +import org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.modules.BinarySize; +import org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.modules.BinarySizeEmbedded; +import org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.modules.BinarySizeHistogram; +import org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.modules.DistinctBinarySizeHistogram; +import org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.modules.ListCollector; +import org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.modules.NodeCount; +import org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.modules.NodeTypeCount; +import org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.modules.NodeNameFilter; +import 
org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.modules.PropertyStats; +import org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.modules.StatsCollector; +import org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.modules.TopLargestBinaries; +import org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.stream.NodeData; +import org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.stream.NodeDataReader; +import org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.stream.NodeLineReader; + +/** + * Builder for commonly used statistics for flat file stores. + */ +public class StatsBuilder { + + private static final boolean ONLY_READ = false; + + /** + * Read a flat file store and build statistics. + * + * @param args the file name + */ + public static void main(String... args) throws Exception { + + String fileName = args[0]; Review Comment: Add check for number of arguments and print a message if no args are provided. ########## oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/stream/NodeLineReader.java: ########## @@ -0,0 +1,220 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.stream; + +import java.io.BufferedInputStream; +import java.io.Closeable; +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.io.LineNumberReader; +import java.nio.charset.StandardCharsets; +import java.util.ArrayList; +import java.util.List; +import java.util.Map.Entry; + +import javax.jcr.PropertyType; + +import org.apache.jackrabbit.oak.commons.PathUtils; +import org.apache.jackrabbit.oak.commons.json.JsonObject; +import org.apache.jackrabbit.oak.commons.json.JsopReader; +import org.apache.jackrabbit.oak.commons.json.JsopTokenizer; +import org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.stream.NodeProperty.PropertyValue; +import org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.stream.NodeProperty.ValueType; + +import net.jpountz.lz4.LZ4FrameInputStream; + +/** + * A reader for flat file stores. 
+ */ +public class NodeLineReader implements NodeDataReader, Closeable { + + private final LineNumberReader reader; + private final long fileSize; + private long lineCount; + + private NodeLineReader(LineNumberReader reader, long fileSize) { + this.reader = reader; + this.fileSize = fileSize; + } + + public static NodeLineReader open(String fileName) throws IOException { + long fileSize = new File(fileName).length(); + InputStream fileIn = new BufferedInputStream(new FileInputStream(fileName)); + try { + InputStream in; + if (fileName.endsWith(".lz4")) { + in = new LZ4FrameInputStream(fileIn); + } else { + in = fileIn; + } + LineNumberReader reader = new LineNumberReader(new InputStreamReader(in, StandardCharsets.UTF_8)); + return new NodeLineReader(reader, fileSize); + } catch (IOException e) { + fileIn.close(); + throw e; + } + } + + public NodeData readNode() throws IOException { + String line = reader.readLine(); + if (line == null) { + close(); + return null; + } + if (++lineCount % 1000000 == 0) { + System.out.println(lineCount + " lines"); Review Comment: Could this also print a progress report on the number of bytes read and/or the percentage of the file processed so far? ########## oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/StatsBuilder.java: ########## @@ -0,0 +1,129 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis; + +import java.io.IOException; + +import org.apache.jackrabbit.oak.commons.Profiler; +import org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.modules.BinarySize; +import org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.modules.BinarySizeEmbedded; +import org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.modules.BinarySizeHistogram; +import org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.modules.DistinctBinarySizeHistogram; +import org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.modules.ListCollector; +import org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.modules.NodeCount; +import org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.modules.NodeTypeCount; +import org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.modules.NodeNameFilter; +import org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.modules.PropertyStats; +import org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.modules.StatsCollector; +import org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.modules.TopLargestBinaries; +import org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.stream.NodeData; +import org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.stream.NodeDataReader; +import org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.stream.NodeLineReader; + +/** + * 
Builder for commonly used statistics for flat file stores. + */ +public class StatsBuilder { + + private static final boolean ONLY_READ = false; + + /** + * Read a flat file store and build statistics. + * + * @param args the file name + */ + public static void main(String... args) throws Exception { + + String fileName = args[0]; + String filter = null; + if (args.length > 1) { + filter = args[1]; + } Review Comment: Are you planning on creating a command-line tool somewhere to use this? Or integrate in oak-run? It would be nice to have an easy-to-use entry point, with documentation and embedded help (`--help`). ########## oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/StatsBuilder.java: ########## @@ -0,0 +1,129 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis; + +import java.io.IOException; + +import org.apache.jackrabbit.oak.commons.Profiler; +import org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.modules.BinarySize; +import org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.modules.BinarySizeEmbedded; +import org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.modules.BinarySizeHistogram; +import org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.modules.DistinctBinarySizeHistogram; +import org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.modules.ListCollector; +import org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.modules.NodeCount; +import org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.modules.NodeTypeCount; +import org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.modules.NodeNameFilter; +import org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.modules.PropertyStats; +import org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.modules.StatsCollector; +import org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.modules.TopLargestBinaries; +import org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.stream.NodeData; +import org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.stream.NodeDataReader; +import org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.stream.NodeLineReader; + +/** + * Builder for commonly used statistics for flat file stores. + */ +public class StatsBuilder { + + private static final boolean ONLY_READ = false; + + /** + * Read a flat file store and build statistics. + * + * @param args the file name + */ + public static void main(String... 
args) throws Exception { + + String fileName = args[0]; + String filter = null; + if (args.length > 1) { + filter = args[1]; + } + + ListCollector collectors = new ListCollector(); + collectors.add(new NodeCount(1000, 1)); + collectors.add(new BinarySize(100_000_000, 1)); + collectors.add(new BinarySizeEmbedded(100_000, 1)); + PropertyStats ps = new PropertyStats(true, 1); + collectors.add(ps); + collectors.add(new NodeTypeCount()); + collectors.add(new BinarySizeHistogram(1)); + collectors.add(new DistinctBinarySizeHistogram(1)); + collectors.add(new TopLargestBinaries(10)); + if (filter != null) { + collectors.add(new NodeNameFilter(filter, new BinarySize(100_000_000, 1))); + collectors.add(new NodeNameFilter(filter, new BinarySizeEmbedded(100_000, 1))); + collectors.add(new NodeNameFilter(filter, new BinarySizeHistogram(1))); + collectors.add(new NodeNameFilter(filter, new TopLargestBinaries(10))); + } + + Profiler prof = new Profiler().startCollecting(); + + NodeLineReader reader = NodeLineReader.open(fileName); Review Comment: As lines are processed individually, maybe this can be parallelized? I think the bottleneck will be CPU, not I/O, so 2 threads should boost speed significantly. Or we make the logic so efficient that it becomes I/O-bound even with a single thread. ########## oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/BinarySizeEmbedded.java: ########## @@ -0,0 +1,108 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.modules; + +import java.util.Map.Entry; +import java.util.stream.Collectors; +import java.util.ArrayList; +import java.util.List; +import java.util.Random; + +import org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.stream.NodeData; +import org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.stream.NodeProperty; +import org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.stream.NodeProperty.ValueType; + +/** + * Collects the total binary size (embedded binaries) per path. + */ +public class BinarySizeEmbedded implements StatsCollector { + + private final Storage storage = new Storage(); + private final int resolution; + private final Random random; + + public BinarySizeEmbedded(int resolution, long seed) { + this.resolution = resolution; + this.random = new Random(seed); //NOSONAR + } + + public void add(NodeData node) { + long size = 0; + for(NodeProperty p : node.getProperties()) { + if (p.getType() == ValueType.BINARY) { + for (String v : p.getValues()) { + if (!v.startsWith(":blobId:")) { + continue; + } + v = v.substring(":blobId:".length()); + if (v.startsWith("0x")) { + // embedded + size = (v.length() - 2) / 2; Review Comment: I found a blob with the following: ```blobId:0x89504e470d0a1....e426082#8658``` So it's embedded and it has the size at the end. The formula above will reach the wrong result because it is not subtracting the size suffix. 
########## oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/BinarySize.java: ########## @@ -0,0 +1,111 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.modules; + +import java.util.Map.Entry; +import java.util.stream.Collectors; +import java.util.ArrayList; +import java.util.List; +import java.util.Random; + +import org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.stream.NodeData; +import org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.stream.NodeProperty; +import org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.stream.NodeProperty.ValueType; + +/** + * Collects the total binary size (references to the datastore) per path. 
+ */ +public class BinarySize implements StatsCollector { + + private final Storage storage = new Storage(); + private final int resolution; + private final Random random; + + public BinarySize(int resolution, long seed) { + this.resolution = resolution; + this.random = new Random(seed); //NOSONAR + } + + public void add(NodeData node) { + long size = 0; + for(NodeProperty p : node.getProperties()) { + if (p.getType() == ValueType.BINARY) { + for (String v : p.getValues()) { + if (!v.startsWith(":blobId:")) { + continue; + } + v = v.substring(":blobId:".length()); + if (v.startsWith("0x")) { + // embedded + } else { + // reference + int hashIndex = v.indexOf('#'); Review Comment: `lastIndexOf()` should be a bit faster. ########## oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/modules/BinarySize.java: ########## @@ -0,0 +1,111 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.modules; + +import java.util.Map.Entry; +import java.util.stream.Collectors; +import java.util.ArrayList; +import java.util.List; +import java.util.Random; + +import org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.stream.NodeData; +import org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.stream.NodeProperty; +import org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.stream.NodeProperty.ValueType; + +/** + * Collects the total binary size (references to the datastore) per path. + */ +public class BinarySize implements StatsCollector { + + private final Storage storage = new Storage(); + private final int resolution; + private final Random random; + + public BinarySize(int resolution, long seed) { + this.resolution = resolution; + this.random = new Random(seed); //NOSONAR + } + + public void add(NodeData node) { + long size = 0; + for(NodeProperty p : node.getProperties()) { + if (p.getType() == ValueType.BINARY) { + for (String v : p.getValues()) { + if (!v.startsWith(":blobId:")) { + continue; + } + v = v.substring(":blobId:".length()); + if (v.startsWith("0x")) { + // embedded Review Comment: I have seen in some FFS that the embedded binaries also have the size at the end. But I don't know if this is always the case. ########## oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/analysis/StatsBuilder.java: ########## @@ -0,0 +1,129 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis; + +import java.io.IOException; + +import org.apache.jackrabbit.oak.commons.Profiler; +import org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.modules.BinarySize; +import org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.modules.BinarySizeEmbedded; +import org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.modules.BinarySizeHistogram; +import org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.modules.DistinctBinarySizeHistogram; +import org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.modules.ListCollector; +import org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.modules.NodeCount; +import org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.modules.NodeTypeCount; +import org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.modules.NodeNameFilter; +import org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.modules.PropertyStats; +import org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.modules.StatsCollector; +import org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.modules.TopLargestBinaries; +import org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.stream.NodeData; +import org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.stream.NodeDataReader; +import org.apache.jackrabbit.oak.index.indexer.document.flatfile.analysis.stream.NodeLineReader; + +/** + * 
Builder for commonly used statistics for flat file stores. + */ +public class StatsBuilder { + + private static final boolean ONLY_READ = false; + + /** + * Read a flat file store and build statistics. + * + * @param args the file name + */ + public static void main(String... args) throws Exception { + + String fileName = args[0]; + String filter = null; + if (args.length > 1) { + filter = args[1]; + } + + ListCollector collectors = new ListCollector(); + collectors.add(new NodeCount(1000, 1)); + collectors.add(new BinarySize(100_000_000, 1)); + collectors.add(new BinarySizeEmbedded(100_000, 1)); + PropertyStats ps = new PropertyStats(true, 1); + collectors.add(ps); + collectors.add(new NodeTypeCount()); + collectors.add(new BinarySizeHistogram(1)); + collectors.add(new DistinctBinarySizeHistogram(1)); + collectors.add(new TopLargestBinaries(10)); + if (filter != null) { + collectors.add(new NodeNameFilter(filter, new BinarySize(100_000_000, 1))); + collectors.add(new NodeNameFilter(filter, new BinarySizeEmbedded(100_000, 1))); + collectors.add(new NodeNameFilter(filter, new BinarySizeHistogram(1))); + collectors.add(new NodeNameFilter(filter, new TopLargestBinaries(10))); + } + + Profiler prof = new Profiler().startCollecting(); + Review Comment: Print message saying which file is going to be processed. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
