This is an automated email from the ASF dual-hosted git repository. hui pushed a commit to branch research/encoding-reorder in repository https://gitbox.apache.org/repos/asf/iotdb.git
commit f3c12667bed347c19057e687b5ebfb93f2356368 Author: xjz17 <[email protected]> AuthorDate: Tue Nov 14 23:43:04 2023 +0800 update --- .../apache/iotdb/tsfile/encoding/EncodeTest.java | 68 +++--- .../tsfile/encoding/KernelDensityEstimation.java | 240 ++++++++++++--------- .../tsfile/encoding/REGERCompress1ArrayTest.java | 2 +- 3 files changed, 172 insertions(+), 138 deletions(-) diff --git a/iotdb-core/tsfile/src/test/java/org/apache/iotdb/tsfile/encoding/EncodeTest.java b/iotdb-core/tsfile/src/test/java/org/apache/iotdb/tsfile/encoding/EncodeTest.java index 8a9b1588729..ba564045dd9 100644 --- a/iotdb-core/tsfile/src/test/java/org/apache/iotdb/tsfile/encoding/EncodeTest.java +++ b/iotdb-core/tsfile/src/test/java/org/apache/iotdb/tsfile/encoding/EncodeTest.java @@ -30,9 +30,13 @@ public class EncodeTest { // "C:\\Users\\xiaoj\\Documents\\GitHub\\encoding-reorder\\vldb\\compression_ratio\\sota_ratio"; // // String parent_dir = "C:\\Users\\Jinnsjao Shawl\\Documents\\GitHub\\encoding-reorder\\"; - String parent_dir = "E:\\vldb-reorder\\encoding-reorder\\"; - String output_parent_dir = parent_dir + "vldb\\compression_ratio\\sota_ratio"; - String input_parent_dir = parent_dir + "reorder\\iotdb_test_small\\"; +// String parent_dir = "E:\\vldb-reorder\\encoding-reorder\\"; +// String output_parent_dir = parent_dir + "vldb\\compression_ratio\\sota_ratio"; + + String parent_dir = "/Users/xiaojinzhao/Documents/GitHub/iotdb/iotdb-core/tsfile/src/test/resources/"; + String output_parent_dir = "/Users/xiaojinzhao/Documents/GitHub/encoding-reorder/compression_ratio/reger_remove_value"; + String input_parent_dir = parent_dir + "trans_data/"; + ArrayList<String> input_path_list = new ArrayList<>(); ArrayList<String> output_path_list = new ArrayList<>(); ArrayList<String> dataset_name = new ArrayList<>(); @@ -55,41 +59,29 @@ public class EncodeTest { input_path_list.add(input_parent_dir + dataset_name.get(i)); } - output_path_list.add(output_parent_dir + "\\CS-Sensors_ratio.csv"); // 0 - dataset_block_size.add(1024); - // dataset_k.add(5); - output_path_list.add(output_parent_dir + "\\Metro-Traffic_ratio.csv"); // 1 - dataset_block_size.add(512); - // dataset_k.add(7); - output_path_list.add(output_parent_dir + "\\USGS-Earthquakes_ratio.csv"); // 2 - dataset_block_size.add(512); - // dataset_k.add(7); - output_path_list.add(output_parent_dir + "\\YZ-Electricity_ratio.csv"); // 3 - dataset_block_size.add(512); - // dataset_k.add(1); - output_path_list.add(output_parent_dir + "\\GW-Magnetic_ratio.csv"); // 4 - dataset_block_size.add(128); - // dataset_k.add(6); - output_path_list.add(output_parent_dir + "\\TY-Fuel_ratio.csv"); // 5 - dataset_block_size.add(64); - // dataset_k.add(5); - output_path_list.add(output_parent_dir + "\\Cyber-Vehicle_ratio.csv"); // 6 - dataset_block_size.add(128); - // dataset_k.add(4); - output_path_list.add(output_parent_dir + "\\Vehicle-Charge_ratio.csv"); // 7 - dataset_block_size.add(512); - // dataset_k.add(8); - output_path_list.add(output_parent_dir + "\\Nifty-Stocks_ratio.csv"); // 8 - dataset_block_size.add(256); - // dataset_k.add(1); - output_path_list.add(output_parent_dir + "\\TH-Climate_ratio.csv"); // 9 - dataset_block_size.add(512); - // dataset_k.add(2); - output_path_list.add(output_parent_dir + "\\TY-Transport_ratio.csv"); // 10 - dataset_block_size.add(512); - // dataset_k.add(9); - output_path_list.add(output_parent_dir + "\\EPM-Education_ratio.csv"); // 11 - dataset_block_size.add(512); + output_path_list.add(output_parent_dir + "/CS-Sensors_ratio.csv"); // 0 +// dataset_block_size.add(128); + + output_path_list.add(output_parent_dir + "/Metro-Traffic_ratio.csv");// 1 +// dataset_block_size.add(4096); + output_path_list.add(output_parent_dir + "/USGS-Earthquakes_ratio.csv");// 2 +// dataset_block_size.add(8192); + output_path_list.add(output_parent_dir + "/YZ-Electricity_ratio.csv"); // 3 + output_path_list.add(output_parent_dir + "/GW-Magnetic_ratio.csv"); //4 + output_path_list.add(output_parent_dir + "/TY-Fuel_ratio.csv");//5 +// dataset_block_size.add(8192); + output_path_list.add(output_parent_dir + "/Cyber-Vehicle_ratio.csv"); //6 +// dataset_block_size.add(2048); + output_path_list.add(output_parent_dir + "/Vehicle-Charge_ratio.csv");//7 +// dataset_block_size.add(2048); + output_path_list.add(output_parent_dir + "/Nifty-Stocks_ratio.csv");//8 +// dataset_block_size.add(128); + output_path_list.add(output_parent_dir + "/TH-Climate_ratio.csv");//9 +// dataset_block_size.add(64); + output_path_list.add(output_parent_dir + "/TY-Transport_ratio.csv");//10 +// dataset_block_size.add(64); + output_path_list.add(output_parent_dir + "/EPM-Education_ratio.csv");//11 +// dataset_block_size.add(256); // for(int file_i=3;file_i<4;file_i++){ for (int file_i = 0; file_i < input_path_list.size(); file_i++) { diff --git a/iotdb-core/tsfile/src/test/java/org/apache/iotdb/tsfile/encoding/KernelDensityEstimation.java b/iotdb-core/tsfile/src/test/java/org/apache/iotdb/tsfile/encoding/KernelDensityEstimation.java index 5fde4ef7aa1..bcd5b0621bf 100644 --- a/iotdb-core/tsfile/src/test/java/org/apache/iotdb/tsfile/encoding/KernelDensityEstimation.java +++ b/iotdb-core/tsfile/src/test/java/org/apache/iotdb/tsfile/encoding/KernelDensityEstimation.java @@ -43,103 +43,145 @@ public class KernelDensityEstimation { // int[] minIndex = findMinIndex(kernelDensity); // System.out.println("Minimum point: x=" + (Arrays.toString(minIndex))); } - } - } - int[] final_minIndex = new int[final_min_count]; - // if(final_min_count>0){ - // final_minIndex[0] = minIndex[0]; - // int pre_value = minIndex[0]; - // for(int mv = 1; mv<final_min_count;mv++){ - // if(minIndex[mv]-pre_value>16){ - // pre_value = minIndex[mv]; - // - // } - // } - System.arraycopy(minIndex, 0, final_minIndex, 0, final_min_count); - // } - return final_minIndex; - } - // public static void main(String[] args) { - // // 数据分布 - // Map<Integer, Integer> data = new HashMap<>(); - // data.put(1, 3); - // data.put(2, 10); - // data.put(3, 100); - // data.put(4, 12); - // if( data.containsKey(10)){ - // System.out.println("contain"); - // } - // if( data.containsKey(1)){ - // System.out.println("contain 1"); - // } - // // 选择带宽 - // double bandwidth = 1.0; - // - // // 计算核密度曲线的极小值点 - // findMinima(data, bandwidth); - // } - // - // static void findMinima(Map<Integer, Integer> data, double bandwidth) { - // // 计算核密度估计 - // Map<Integer, Double> kernelDensityEstimate = calculateKernelDensity(data, bandwidth); - // - // // 计算导数 - // Map<Integer, Double> derivative = calculateDerivative(kernelDensityEstimate); - // - // System.out.println(derivative); - // - // // 打印导数为零的点 - // System.out.println("Minima Points:"); - // for (Map.Entry<Integer, Double> entry : derivative.entrySet()) { - // if (entry.getValue() == 0.0) { - // System.out.println("Point " + entry.getKey()); - // } - // } - // } - // - // private static Map<Integer, Double> calculateKernelDensity(Map<Integer, Integer> data, - // double bandwidth) { - // // 计算核密度估计 - // Map<Integer, Double> kernelDensityEstimate = new HashMap<>(); - // - // for (Map.Entry<Integer, Integer> entry : data.entrySet()) { - // int point = entry.getKey(); - // double sum = 0.0; - // - // for (Map.Entry<Integer, Integer> dataEntry : data.entrySet()) { - // double x = dataEntry.getKey(); - // double kernel = gaussianKernel(x, point, bandwidth); - // sum += kernel; - // } - // - // kernelDensityEstimate.put(point, sum / (data.size() * bandwidth)); - // } - // - // return kernelDensityEstimate; - // } - // - // private static Map<Integer, Double> calculateDerivative(Map<Integer, Double> function) { - // // 计算导数 - // Map<Integer, Double> derivative = new HashMap<>(); - // - // for (Map.Entry<Integer, Double> entry : function.entrySet()) { - // int point = entry.getKey(); - // - // if (point > 1 && point < 4) { - // double derivativeValue = (function.get(point + 1) - function.get(point - 1)) / - // 2.0; - // derivative.put(point, derivativeValue); - // } else { - // // 边缘点处理 - // derivative.put(point, 0.0); - // } - // } - // - // return derivative; - // } - // - // private static double gaussianKernel(double x, double xi, double bandwidth) { - // // 高斯核函数 - // return Math.exp(-0.5 * Math.pow((x - xi) / bandwidth, 2)) / Math.sqrt(2 * Math.PI); - // } + + // 计算核密度估计 + static double[] calculateKernelDensity(Map<Integer, Integer> discreteDistribution) { + int maxKey = discreteDistribution.keySet().stream().max(Integer::compare).orElse(0); + double[] kernelDensity = new double[maxKey]; + + for (int x = 1; x <= maxKey; x++) { + for (Map.Entry<Integer, Integer> entry : discreteDistribution.entrySet()) { + int dataPoint = entry.getKey(); + int weight = entry.getValue(); + kernelDensity[x - 1] += gaussianKernel(x, dataPoint) * weight; + } + } + + return kernelDensity; + } + + // 高斯核函数 + private static double gaussianKernel(int x, int dataPoint) { + double bandwidth = 1.0; // 可调整的带宽参数 + return Math.exp(-0.5 * Math.pow((x - dataPoint) / bandwidth, 2)) / (Math.sqrt(2 * Math.PI) * bandwidth); + } + + // 寻找数组中的最小值索引 + static int[] findMinIndex(double[] array) { + int[] minIndex = new int[array.length]; + int final_min_count = 0; + int pre_value = 0; +// double preValue = array[0]; + + for (int i = 1; i < array.length-1; i++) { + if (array[i] < array[i-1] && array[i] < array[i+1]) { + if(final_min_count != 0){ + if(i>pre_value+32){ + minIndex[final_min_count] = i; + final_min_count ++; + pre_value = i; + } + }else{ + minIndex[final_min_count] = i; + final_min_count ++; + pre_value = i; + } + } + } + int[] final_minIndex = new int[final_min_count]; +// if(final_min_count>0){ +// final_minIndex[0] = minIndex[0]; +// int pre_value = minIndex[0]; +// for(int mv = 1; mv<final_min_count;mv++){ +// if(minIndex[mv]-pre_value>16){ +// pre_value = minIndex[mv]; +// +// } +// } + System.arraycopy(minIndex, 0, final_minIndex, 0, final_min_count); +// } + return final_minIndex; + } +// public static void main(String[] args) { +// // 数据分布 +// Map<Integer, Integer> data = new HashMap<>(); +// data.put(1, 3); +// data.put(2, 10); +// data.put(3, 100); +// data.put(4, 12); +// if( data.containsKey(10)){ +// System.out.println("contain"); +// } +// if( data.containsKey(1)){ +// System.out.println("contain 1"); +// } +// // 选择带宽 +// double bandwidth = 1.0; +// +// // 计算核密度曲线的极小值点 +// findMinima(data, bandwidth); +// } +// +// static void findMinima(Map<Integer, Integer> data, double bandwidth) { +// // 计算核密度估计 +// Map<Integer, Double> kernelDensityEstimate = calculateKernelDensity(data, bandwidth); +// +// // 计算导数 +// Map<Integer, Double> derivative = calculateDerivative(kernelDensityEstimate); +// +// System.out.println(derivative); +// +// // 打印导数为零的点 +// System.out.println("Minima Points:"); +// for (Map.Entry<Integer, Double> entry : derivative.entrySet()) { +// if (entry.getValue() == 0.0) { +// System.out.println("Point " + entry.getKey()); +// } +// } +// } +// +// private static Map<Integer, Double> calculateKernelDensity(Map<Integer, Integer> data, double bandwidth) { +// // 计算核密度估计 +// Map<Integer, Double> kernelDensityEstimate = new HashMap<>(); +// +// for (Map.Entry<Integer, Integer> entry : data.entrySet()) { +// int point = entry.getKey(); +// double sum = 0.0; +// +// for (Map.Entry<Integer, Integer> dataEntry : data.entrySet()) { +// double x = dataEntry.getKey(); +// double kernel = gaussianKernel(x, point, bandwidth); +// sum += kernel; +// } +// +// kernelDensityEstimate.put(point, sum / (data.size() * bandwidth)); +// } +// +// return kernelDensityEstimate; +// } +// +// private static Map<Integer, Double> calculateDerivative(Map<Integer, Double> function) { +// // 计算导数 +// Map<Integer, Double> derivative = new HashMap<>(); +// +// for (Map.Entry<Integer, Double> entry : function.entrySet()) { +// int point = entry.getKey(); +// +// if (point > 1 && point < 4) { +// double derivativeValue = (function.get(point + 1) - function.get(point - 1)) / 2.0; +// derivative.put(point, derivativeValue); +// } else { +// // 边缘点处理 +// derivative.put(point, 0.0); +// } +// } +// +// return derivative; +// } +// +// private static double gaussianKernel(double x, double xi, double bandwidth) { +// // 高斯核函数 +// return Math.exp(-0.5 * Math.pow((x - xi) / bandwidth, 2)) / Math.sqrt(2 * Math.PI); +// } } + diff --git a/iotdb-core/tsfile/src/test/java/org/apache/iotdb/tsfile/encoding/REGERCompress1ArrayTest.java b/iotdb-core/tsfile/src/test/java/org/apache/iotdb/tsfile/encoding/REGERCompress1ArrayTest.java index 4bd952f8665..1c5f025f569 100644 --- a/iotdb-core/tsfile/src/test/java/org/apache/iotdb/tsfile/encoding/REGERCompress1ArrayTest.java +++ b/iotdb-core/tsfile/src/test/java/org/apache/iotdb/tsfile/encoding/REGERCompress1ArrayTest.java @@ -2859,7 +2859,7 @@ public class REGERCompress1ArrayTest { for (String value : dataset_name) { input_path_list.add(input_parent_dir + value); dataset_k.add(1); - dataset_block_size.add(128); + dataset_block_size.add(1024); } output_path_list.add(output_parent_dir + "/CS-Sensors_ratio.csv"); // 0
