NightOwl888 commented on code in PR #1154:
URL: https://github.com/apache/lucenenet/pull/1154#discussion_r2038138668
##########
src/Lucene.Net.Analysis.SmartCn/Hhmm/WordDictionary.cs:
##########
@@ -340,77 +340,64 @@ private void SaveToObj(FileInfo serialObj)
/// <summary>
/// Load the datafile into this <see cref="WordDictionary"/>
/// </summary>
- /// <param name="dctFilePath">path to word dictionary
(coredict.dct)</param>
- /// <returns>number of words read</returns>
+ /// <param name="dctFilePath">Path to word dictionary
(coredict.dct)</param>
+ /// <returns>Number of words read</returns>
/// <exception cref="IOException">If there is a low-level I/O
error.</exception>
private int LoadMainDataFromFile(string dctFilePath)
{
- int i, cnt, length, total = 0;
- // The file only counted 6763 Chinese characters plus 5 reserved
slots 3756~3760.
+ int total = 0;
+
+ // The file only counted 6763 Chinese characters plus 5 reserved
slots (3756~3760).
// The 3756th is used (as a header) to store information.
- int[]
- buffer = new int[3];
- byte[] intBuffer = new byte[4];
- string tmpword;
+
+ // LUCENENET: Removed buffer and intBuffer arrays since
BinaryReader handles reading values directly in a more type-safe and readable
way.
+ // LUCENENET: Use BinaryReader to simplify endian conversion and
stream reading.
+
using (var dctFile = new FileStream(dctFilePath, FileMode.Open,
FileAccess.Read))
+ using (var reader = new BinaryReader(dctFile))
{
-
// GB2312 characters 0 - 6768
- for (i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR +
CHAR_NUM_IN_FILE; i++)
+ for (int i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR +
CHAR_NUM_IN_FILE; i++)
{
- // if (i == 5231)
Review Comment:
Please leave upstream comments in place.
##########
src/Lucene.Net.Analysis.SmartCn/Hhmm/BigramDictionary.cs:
##########
@@ -254,80 +254,83 @@ private void Load(string dictRoot)
/// <summary>
/// Load the datafile into this <see cref="BigramDictionary"/>
/// </summary>
- /// <param name="dctFilePath">dctFilePath path to the Bigramdictionary
(bigramdict.dct)</param>
+ /// <param name="dctFilePath">Path to the Bigramdictionary
(bigramdict.dct)</param>
/// <exception cref="IOException">If there is a low-level I/O
error</exception>
public virtual void LoadFromFile(string dctFilePath)
{
- int i, cnt, length, total = 0;
// The file only counted 6763 Chinese characters plus 5 reserved
slots 3756~3760.
// The 3756th is used (as a header) to store information.
- int[]
Review Comment:
Please add this temporary buffer back in, but allocate it on the stack.
```c#
Span<int> buffer = stackalloc int[3];
```
##########
src/Lucene.Net.Analysis.SmartCn/Hhmm/BigramDictionary.cs:
##########
@@ -254,80 +254,83 @@ private void Load(string dictRoot)
/// <summary>
/// Load the datafile into this <see cref="BigramDictionary"/>
/// </summary>
- /// <param name="dctFilePath">dctFilePath path to the Bigramdictionary
(bigramdict.dct)</param>
+ /// <param name="dctFilePath">Path to the Bigramdictionary
(bigramdict.dct)</param>
/// <exception cref="IOException">If there is a low-level I/O
error</exception>
public virtual void LoadFromFile(string dctFilePath)
{
- int i, cnt, length, total = 0;
// The file only counted 6763 Chinese characters plus 5 reserved
slots 3756~3760.
// The 3756th is used (as a header) to store information.
- int[]
- buffer = new int[3];
- byte[] intBuffer = new byte[4];
- string tmpword;
+
+ // LUCENENET: Removed buffer and intBuffer arrays since
BinaryReader handles reading values directly in a more type-safe and readable
way.
+ // LUCENENET specific - refactored constants for clarity
+ const int HEADER_POSITION = 3755;
+ const int MAX_VALID_LENGTH = 1000;
+
//using (RandomAccessFile dctFile = new
RandomAccessFile(dctFilePath, "r"))
using var dctFile = new FileStream(dctFilePath, FileMode.Open,
FileAccess.Read);
+ using var reader = new BinaryReader(dctFile);
// GB2312 characters 0 - 6768
- for (i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR +
CHAR_NUM_IN_FILE; i++)
+ for (int i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR +
CHAR_NUM_IN_FILE; i++)
{
- string currentStr = GetCCByGB2312Id(i);
- // if (i == 5231)
- // System.out.println(i);
Review Comment:
Please leave upstream comments in place.
##########
src/Lucene.Net.Tests.Analysis.SmartCn/DictionaryTests.cs:
##########
@@ -0,0 +1,72 @@
+using Lucene.Net.Util;
+using Lucene.Net.Analysis.Cn.Smart.Hhmm;
+using Lucene.Net.Attributes;
+using NUnit.Framework;
+using System;
+using System.IO;
+using System.Reflection;
+
Review Comment:
Please use our custom `Assert` class, not the one from NUnit.
```c#
using Assert = Lucene.Net.TestFramework.Assert;
```
##########
src/Lucene.Net.Analysis.SmartCn/Hhmm/WordDictionary.cs:
##########
@@ -340,77 +340,64 @@ private void SaveToObj(FileInfo serialObj)
/// <summary>
/// Load the datafile into this <see cref="WordDictionary"/>
/// </summary>
- /// <param name="dctFilePath">path to word dictionary
(coredict.dct)</param>
- /// <returns>number of words read</returns>
+ /// <param name="dctFilePath">Path to word dictionary
(coredict.dct)</param>
+ /// <returns>Number of words read</returns>
/// <exception cref="IOException">If there is a low-level I/O
error.</exception>
private int LoadMainDataFromFile(string dctFilePath)
{
- int i, cnt, length, total = 0;
- // The file only counted 6763 Chinese characters plus 5 reserved
slots 3756~3760.
+ int total = 0;
+
+ // The file only counted 6763 Chinese characters plus 5 reserved
slots (3756~3760).
// The 3756th is used (as a header) to store information.
- int[]
- buffer = new int[3];
- byte[] intBuffer = new byte[4];
- string tmpword;
+
+ // LUCENENET: Removed buffer and intBuffer arrays since
BinaryReader handles reading values directly in a more type-safe and readable
way.
+ // LUCENENET: Use BinaryReader to simplify endian conversion and
stream reading.
+
using (var dctFile = new FileStream(dctFilePath, FileMode.Open,
FileAccess.Read))
+ using (var reader = new BinaryReader(dctFile))
{
-
// GB2312 characters 0 - 6768
- for (i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR +
CHAR_NUM_IN_FILE; i++)
+ for (int i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR +
CHAR_NUM_IN_FILE; i++)
{
- // if (i == 5231)
- // System.out.println(i);
+ int cnt = reader.ReadInt32(); // LUCENENET: Use
BinaryReader methods instead of ByteBuffer
- dctFile.Read(intBuffer, 0, intBuffer.Length);
- // the dictionary was developed for C, and byte order must
be converted to work with Java
- cnt =
ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian).GetInt32();
if (cnt <= 0)
{
wordItem_charArrayTable[i] = null;
wordItem_frequencyTable[i] = null;
continue;
}
+
wordItem_charArrayTable[i] = new char[cnt][];
wordItem_frequencyTable[i] = new int[cnt];
total += cnt;
- int j = 0;
- while (j < cnt)
+
+ for (int j = 0; j < cnt; j++)
{
- // wordItemTable[i][j] = new WordItem();
- dctFile.Read(intBuffer, 0, intBuffer.Length);
- buffer[0] =
ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian)
- .GetInt32();// frequency
- dctFile.Read(intBuffer, 0, intBuffer.Length);
- buffer[1] =
ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian)
- .GetInt32();// length
- dctFile.Read(intBuffer, 0, intBuffer.Length);
- buffer[2] =
ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian)
- .GetInt32();// handle
-
- // wordItemTable[i][j].frequency = buffer[0];
- wordItem_frequencyTable[i][j] = buffer[0];
-
- length = buffer[1];
+ // LUCENENET: Use BinaryReader methods instead of
ByteBuffer
+ int frequency = reader.ReadInt32();
+ int length = reader.ReadInt32();
+ reader.ReadInt32(); // Skip handle (unused)
Review Comment:
Please set this to `buffer[2]` as it was upstream.
##########
src/Lucene.Net.Analysis.SmartCn/Hhmm/WordDictionary.cs:
##########
@@ -340,77 +340,64 @@ private void SaveToObj(FileInfo serialObj)
/// <summary>
/// Load the datafile into this <see cref="WordDictionary"/>
/// </summary>
- /// <param name="dctFilePath">path to word dictionary
(coredict.dct)</param>
- /// <returns>number of words read</returns>
+ /// <param name="dctFilePath">Path to word dictionary
(coredict.dct)</param>
+ /// <returns>Number of words read</returns>
/// <exception cref="IOException">If there is a low-level I/O
error.</exception>
private int LoadMainDataFromFile(string dctFilePath)
{
- int i, cnt, length, total = 0;
- // The file only counted 6763 Chinese characters plus 5 reserved
slots 3756~3760.
+ int total = 0;
+
+ // The file only counted 6763 Chinese characters plus 5 reserved
slots (3756~3760).
// The 3756th is used (as a header) to store information.
- int[]
- buffer = new int[3];
- byte[] intBuffer = new byte[4];
- string tmpword;
+
+ // LUCENENET: Removed buffer and intBuffer arrays since
BinaryReader handles reading values directly in a more type-safe and readable
way.
+ // LUCENENET: Use BinaryReader to simplify endian conversion and
stream reading.
+
using (var dctFile = new FileStream(dctFilePath, FileMode.Open,
FileAccess.Read))
+ using (var reader = new BinaryReader(dctFile))
{
-
// GB2312 characters 0 - 6768
- for (i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR +
CHAR_NUM_IN_FILE; i++)
+ for (int i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR +
CHAR_NUM_IN_FILE; i++)
{
- // if (i == 5231)
- // System.out.println(i);
+ int cnt = reader.ReadInt32(); // LUCENENET: Use
BinaryReader methods instead of ByteBuffer
- dctFile.Read(intBuffer, 0, intBuffer.Length);
- // the dictionary was developed for C, and byte order must
be converted to work with Java
- cnt =
ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian).GetInt32();
if (cnt <= 0)
{
wordItem_charArrayTable[i] = null;
wordItem_frequencyTable[i] = null;
continue;
}
+
wordItem_charArrayTable[i] = new char[cnt][];
wordItem_frequencyTable[i] = new int[cnt];
total += cnt;
- int j = 0;
- while (j < cnt)
+
+ for (int j = 0; j < cnt; j++)
{
- // wordItemTable[i][j] = new WordItem();
- dctFile.Read(intBuffer, 0, intBuffer.Length);
- buffer[0] =
ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian)
- .GetInt32();// frequency
- dctFile.Read(intBuffer, 0, intBuffer.Length);
- buffer[1] =
ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian)
- .GetInt32();// length
- dctFile.Read(intBuffer, 0, intBuffer.Length);
- buffer[2] =
ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian)
- .GetInt32();// handle
-
- // wordItemTable[i][j].frequency = buffer[0];
- wordItem_frequencyTable[i][j] = buffer[0];
-
- length = buffer[1];
+ // LUCENENET: Use BinaryReader methods instead of
ByteBuffer
+ int frequency = reader.ReadInt32();
+ int length = reader.ReadInt32();
+ reader.ReadInt32(); // Skip handle (unused)
+
+ wordItem_frequencyTable[i][j] = frequency;
+
if (length > 0)
{
- byte[] lchBuffer = new byte[length];
- dctFile.Read(lchBuffer, 0, lchBuffer.Length);
- tmpword = gb2312Encoding.GetString(lchBuffer); //
LUCENENET specific: use cached encoding instance from base class
+ byte[] lchBuffer = reader.ReadBytes(length);
+ string tmpword =
gb2312Encoding.GetString(lchBuffer); // LUCENENET: Use cached encoding instance
from base class
wordItem_charArrayTable[i][j] =
tmpword.ToCharArray();
}
else
{
- // wordItemTable[i][j].charArray = null;
Review Comment:
Please leave upstream comments in place.
##########
src/Lucene.Net.Analysis.SmartCn/Hhmm/WordDictionary.cs:
##########
@@ -340,77 +340,64 @@ private void SaveToObj(FileInfo serialObj)
/// <summary>
/// Load the datafile into this <see cref="WordDictionary"/>
/// </summary>
- /// <param name="dctFilePath">path to word dictionary
(coredict.dct)</param>
- /// <returns>number of words read</returns>
+ /// <param name="dctFilePath">Path to word dictionary
(coredict.dct)</param>
+ /// <returns>Number of words read</returns>
/// <exception cref="IOException">If there is a low-level I/O
error.</exception>
private int LoadMainDataFromFile(string dctFilePath)
{
- int i, cnt, length, total = 0;
Review Comment:
Please define the variables the same way as the upstream code. Someday we
may need to merge upstream changes, so it helps a lot if we don't have to
re-evaluate the business logic.
```c#
int i, cnt, length, total = 0;
```
##########
src/Lucene.Net.Analysis.SmartCn/Hhmm/WordDictionary.cs:
##########
@@ -340,77 +340,64 @@ private void SaveToObj(FileInfo serialObj)
/// <summary>
/// Load the datafile into this <see cref="WordDictionary"/>
/// </summary>
- /// <param name="dctFilePath">path to word dictionary
(coredict.dct)</param>
- /// <returns>number of words read</returns>
+ /// <param name="dctFilePath">Path to word dictionary
(coredict.dct)</param>
+ /// <returns>Number of words read</returns>
/// <exception cref="IOException">If there is a low-level I/O
error.</exception>
private int LoadMainDataFromFile(string dctFilePath)
{
- int i, cnt, length, total = 0;
- // The file only counted 6763 Chinese characters plus 5 reserved
slots 3756~3760.
+ int total = 0;
+
+ // The file only counted 6763 Chinese characters plus 5 reserved
slots (3756~3760).
// The 3756th is used (as a header) to store information.
- int[]
- buffer = new int[3];
- byte[] intBuffer = new byte[4];
- string tmpword;
+
+ // LUCENENET: Removed buffer and intBuffer arrays since
BinaryReader handles reading values directly in a more type-safe and readable
way.
+ // LUCENENET: Use BinaryReader to simplify endian conversion and
stream reading.
+
using (var dctFile = new FileStream(dctFilePath, FileMode.Open,
FileAccess.Read))
+ using (var reader = new BinaryReader(dctFile))
{
-
// GB2312 characters 0 - 6768
- for (i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR +
CHAR_NUM_IN_FILE; i++)
+ for (int i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR +
CHAR_NUM_IN_FILE; i++)
{
- // if (i == 5231)
- // System.out.println(i);
+ int cnt = reader.ReadInt32(); // LUCENENET: Use
BinaryReader methods instead of ByteBuffer
- dctFile.Read(intBuffer, 0, intBuffer.Length);
- // the dictionary was developed for C, and byte order must
be converted to work with Java
- cnt =
ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian).GetInt32();
if (cnt <= 0)
{
wordItem_charArrayTable[i] = null;
wordItem_frequencyTable[i] = null;
continue;
}
+
wordItem_charArrayTable[i] = new char[cnt][];
wordItem_frequencyTable[i] = new int[cnt];
total += cnt;
- int j = 0;
- while (j < cnt)
+
+ for (int j = 0; j < cnt; j++)
{
- // wordItemTable[i][j] = new WordItem();
- dctFile.Read(intBuffer, 0, intBuffer.Length);
- buffer[0] =
ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian)
- .GetInt32();// frequency
- dctFile.Read(intBuffer, 0, intBuffer.Length);
- buffer[1] =
ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian)
- .GetInt32();// length
- dctFile.Read(intBuffer, 0, intBuffer.Length);
- buffer[2] =
ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian)
- .GetInt32();// handle
-
- // wordItemTable[i][j].frequency = buffer[0];
Review Comment:
Please leave upstream comments in place.
##########
src/Lucene.Net.Analysis.SmartCn/Hhmm/WordDictionary.cs:
##########
@@ -340,77 +340,64 @@ private void SaveToObj(FileInfo serialObj)
/// <summary>
/// Load the datafile into this <see cref="WordDictionary"/>
/// </summary>
- /// <param name="dctFilePath">path to word dictionary
(coredict.dct)</param>
- /// <returns>number of words read</returns>
+ /// <param name="dctFilePath">Path to word dictionary
(coredict.dct)</param>
+ /// <returns>Number of words read</returns>
/// <exception cref="IOException">If there is a low-level I/O
error.</exception>
private int LoadMainDataFromFile(string dctFilePath)
{
- int i, cnt, length, total = 0;
- // The file only counted 6763 Chinese characters plus 5 reserved
slots 3756~3760.
+ int total = 0;
+
+ // The file only counted 6763 Chinese characters plus 5 reserved
slots (3756~3760).
// The 3756th is used (as a header) to store information.
- int[]
- buffer = new int[3];
- byte[] intBuffer = new byte[4];
- string tmpword;
+
+ // LUCENENET: Removed buffer and intBuffer arrays since
BinaryReader handles reading values directly in a more type-safe and readable
way.
+ // LUCENENET: Use BinaryReader to simplify endian conversion and
stream reading.
+
using (var dctFile = new FileStream(dctFilePath, FileMode.Open,
FileAccess.Read))
+ using (var reader = new BinaryReader(dctFile))
{
-
// GB2312 characters 0 - 6768
- for (i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR +
CHAR_NUM_IN_FILE; i++)
+ for (int i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR +
CHAR_NUM_IN_FILE; i++)
{
- // if (i == 5231)
- // System.out.println(i);
+ int cnt = reader.ReadInt32(); // LUCENENET: Use
BinaryReader methods instead of ByteBuffer
- dctFile.Read(intBuffer, 0, intBuffer.Length);
- // the dictionary was developed for C, and byte order must
be converted to work with Java
- cnt =
ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian).GetInt32();
if (cnt <= 0)
{
wordItem_charArrayTable[i] = null;
wordItem_frequencyTable[i] = null;
continue;
}
+
wordItem_charArrayTable[i] = new char[cnt][];
wordItem_frequencyTable[i] = new int[cnt];
total += cnt;
- int j = 0;
Review Comment:
Please do not change the variable declarations from the upstream code.
##########
src/Lucene.Net.Analysis.SmartCn/Hhmm/WordDictionary.cs:
##########
@@ -340,77 +340,64 @@ private void SaveToObj(FileInfo serialObj)
/// <summary>
/// Load the datafile into this <see cref="WordDictionary"/>
/// </summary>
- /// <param name="dctFilePath">path to word dictionary
(coredict.dct)</param>
- /// <returns>number of words read</returns>
+ /// <param name="dctFilePath">Path to word dictionary
(coredict.dct)</param>
+ /// <returns>Number of words read</returns>
/// <exception cref="IOException">If there is a low-level I/O
error.</exception>
private int LoadMainDataFromFile(string dctFilePath)
{
- int i, cnt, length, total = 0;
- // The file only counted 6763 Chinese characters plus 5 reserved
slots 3756~3760.
+ int total = 0;
+
+ // The file only counted 6763 Chinese characters plus 5 reserved
slots (3756~3760).
// The 3756th is used (as a header) to store information.
- int[]
- buffer = new int[3];
- byte[] intBuffer = new byte[4];
- string tmpword;
+
+ // LUCENENET: Removed buffer and intBuffer arrays since
BinaryReader handles reading values directly in a more type-safe and readable
way.
+ // LUCENENET: Use BinaryReader to simplify endian conversion and
stream reading.
+
using (var dctFile = new FileStream(dctFilePath, FileMode.Open,
FileAccess.Read))
+ using (var reader = new BinaryReader(dctFile))
{
-
// GB2312 characters 0 - 6768
- for (i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR +
CHAR_NUM_IN_FILE; i++)
+ for (int i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR +
CHAR_NUM_IN_FILE; i++)
{
- // if (i == 5231)
- // System.out.println(i);
Review Comment:
Please leave upstream comments in place.
##########
src/Lucene.Net.Analysis.SmartCn/Hhmm/WordDictionary.cs:
##########
@@ -340,77 +340,64 @@ private void SaveToObj(FileInfo serialObj)
/// <summary>
/// Load the datafile into this <see cref="WordDictionary"/>
/// </summary>
- /// <param name="dctFilePath">path to word dictionary
(coredict.dct)</param>
- /// <returns>number of words read</returns>
+ /// <param name="dctFilePath">Path to word dictionary
(coredict.dct)</param>
+ /// <returns>Number of words read</returns>
/// <exception cref="IOException">If there is a low-level I/O
error.</exception>
private int LoadMainDataFromFile(string dctFilePath)
{
- int i, cnt, length, total = 0;
- // The file only counted 6763 Chinese characters plus 5 reserved
slots 3756~3760.
+ int total = 0;
+
+ // The file only counted 6763 Chinese characters plus 5 reserved
slots (3756~3760).
// The 3756th is used (as a header) to store information.
- int[]
Review Comment:
Please add this temporary buffer back in, but allocate it on the stack.
```c#
Span<int> buffer = stackalloc int[3];
```
##########
src/Lucene.Net.Analysis.SmartCn/Hhmm/WordDictionary.cs:
##########
@@ -340,77 +340,64 @@ private void SaveToObj(FileInfo serialObj)
/// <summary>
/// Load the datafile into this <see cref="WordDictionary"/>
/// </summary>
- /// <param name="dctFilePath">path to word dictionary
(coredict.dct)</param>
- /// <returns>number of words read</returns>
+ /// <param name="dctFilePath">Path to word dictionary
(coredict.dct)</param>
+ /// <returns>Number of words read</returns>
/// <exception cref="IOException">If there is a low-level I/O
error.</exception>
private int LoadMainDataFromFile(string dctFilePath)
{
- int i, cnt, length, total = 0;
- // The file only counted 6763 Chinese characters plus 5 reserved
slots 3756~3760.
+ int total = 0;
+
+ // The file only counted 6763 Chinese characters plus 5 reserved
slots (3756~3760).
// The 3756th is used (as a header) to store information.
- int[]
- buffer = new int[3];
- byte[] intBuffer = new byte[4];
- string tmpword;
+
+ // LUCENENET: Removed buffer and intBuffer arrays since
BinaryReader handles reading values directly in a more type-safe and readable
way.
+ // LUCENENET: Use BinaryReader to simplify endian conversion and
stream reading.
+
using (var dctFile = new FileStream(dctFilePath, FileMode.Open,
FileAccess.Read))
+ using (var reader = new BinaryReader(dctFile))
{
-
// GB2312 characters 0 - 6768
- for (i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR +
CHAR_NUM_IN_FILE; i++)
+ for (int i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR +
CHAR_NUM_IN_FILE; i++)
{
- // if (i == 5231)
- // System.out.println(i);
+ int cnt = reader.ReadInt32(); // LUCENENET: Use
BinaryReader methods instead of ByteBuffer
- dctFile.Read(intBuffer, 0, intBuffer.Length);
- // the dictionary was developed for C, and byte order must
be converted to work with Java
- cnt =
ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian).GetInt32();
if (cnt <= 0)
{
wordItem_charArrayTable[i] = null;
wordItem_frequencyTable[i] = null;
continue;
}
+
wordItem_charArrayTable[i] = new char[cnt][];
wordItem_frequencyTable[i] = new int[cnt];
total += cnt;
- int j = 0;
- while (j < cnt)
+
+ for (int j = 0; j < cnt; j++)
{
- // wordItemTable[i][j] = new WordItem();
- dctFile.Read(intBuffer, 0, intBuffer.Length);
- buffer[0] =
ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian)
- .GetInt32();// frequency
- dctFile.Read(intBuffer, 0, intBuffer.Length);
- buffer[1] =
ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian)
- .GetInt32();// length
- dctFile.Read(intBuffer, 0, intBuffer.Length);
- buffer[2] =
ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian)
- .GetInt32();// handle
-
- // wordItemTable[i][j].frequency = buffer[0];
- wordItem_frequencyTable[i][j] = buffer[0];
-
- length = buffer[1];
+ // LUCENENET: Use BinaryReader methods instead of
ByteBuffer
+ int frequency = reader.ReadInt32();
+ int length = reader.ReadInt32();
Review Comment:
Please leave upstream comments in place.
##########
src/Lucene.Net.Analysis.SmartCn/Hhmm/WordDictionary.cs:
##########
@@ -340,77 +340,64 @@ private void SaveToObj(FileInfo serialObj)
/// <summary>
/// Load the datafile into this <see cref="WordDictionary"/>
/// </summary>
- /// <param name="dctFilePath">path to word dictionary
(coredict.dct)</param>
- /// <returns>number of words read</returns>
+ /// <param name="dctFilePath">Path to word dictionary
(coredict.dct)</param>
+ /// <returns>Number of words read</returns>
/// <exception cref="IOException">If there is a low-level I/O
error.</exception>
private int LoadMainDataFromFile(string dctFilePath)
{
- int i, cnt, length, total = 0;
- // The file only counted 6763 Chinese characters plus 5 reserved
slots 3756~3760.
+ int total = 0;
+
+ // The file only counted 6763 Chinese characters plus 5 reserved
slots (3756~3760).
// The 3756th is used (as a header) to store information.
- int[]
- buffer = new int[3];
- byte[] intBuffer = new byte[4];
- string tmpword;
+
+ // LUCENENET: Removed buffer and intBuffer arrays since
BinaryReader handles reading values directly in a more type-safe and readable
way.
+ // LUCENENET: Use BinaryReader to simplify endian conversion and
stream reading.
+
using (var dctFile = new FileStream(dctFilePath, FileMode.Open,
FileAccess.Read))
+ using (var reader = new BinaryReader(dctFile))
{
-
// GB2312 characters 0 - 6768
- for (i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR +
CHAR_NUM_IN_FILE; i++)
+ for (int i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR +
CHAR_NUM_IN_FILE; i++)
{
- // if (i == 5231)
- // System.out.println(i);
+ int cnt = reader.ReadInt32(); // LUCENENET: Use
BinaryReader methods instead of ByteBuffer
- dctFile.Read(intBuffer, 0, intBuffer.Length);
- // the dictionary was developed for C, and byte order must
be converted to work with Java
- cnt =
ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian).GetInt32();
if (cnt <= 0)
{
wordItem_charArrayTable[i] = null;
wordItem_frequencyTable[i] = null;
continue;
}
+
wordItem_charArrayTable[i] = new char[cnt][];
wordItem_frequencyTable[i] = new int[cnt];
total += cnt;
- int j = 0;
- while (j < cnt)
Review Comment:
Please do not change the style of loop from the upstream code.
##########
src/Lucene.Net.Analysis.SmartCn/Hhmm/WordDictionary.cs:
##########
@@ -340,77 +340,64 @@ private void SaveToObj(FileInfo serialObj)
/// <summary>
/// Load the datafile into this <see cref="WordDictionary"/>
/// </summary>
- /// <param name="dctFilePath">path to word dictionary
(coredict.dct)</param>
- /// <returns>number of words read</returns>
+ /// <param name="dctFilePath">Path to word dictionary
(coredict.dct)</param>
+ /// <returns>Number of words read</returns>
/// <exception cref="IOException">If there is a low-level I/O
error.</exception>
private int LoadMainDataFromFile(string dctFilePath)
{
- int i, cnt, length, total = 0;
- // The file only counted 6763 Chinese characters plus 5 reserved
slots 3756~3760.
+ int total = 0;
+
+ // The file only counted 6763 Chinese characters plus 5 reserved
slots (3756~3760).
// The 3756th is used (as a header) to store information.
- int[]
- buffer = new int[3];
- byte[] intBuffer = new byte[4];
- string tmpword;
+
+ // LUCENENET: Removed buffer and intBuffer arrays since
BinaryReader handles reading values directly in a more type-safe and readable
way.
+ // LUCENENET: Use BinaryReader to simplify endian conversion and
stream reading.
+
using (var dctFile = new FileStream(dctFilePath, FileMode.Open,
FileAccess.Read))
+ using (var reader = new BinaryReader(dctFile))
{
-
// GB2312 characters 0 - 6768
- for (i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR +
CHAR_NUM_IN_FILE; i++)
+ for (int i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR +
CHAR_NUM_IN_FILE; i++)
{
- // if (i == 5231)
- // System.out.println(i);
+ int cnt = reader.ReadInt32(); // LUCENENET: Use
BinaryReader methods instead of ByteBuffer
- dctFile.Read(intBuffer, 0, intBuffer.Length);
- // the dictionary was developed for C, and byte order must
be converted to work with Java
- cnt =
ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian).GetInt32();
if (cnt <= 0)
{
wordItem_charArrayTable[i] = null;
wordItem_frequencyTable[i] = null;
continue;
}
+
wordItem_charArrayTable[i] = new char[cnt][];
wordItem_frequencyTable[i] = new int[cnt];
total += cnt;
- int j = 0;
- while (j < cnt)
+
+ for (int j = 0; j < cnt; j++)
{
- // wordItemTable[i][j] = new WordItem();
- dctFile.Read(intBuffer, 0, intBuffer.Length);
- buffer[0] =
ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian)
- .GetInt32();// frequency
- dctFile.Read(intBuffer, 0, intBuffer.Length);
- buffer[1] =
ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian)
- .GetInt32();// length
- dctFile.Read(intBuffer, 0, intBuffer.Length);
- buffer[2] =
ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian)
- .GetInt32();// handle
-
- // wordItemTable[i][j].frequency = buffer[0];
- wordItem_frequencyTable[i][j] = buffer[0];
-
- length = buffer[1];
+ // LUCENENET: Use BinaryReader methods instead of
ByteBuffer
+ int frequency = reader.ReadInt32();
Review Comment:
Please leave upstream comments in place.
##########
src/Lucene.Net.Analysis.SmartCn/Hhmm/WordDictionary.cs:
##########
@@ -340,77 +340,64 @@ private void SaveToObj(FileInfo serialObj)
/// <summary>
/// Load the datafile into this <see cref="WordDictionary"/>
/// </summary>
- /// <param name="dctFilePath">path to word dictionary
(coredict.dct)</param>
- /// <returns>number of words read</returns>
+ /// <param name="dctFilePath">Path to word dictionary
(coredict.dct)</param>
+ /// <returns>Number of words read</returns>
/// <exception cref="IOException">If there is a low-level I/O
error.</exception>
private int LoadMainDataFromFile(string dctFilePath)
{
- int i, cnt, length, total = 0;
- // The file only counted 6763 Chinese characters plus 5 reserved
slots 3756~3760.
+ int total = 0;
+
+ // The file only counted 6763 Chinese characters plus 5 reserved
slots (3756~3760).
// The 3756th is used (as a header) to store information.
- int[]
- buffer = new int[3];
- byte[] intBuffer = new byte[4];
- string tmpword;
+
+ // LUCENENET: Removed buffer and intBuffer arrays since
BinaryReader handles reading values directly in a more type-safe and readable
way.
+ // LUCENENET: Use BinaryReader to simplify endian conversion and
stream reading.
+
using (var dctFile = new FileStream(dctFilePath, FileMode.Open,
FileAccess.Read))
+ using (var reader = new BinaryReader(dctFile))
{
-
// GB2312 characters 0 - 6768
- for (i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR +
CHAR_NUM_IN_FILE; i++)
+ for (int i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR +
CHAR_NUM_IN_FILE; i++)
{
- // if (i == 5231)
- // System.out.println(i);
+ int cnt = reader.ReadInt32(); // LUCENENET: Use
BinaryReader methods instead of ByteBuffer
- dctFile.Read(intBuffer, 0, intBuffer.Length);
- // the dictionary was developed for C, and byte order must
be converted to work with Java
- cnt =
ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian).GetInt32();
if (cnt <= 0)
{
wordItem_charArrayTable[i] = null;
wordItem_frequencyTable[i] = null;
continue;
}
+
wordItem_charArrayTable[i] = new char[cnt][];
wordItem_frequencyTable[i] = new int[cnt];
total += cnt;
- int j = 0;
- while (j < cnt)
+
+ for (int j = 0; j < cnt; j++)
{
- // wordItemTable[i][j] = new WordItem();
- dctFile.Read(intBuffer, 0, intBuffer.Length);
- buffer[0] =
ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian)
- .GetInt32();// frequency
- dctFile.Read(intBuffer, 0, intBuffer.Length);
- buffer[1] =
ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian)
- .GetInt32();// length
- dctFile.Read(intBuffer, 0, intBuffer.Length);
- buffer[2] =
ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian)
- .GetInt32();// handle
-
- // wordItemTable[i][j].frequency = buffer[0];
- wordItem_frequencyTable[i][j] = buffer[0];
-
- length = buffer[1];
+ // LUCENENET: Use BinaryReader methods instead of
ByteBuffer
+ int frequency = reader.ReadInt32();
+ int length = reader.ReadInt32();
+ reader.ReadInt32(); // Skip handle (unused)
+
+ wordItem_frequencyTable[i][j] = frequency;
+
if (length > 0)
{
- byte[] lchBuffer = new byte[length];
- dctFile.Read(lchBuffer, 0, lchBuffer.Length);
- tmpword = gb2312Encoding.GetString(lchBuffer); //
LUCENENET specific: use cached encoding instance from base class
+ byte[] lchBuffer = reader.ReadBytes(length);
+ string tmpword =
gb2312Encoding.GetString(lchBuffer); // LUCENENET: Use cached encoding instance
from base class
wordItem_charArrayTable[i][j] =
tmpword.ToCharArray();
}
else
{
- // wordItemTable[i][j].charArray = null;
wordItem_charArrayTable[i][j] = null;
}
- // System.out.println(indexTable[i].wordItems[j]);
Review Comment:
Please leave upstream comments in place.
##########
src/Lucene.Net.Analysis.SmartCn/Hhmm/WordDictionary.cs:
##########
@@ -340,77 +340,64 @@ private void SaveToObj(FileInfo serialObj)
/// <summary>
/// Load the datafile into this <see cref="WordDictionary"/>
/// </summary>
- /// <param name="dctFilePath">path to word dictionary
(coredict.dct)</param>
- /// <returns>number of words read</returns>
+ /// <param name="dctFilePath">Path to word dictionary
(coredict.dct)</param>
+ /// <returns>Number of words read</returns>
/// <exception cref="IOException">If there is a low-level I/O
error.</exception>
private int LoadMainDataFromFile(string dctFilePath)
{
- int i, cnt, length, total = 0;
- // The file only counted 6763 Chinese characters plus 5 reserved
slots 3756~3760.
+ int total = 0;
+
+ // The file only counted 6763 Chinese characters plus 5 reserved
slots (3756~3760).
// The 3756th is used (as a header) to store information.
- int[]
- buffer = new int[3];
- byte[] intBuffer = new byte[4];
- string tmpword;
+
+ // LUCENENET: Removed buffer and intBuffer arrays since
BinaryReader handles reading values directly in a more type-safe and readable
way.
+ // LUCENENET: Use BinaryReader to simplify endian conversion and
stream reading.
+
using (var dctFile = new FileStream(dctFilePath, FileMode.Open,
FileAccess.Read))
+ using (var reader = new BinaryReader(dctFile))
{
-
// GB2312 characters 0 - 6768
- for (i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR +
CHAR_NUM_IN_FILE; i++)
+ for (int i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR +
CHAR_NUM_IN_FILE; i++)
{
- // if (i == 5231)
- // System.out.println(i);
+ int cnt = reader.ReadInt32(); // LUCENENET: Use
BinaryReader methods instead of ByteBuffer
- dctFile.Read(intBuffer, 0, intBuffer.Length);
- // the dictionary was developed for C, and byte order must
be converted to work with Java
- cnt =
ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian).GetInt32();
if (cnt <= 0)
{
wordItem_charArrayTable[i] = null;
wordItem_frequencyTable[i] = null;
continue;
}
+
wordItem_charArrayTable[i] = new char[cnt][];
wordItem_frequencyTable[i] = new int[cnt];
total += cnt;
- int j = 0;
- while (j < cnt)
+
+ for (int j = 0; j < cnt; j++)
{
- // wordItemTable[i][j] = new WordItem();
Review Comment:
Please leave upstream comments in place.
##########
src/Lucene.Net.Analysis.SmartCn/Hhmm/BigramDictionary.cs:
##########
@@ -254,80 +254,83 @@ private void Load(string dictRoot)
/// <summary>
/// Load the datafile into this <see cref="BigramDictionary"/>
/// </summary>
- /// <param name="dctFilePath">dctFilePath path to the Bigramdictionary
(bigramdict.dct)</param>
+ /// <param name="dctFilePath">Path to the Bigramdictionary
(bigramdict.dct)</param>
/// <exception cref="IOException">If there is a low-level I/O
error</exception>
public virtual void LoadFromFile(string dctFilePath)
{
- int i, cnt, length, total = 0;
// The file only counted 6763 Chinese characters plus 5 reserved
slots 3756~3760.
// The 3756th is used (as a header) to store information.
- int[]
- buffer = new int[3];
- byte[] intBuffer = new byte[4];
- string tmpword;
Review Comment:
Please declare `tmpword` here as it was done upstream.
##########
src/Lucene.Net.Analysis.SmartCn/Hhmm/BigramDictionary.cs:
##########
@@ -254,80 +254,83 @@ private void Load(string dictRoot)
/// <summary>
/// Load the datafile into this <see cref="BigramDictionary"/>
/// </summary>
- /// <param name="dctFilePath">dctFilePath path to the Bigramdictionary
(bigramdict.dct)</param>
+ /// <param name="dctFilePath">Path to the Bigramdictionary
(bigramdict.dct)</param>
/// <exception cref="IOException">If there is a low-level I/O
error</exception>
public virtual void LoadFromFile(string dctFilePath)
{
- int i, cnt, length, total = 0;
// The file only counted 6763 Chinese characters plus 5 reserved
slots 3756~3760.
// The 3756th is used (as a header) to store information.
- int[]
- buffer = new int[3];
- byte[] intBuffer = new byte[4];
- string tmpword;
+
+ // LUCENENET: Removed buffer and intBuffer arrays since
BinaryReader handles reading values directly in a more type-safe and readable
way.
+ // LUCENENET specific - refactored constants for clarity
+ const int HEADER_POSITION = 3755;
+ const int MAX_VALID_LENGTH = 1000;
+
//using (RandomAccessFile dctFile = new
RandomAccessFile(dctFilePath, "r"))
using var dctFile = new FileStream(dctFilePath, FileMode.Open,
FileAccess.Read);
+ using var reader = new BinaryReader(dctFile);
// GB2312 characters 0 - 6768
- for (i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR +
CHAR_NUM_IN_FILE; i++)
+ for (int i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR +
CHAR_NUM_IN_FILE; i++)
{
- string currentStr = GetCCByGB2312Id(i);
- // if (i == 5231)
- // System.out.println(i);
- dctFile.Read(intBuffer, 0, intBuffer.Length);
- // the dictionary was developed for C, and byte order must be
converted to work with Java
- cnt =
ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian).GetInt32();
+ string currentStr = GetCCByGB2312Id(i);
+ int cnt;
+ try
+ {
+ cnt = reader.ReadInt32(); // LUCENENET: Use BinaryReader
methods instead of ByteBuffer
+ }
+ catch (EndOfStreamException)
+ {
+ // Reached end of file
+ break;
+ }
+
if (cnt <= 0)
{
continue;
}
- total += cnt;
- int j = 0;
- while (j < cnt)
+
+ for (int j = 0; j < cnt; j++)
{
- dctFile.Read(intBuffer, 0, intBuffer.Length);
- buffer[0] =
ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian)
- .GetInt32();// frequency
- dctFile.Read(intBuffer, 0, intBuffer.Length);
- buffer[1] =
ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian)
- .GetInt32();// length
- dctFile.Read(intBuffer, 0, intBuffer.Length);
- // buffer[2] = ByteBuffer.wrap(intBuffer).order(
- // ByteOrder.LITTLE_ENDIAN).getInt();// handle
-
- length = buffer[1];
- if (length > 0)
+ // LUCENENET: Use BinaryReader methods instead of
ByteBuffer
+ int frequency = reader.ReadInt32();
+ int length = reader.ReadInt32();
+ reader.ReadInt32(); // Skip handle value (unused)
+
+ if (length > 0 && length <= MAX_VALID_LENGTH &&
dctFile.Position + length <= dctFile.Length)
{
- byte[] lchBuffer = new byte[length];
- dctFile.Read(lchBuffer, 0, lchBuffer.Length);
+ byte[] lchBuffer = reader.ReadBytes(length); //
LUCENENET: Use BinaryReader methods instead of ByteBuffer
+
//tmpword = new String(lchBuffer, "GB2312");
- tmpword = gb2312Encoding.GetString(lchBuffer); //
LUCENENET specific: use cached encoding instance from base class
+ string tmpword = gb2312Encoding.GetString(lchBuffer);
// LUCENENET specific: use cached encoding instance from base class
//tmpword =
Encoding.GetEncoding("hz-gb-2312").GetString(lchBuffer);
- if (i != 3755 + GB2312_FIRST_CHAR)
+
+
+ if (i != HEADER_POSITION + GB2312_FIRST_CHAR)
{
tmpword = currentStr + tmpword;
}
- char[] carray = tmpword.ToCharArray();
+
+ ReadOnlySpan<char> carray = tmpword.AsSpan();
long hashId = Hash1(carray);
int index = GetAvaliableIndex(hashId, carray);
+
if (index != -1)
{
if (bigramHashTable[index] == 0)
{
bigramHashTable[index] = hashId;
// bigramStringTable[index] = tmpword;
+
}
- frequencyTable[index] += buffer[0];
+ frequencyTable[index] += frequency;
}
}
- j++;
}
}
- // log.info("load dictionary done! " + dctFilePath + " total:" +
total);
Review Comment:
Please leave upstream comments in place.
##########
src/Lucene.Net.Analysis.SmartCn/Hhmm/BigramDictionary.cs:
##########
@@ -254,80 +254,83 @@ private void Load(string dictRoot)
/// <summary>
/// Load the datafile into this <see cref="BigramDictionary"/>
/// </summary>
- /// <param name="dctFilePath">dctFilePath path to the Bigramdictionary
(bigramdict.dct)</param>
+ /// <param name="dctFilePath">Path to the Bigramdictionary
(bigramdict.dct)</param>
/// <exception cref="IOException">If there is a low-level I/O
error</exception>
public virtual void LoadFromFile(string dctFilePath)
{
- int i, cnt, length, total = 0;
// The file only counted 6763 Chinese characters plus 5 reserved
slots 3756~3760.
// The 3756th is used (as a header) to store information.
- int[]
- buffer = new int[3];
- byte[] intBuffer = new byte[4];
- string tmpword;
+
+ // LUCENENET: Removed buffer and intBuffer arrays since
BinaryReader handles reading values directly in a more type-safe and readable
way.
+ // LUCENENET specific - refactored constants for clarity
+ const int HEADER_POSITION = 3755;
+ const int MAX_VALID_LENGTH = 1000;
+
//using (RandomAccessFile dctFile = new
RandomAccessFile(dctFilePath, "r"))
using var dctFile = new FileStream(dctFilePath, FileMode.Open,
FileAccess.Read);
+ using var reader = new BinaryReader(dctFile);
// GB2312 characters 0 - 6768
- for (i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR +
CHAR_NUM_IN_FILE; i++)
+ for (int i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR +
CHAR_NUM_IN_FILE; i++)
{
- string currentStr = GetCCByGB2312Id(i);
- // if (i == 5231)
Review Comment:
Please leave upstream comments in place.
##########
src/Lucene.Net.Analysis.SmartCn/Hhmm/BigramDictionary.cs:
##########
@@ -254,80 +254,83 @@ private void Load(string dictRoot)
/// <summary>
/// Load the datafile into this <see cref="BigramDictionary"/>
/// </summary>
- /// <param name="dctFilePath">dctFilePath path to the Bigramdictionary
(bigramdict.dct)</param>
+ /// <param name="dctFilePath">Path to the Bigramdictionary
(bigramdict.dct)</param>
/// <exception cref="IOException">If there is a low-level I/O
error</exception>
public virtual void LoadFromFile(string dctFilePath)
{
- int i, cnt, length, total = 0;
// The file only counted 6763 Chinese characters plus 5 reserved
slots 3756~3760.
// The 3756th is used (as a header) to store information.
- int[]
- buffer = new int[3];
- byte[] intBuffer = new byte[4];
- string tmpword;
+
+ // LUCENENET: Removed buffer and intBuffer arrays since
BinaryReader handles reading values directly in a more type-safe and readable
way.
+ // LUCENENET specific - refactored constants for clarity
+ const int HEADER_POSITION = 3755;
+ const int MAX_VALID_LENGTH = 1000;
+
//using (RandomAccessFile dctFile = new
RandomAccessFile(dctFilePath, "r"))
using var dctFile = new FileStream(dctFilePath, FileMode.Open,
FileAccess.Read);
+ using var reader = new BinaryReader(dctFile);
// GB2312 characters 0 - 6768
- for (i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR +
CHAR_NUM_IN_FILE; i++)
+ for (int i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR +
CHAR_NUM_IN_FILE; i++)
{
- string currentStr = GetCCByGB2312Id(i);
- // if (i == 5231)
- // System.out.println(i);
- dctFile.Read(intBuffer, 0, intBuffer.Length);
- // the dictionary was developed for C, and byte order must be
converted to work with Java
- cnt =
ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian).GetInt32();
+ string currentStr = GetCCByGB2312Id(i);
+ int cnt;
+ try
+ {
+ cnt = reader.ReadInt32(); // LUCENENET: Use BinaryReader
methods instead of ByteBuffer
+ }
+ catch (EndOfStreamException)
Review Comment:
Please do not swallow exceptions that would be helpful for debugging if the
file format is incorrect.
##########
src/Lucene.Net.Analysis.SmartCn/Hhmm/WordDictionary.cs:
##########
@@ -340,77 +340,64 @@ private void SaveToObj(FileInfo serialObj)
/// <summary>
/// Load the datafile into this <see cref="WordDictionary"/>
/// </summary>
- /// <param name="dctFilePath">path to word dictionary
(coredict.dct)</param>
- /// <returns>number of words read</returns>
+ /// <param name="dctFilePath">Path to word dictionary
(coredict.dct)</param>
+ /// <returns>Number of words read</returns>
/// <exception cref="IOException">If there is a low-level I/O
error.</exception>
private int LoadMainDataFromFile(string dctFilePath)
{
- int i, cnt, length, total = 0;
- // The file only counted 6763 Chinese characters plus 5 reserved
slots 3756~3760.
+ int total = 0;
+
+ // The file only counted 6763 Chinese characters plus 5 reserved
slots (3756~3760).
// The 3756th is used (as a header) to store information.
- int[]
- buffer = new int[3];
- byte[] intBuffer = new byte[4];
- string tmpword;
+
+ // LUCENENET: Removed buffer and intBuffer arrays since
BinaryReader handles reading values directly in a more type-safe and readable
way.
+ // LUCENENET: Use BinaryReader to simplify endian conversion and
stream reading.
+
using (var dctFile = new FileStream(dctFilePath, FileMode.Open,
FileAccess.Read))
+ using (var reader = new BinaryReader(dctFile))
{
-
// GB2312 characters 0 - 6768
- for (i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR +
CHAR_NUM_IN_FILE; i++)
+ for (int i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR +
CHAR_NUM_IN_FILE; i++)
{
- // if (i == 5231)
- // System.out.println(i);
+ int cnt = reader.ReadInt32(); // LUCENENET: Use
BinaryReader methods instead of ByteBuffer
- dctFile.Read(intBuffer, 0, intBuffer.Length);
- // the dictionary was developed for C, and byte order must
be converted to work with Java
- cnt =
ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian).GetInt32();
if (cnt <= 0)
{
wordItem_charArrayTable[i] = null;
wordItem_frequencyTable[i] = null;
continue;
}
+
wordItem_charArrayTable[i] = new char[cnt][];
wordItem_frequencyTable[i] = new int[cnt];
total += cnt;
- int j = 0;
- while (j < cnt)
+
+ for (int j = 0; j < cnt; j++)
{
- // wordItemTable[i][j] = new WordItem();
- dctFile.Read(intBuffer, 0, intBuffer.Length);
- buffer[0] =
ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian)
- .GetInt32();// frequency
- dctFile.Read(intBuffer, 0, intBuffer.Length);
- buffer[1] =
ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian)
- .GetInt32();// length
- dctFile.Read(intBuffer, 0, intBuffer.Length);
- buffer[2] =
ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian)
- .GetInt32();// handle
-
- // wordItemTable[i][j].frequency = buffer[0];
- wordItem_frequencyTable[i][j] = buffer[0];
-
- length = buffer[1];
+ // LUCENENET: Use BinaryReader methods instead of
ByteBuffer
+ int frequency = reader.ReadInt32();
+ int length = reader.ReadInt32();
+ reader.ReadInt32(); // Skip handle (unused)
+
+ wordItem_frequencyTable[i][j] = frequency;
+
if (length > 0)
{
- byte[] lchBuffer = new byte[length];
- dctFile.Read(lchBuffer, 0, lchBuffer.Length);
- tmpword = gb2312Encoding.GetString(lchBuffer); //
LUCENENET specific: use cached encoding instance from base class
+ byte[] lchBuffer = reader.ReadBytes(length);
+ string tmpword =
gb2312Encoding.GetString(lchBuffer); // LUCENENET: Use cached encoding instance
from base class
wordItem_charArrayTable[i][j] =
tmpword.ToCharArray();
}
else
{
- // wordItemTable[i][j].charArray = null;
wordItem_charArrayTable[i][j] = null;
}
- // System.out.println(indexTable[i].wordItems[j]);
- j++;
Review Comment:
Please increment `j` here as it was done upstream.
##########
src/Lucene.Net.Analysis.SmartCn/Hhmm/AbstractDictionary.cs:
##########
@@ -162,7 +162,7 @@ public virtual long Hash1(char c)
/// </summary>
Review Comment:
At the top of this file, please change the declaration to throw on invalid
input, since all of the callers expect exceptions for the proper
handling.
```c#
protected static readonly Encoding gb2312Encoding =
Encoding.GetEncoding("GB2312",
EncoderFallback.ExceptionFallback, DecoderFallback.ExceptionFallback);
```
##########
src/Lucene.Net.Tests.Analysis.SmartCn/DictionaryTests.cs:
##########
@@ -0,0 +1,72 @@
+using Lucene.Net.Util;
+using Lucene.Net.Analysis.Cn.Smart.Hhmm;
+using Lucene.Net.Attributes;
+using NUnit.Framework;
+using System;
+using System.IO;
+using System.Reflection;
+
+
+[TestFixture]
+[LuceneNetSpecific]
+public class DictionaryTests : LuceneTestCase
+{
+ private const string BigramResourceName =
"Lucene.Net.Tests.Analysis.SmartCn.Resources.bigramdict.dct";
+
+ [Test, Category("Dictionary")]
+ public void TestBigramDictionary()
+ {
+ using var resourceStream = GetResourceStream(BigramResourceName);
+
+ FileInfo _tempFile = CreateTempFile("bigramdict", ".dct");
+ CopyStreamToFile(resourceStream, _tempFile);
+
+ Assert.IsTrue(_tempFile.Length > 0, "Temp file is empty.");
+
+ BigramDictionary bigramDict = BigramDictionary.GetInstance();
+ bigramDict.LoadFromFile(_tempFile.FullName);
+
+ Assert.AreEqual(10, bigramDict.GetFrequency("啊hello".AsSpan()),
"Frequency for '啊hello' is incorrect.");
+ Assert.AreEqual(20, bigramDict.GetFrequency("阿world".AsSpan()),
"Frequency for '阿world' is incorrect.");
+ }
+
+ [Test, Category("Dictionary")]
Review Comment:
Please remove the Category attribute.
##########
src/Lucene.Net.Analysis.SmartCn/Hhmm/BigramDictionary.cs:
##########
@@ -254,80 +254,83 @@ private void Load(string dictRoot)
/// <summary>
/// Load the datafile into this <see cref="BigramDictionary"/>
/// </summary>
- /// <param name="dctFilePath">dctFilePath path to the Bigramdictionary
(bigramdict.dct)</param>
+ /// <param name="dctFilePath">Path to the Bigramdictionary
(bigramdict.dct)</param>
/// <exception cref="IOException">If there is a low-level I/O
error</exception>
public virtual void LoadFromFile(string dctFilePath)
{
- int i, cnt, length, total = 0;
// The file only counted 6763 Chinese characters plus 5 reserved
slots 3756~3760.
// The 3756th is used (as a header) to store information.
- int[]
- buffer = new int[3];
- byte[] intBuffer = new byte[4];
- string tmpword;
+
+ // LUCENENET: Removed buffer and intBuffer arrays since
BinaryReader handles reading values directly in a more type-safe and readable
way.
+ // LUCENENET specific - refactored constants for clarity
+ const int HEADER_POSITION = 3755;
+ const int MAX_VALID_LENGTH = 1000;
+
//using (RandomAccessFile dctFile = new
RandomAccessFile(dctFilePath, "r"))
using var dctFile = new FileStream(dctFilePath, FileMode.Open,
FileAccess.Read);
+ using var reader = new BinaryReader(dctFile);
// GB2312 characters 0 - 6768
- for (i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR +
CHAR_NUM_IN_FILE; i++)
+ for (int i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR +
CHAR_NUM_IN_FILE; i++)
{
- string currentStr = GetCCByGB2312Id(i);
- // if (i == 5231)
- // System.out.println(i);
- dctFile.Read(intBuffer, 0, intBuffer.Length);
- // the dictionary was developed for C, and byte order must be
converted to work with Java
- cnt =
ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian).GetInt32();
+ string currentStr = GetCCByGB2312Id(i);
+ int cnt;
+ try
+ {
+ cnt = reader.ReadInt32(); // LUCENENET: Use BinaryReader
methods instead of ByteBuffer
+ }
+ catch (EndOfStreamException)
+ {
+ // Reached end of file
+ break;
+ }
+
if (cnt <= 0)
{
continue;
}
- total += cnt;
- int j = 0;
- while (j < cnt)
+
+ for (int j = 0; j < cnt; j++)
{
- dctFile.Read(intBuffer, 0, intBuffer.Length);
- buffer[0] =
ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian)
- .GetInt32();// frequency
- dctFile.Read(intBuffer, 0, intBuffer.Length);
- buffer[1] =
ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian)
- .GetInt32();// length
- dctFile.Read(intBuffer, 0, intBuffer.Length);
- // buffer[2] = ByteBuffer.wrap(intBuffer).order(
- // ByteOrder.LITTLE_ENDIAN).getInt();// handle
-
- length = buffer[1];
- if (length > 0)
+ // LUCENENET: Use BinaryReader methods instead of
ByteBuffer
+ int frequency = reader.ReadInt32();
+ int length = reader.ReadInt32();
+ reader.ReadInt32(); // Skip handle value (unused)
Review Comment:
Please set the handle to `buffer[2]` as it was done upstream.
##########
src/Lucene.Net.Tests.Analysis.SmartCn/DictionaryTests.cs:
##########
@@ -0,0 +1,72 @@
+using Lucene.Net.Util;
+using Lucene.Net.Analysis.Cn.Smart.Hhmm;
+using Lucene.Net.Attributes;
+using NUnit.Framework;
+using System;
+using System.IO;
+using System.Reflection;
+
+
+[TestFixture]
+[LuceneNetSpecific]
+public class DictionaryTests : LuceneTestCase
+{
+ private const string BigramResourceName =
"Lucene.Net.Tests.Analysis.SmartCn.Resources.bigramdict.dct";
+
+ [Test, Category("Dictionary")]
Review Comment:
Please remove the Category attribute.
##########
src/Lucene.Net.Tests.Analysis.SmartCn/DictionaryTests.cs:
##########
@@ -0,0 +1,72 @@
+using Lucene.Net.Util;
+using Lucene.Net.Analysis.Cn.Smart.Hhmm;
+using Lucene.Net.Attributes;
+using NUnit.Framework;
+using System;
+using System.IO;
+using System.Reflection;
+
+
Review Comment:
Please put these tests in the `Lucene.Net.Analysis.Cn.Smart.Hhmm` namespace.
Since this file doesn't exist upstream, it should be moved into a subfolder
named `Support/Hhmm`.
##########
src/Lucene.Net.Analysis.SmartCn/Hhmm/WordDictionary.cs:
##########
@@ -340,77 +340,64 @@ private void SaveToObj(FileInfo serialObj)
/// <summary>
/// Load the datafile into this <see cref="WordDictionary"/>
/// </summary>
- /// <param name="dctFilePath">path to word dictionary
(coredict.dct)</param>
- /// <returns>number of words read</returns>
+ /// <param name="dctFilePath">Path to word dictionary
(coredict.dct)</param>
+ /// <returns>Number of words read</returns>
/// <exception cref="IOException">If there is a low-level I/O
error.</exception>
private int LoadMainDataFromFile(string dctFilePath)
{
- int i, cnt, length, total = 0;
- // The file only counted 6763 Chinese characters plus 5 reserved
slots 3756~3760.
+ int total = 0;
+
+ // The file only counted 6763 Chinese characters plus 5 reserved
slots (3756~3760).
// The 3756th is used (as a header) to store information.
- int[]
- buffer = new int[3];
- byte[] intBuffer = new byte[4];
- string tmpword;
+
+ // LUCENENET: Removed buffer and intBuffer arrays since
BinaryReader handles reading values directly in a more type-safe and readable
way.
+ // LUCENENET: Use BinaryReader to simplify endian conversion and
stream reading.
+
using (var dctFile = new FileStream(dctFilePath, FileMode.Open,
FileAccess.Read))
+ using (var reader = new BinaryReader(dctFile))
{
-
// GB2312 characters 0 - 6768
- for (i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR +
CHAR_NUM_IN_FILE; i++)
+ for (int i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR +
CHAR_NUM_IN_FILE; i++)
{
- // if (i == 5231)
- // System.out.println(i);
+ int cnt = reader.ReadInt32(); // LUCENENET: Use
BinaryReader methods instead of ByteBuffer
Review Comment:
Please include the reason why we changed to use BinaryReader.
```c#
// LUCENENET: Use BinaryReader to decode little endian instead of
ByteBuffer, since this is the default in .NET
```
##########
src/Lucene.Net.Tests.Analysis.SmartCn/DictionaryTests.cs:
##########
@@ -0,0 +1,72 @@
+using Lucene.Net.Util;
+using Lucene.Net.Analysis.Cn.Smart.Hhmm;
+using Lucene.Net.Attributes;
+using NUnit.Framework;
+using System;
+using System.IO;
+using System.Reflection;
+
+
+[TestFixture]
+[LuceneNetSpecific]
+public class DictionaryTests : LuceneTestCase
+{
+ private const string BigramResourceName =
"Lucene.Net.Tests.Analysis.SmartCn.Resources.bigramdict.dct";
+
+ [Test, Category("Dictionary")]
+ public void TestBigramDictionary()
+ {
+ using var resourceStream = GetResourceStream(BigramResourceName);
+
+ FileInfo _tempFile = CreateTempFile("bigramdict", ".dct");
+ CopyStreamToFile(resourceStream, _tempFile);
+
+ Assert.IsTrue(_tempFile.Length > 0, "Temp file is empty.");
+
+ BigramDictionary bigramDict = BigramDictionary.GetInstance();
+ bigramDict.LoadFromFile(_tempFile.FullName);
+
+ Assert.AreEqual(10, bigramDict.GetFrequency("啊hello".AsSpan()),
"Frequency for '啊hello' is incorrect.");
+ Assert.AreEqual(20, bigramDict.GetFrequency("阿world".AsSpan()),
"Frequency for '阿world' is incorrect.");
+ }
+
+ [Test, Category("Dictionary")]
+ public void TestWordDictionaryGetInstance()
+ {
+ WordDictionary wordDict = WordDictionary.GetInstance();
+
+ Assert.NotNull(wordDict, "WordDictionary.GetInstance() returned
null.");
Review Comment:
Note that it is impossible for `WordDictionary.GetInstance()` to return
`null`, so this assert is unnecessary.
##########
src/Lucene.Net.Analysis.SmartCn/Hhmm/WordDictionary.cs:
##########
@@ -340,77 +340,64 @@ private void SaveToObj(FileInfo serialObj)
/// <summary>
/// Load the datafile into this <see cref="WordDictionary"/>
/// </summary>
- /// <param name="dctFilePath">path to word dictionary
(coredict.dct)</param>
- /// <returns>number of words read</returns>
+ /// <param name="dctFilePath">Path to word dictionary
(coredict.dct)</param>
+ /// <returns>Number of words read</returns>
/// <exception cref="IOException">If there is a low-level I/O
error.</exception>
private int LoadMainDataFromFile(string dctFilePath)
{
- int i, cnt, length, total = 0;
- // The file only counted 6763 Chinese characters plus 5 reserved
slots 3756~3760.
+ int total = 0;
+
+ // The file only counted 6763 Chinese characters plus 5 reserved
slots (3756~3760).
// The 3756th is used (as a header) to store information.
- int[]
- buffer = new int[3];
- byte[] intBuffer = new byte[4];
- string tmpword;
+
+ // LUCENENET: Removed buffer and intBuffer arrays since
BinaryReader handles reading values directly in a more type-safe and readable
way.
+ // LUCENENET: Use BinaryReader to simplify endian conversion and
stream reading.
+
using (var dctFile = new FileStream(dctFilePath, FileMode.Open,
FileAccess.Read))
+ using (var reader = new BinaryReader(dctFile))
{
-
// GB2312 characters 0 - 6768
- for (i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR +
CHAR_NUM_IN_FILE; i++)
+ for (int i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR +
CHAR_NUM_IN_FILE; i++)
{
- // if (i == 5231)
- // System.out.println(i);
+ int cnt = reader.ReadInt32(); // LUCENENET: Use
BinaryReader methods instead of ByteBuffer
- dctFile.Read(intBuffer, 0, intBuffer.Length);
- // the dictionary was developed for C, and byte order must
be converted to work with Java
- cnt =
ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian).GetInt32();
if (cnt <= 0)
{
wordItem_charArrayTable[i] = null;
wordItem_frequencyTable[i] = null;
continue;
}
+
wordItem_charArrayTable[i] = new char[cnt][];
wordItem_frequencyTable[i] = new int[cnt];
total += cnt;
- int j = 0;
- while (j < cnt)
+
+ for (int j = 0; j < cnt; j++)
{
- // wordItemTable[i][j] = new WordItem();
- dctFile.Read(intBuffer, 0, intBuffer.Length);
- buffer[0] =
ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian)
- .GetInt32();// frequency
- dctFile.Read(intBuffer, 0, intBuffer.Length);
- buffer[1] =
ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian)
- .GetInt32();// length
- dctFile.Read(intBuffer, 0, intBuffer.Length);
- buffer[2] =
ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian)
- .GetInt32();// handle
-
- // wordItemTable[i][j].frequency = buffer[0];
- wordItem_frequencyTable[i][j] = buffer[0];
-
- length = buffer[1];
+ // LUCENENET: Use BinaryReader methods instead of
ByteBuffer
Review Comment:
Please include the reason why we changed to use BinaryReader.
```c#
// LUCENENET: Use BinaryReader to decode little endian instead of
ByteBuffer, since this is the default in .NET
```
##########
src/Lucene.Net.Tests.Analysis.SmartCn/DictionaryTests.cs:
##########
@@ -0,0 +1,72 @@
+using Lucene.Net.Util;
+using Lucene.Net.Analysis.Cn.Smart.Hhmm;
+using Lucene.Net.Attributes;
+using NUnit.Framework;
+using System;
+using System.IO;
+using System.Reflection;
+
+
+[TestFixture]
+[LuceneNetSpecific]
+public class DictionaryTests : LuceneTestCase
+{
+ private const string BigramResourceName =
"Lucene.Net.Tests.Analysis.SmartCn.Resources.bigramdict.dct";
+
+ [Test, Category("Dictionary")]
+ public void TestBigramDictionary()
+ {
+ using var resourceStream = GetResourceStream(BigramResourceName);
+
+ FileInfo _tempFile = CreateTempFile("bigramdict", ".dct");
+ CopyStreamToFile(resourceStream, _tempFile);
+
+ Assert.IsTrue(_tempFile.Length > 0, "Temp file is empty.");
+
+ BigramDictionary bigramDict = BigramDictionary.GetInstance();
+ bigramDict.LoadFromFile(_tempFile.FullName);
+
+ Assert.AreEqual(10, bigramDict.GetFrequency("啊hello".AsSpan()),
"Frequency for '啊hello' is incorrect.");
+ Assert.AreEqual(20, bigramDict.GetFrequency("阿world".AsSpan()),
"Frequency for '阿world' is incorrect.");
+ }
+
+ [Test, Category("Dictionary")]
+ public void TestWordDictionaryGetInstance()
+ {
+ WordDictionary wordDict = WordDictionary.GetInstance();
Review Comment:
Please prepare the temp file for loading the same way that was done in the
example for `TestBigramDictionary()`.
##########
src/Lucene.Net.Tests.Analysis.SmartCn/DictionaryTests.cs:
##########
@@ -0,0 +1,72 @@
+using Lucene.Net.Util;
+using Lucene.Net.Analysis.Cn.Smart.Hhmm;
+using Lucene.Net.Attributes;
+using NUnit.Framework;
+using System;
+using System.IO;
+using System.Reflection;
+
+
+[TestFixture]
+[LuceneNetSpecific]
+public class DictionaryTests : LuceneTestCase
Review Comment:
Please set up the directory and name the files with the correct conventions
so they are loaded using the existing business logic. This gives us better test
coverage than using a custom file name and loading it inside of the test.
Also, it is important to consistently use the `CreateTempDir()` and
`CreateTempFile()` methods of `LuceneTestCase`, since they are already set up
to delete the temp files when the test is finished running. In this case, we
should use `CreateTempDir()` so we can update the
`AnalyzerProfile.ANALYSIS_DATA_DIR` at the beginning of the test with this
temporary location. Then all we need to do is put temp files named
`bigramdict.dct` and `coredict.dct` into that location for the corresponding
classes to load them.
To get the resource stream, please use
`this.GetType().FindAndGetResourceStream("bigramdict.dct");`. For that to work,
the `bigramdict.dct` file must be in the same directory as this file in the
project. It must not be in a subdirectory named `Resources`. So, these files
should be placed in the `Support/Hhmm` folder, along with `DictionaryTests.cs`.
`FindAndGetResourceStream()` is an extension method in the `J2N` namespace that
mimics the classpath functionality in Java for embedded resource files, which
makes them relative to the class structure.
```c#
private const string BigramFileName = "bigramdict.dct";
private DirectoryInfo tempDir;
public override void OneTimeSetUp()
{
tempDir = CreateTempDir("smartcn-data");
AnalyzerProfile.ANALYSIS_DATA_DIR = tempDir.FullName;
}
[Test]
public void TestBigramDictionary()
{
using var resourceStream =
this.GetType().FindAndGetResourceStream(BigramFileName);
string tempFile = Path.Combine(tempDir.FullName, BigramFileName);
using (var tempStream = File.Create(tempFile))
{
resourceStream.CopyTo(tempStream);
Assert.IsTrue(tempStream.Length > 0, "Temp file is empty.");
}
BigramDictionary bigramDict = BigramDictionary.GetInstance();
Assert.AreEqual(10, bigramDict.GetFrequency("啊hello".AsSpan()),
"Frequency for '啊hello' is incorrect.");
Assert.AreEqual(20, bigramDict.GetFrequency("阿world".AsSpan()),
"Frequency for '阿world' is incorrect.");
}
```
##########
src/Lucene.Net.Analysis.SmartCn/Lucene.Net.Analysis.SmartCn.csproj:
##########
@@ -64,4 +64,9 @@
<PackageReference Include="System.Text.Encoding.CodePages"
Version="$(SystemTextEncodingCodePagesPackageVersion)" />
</ItemGroup>
+ <ItemGroup>
+ <InternalsVisibleTo Include="Lucene.Net.Tests.Analysis.SmartCn" />
Review Comment:
Please indent the ItemGroup and InternalsVisibleTo (using spaces)
appropriately.
##########
src/Lucene.Net.Tests.Analysis.SmartCn/DictionaryTests.cs:
##########
@@ -0,0 +1,72 @@
+using Lucene.Net.Util;
+using Lucene.Net.Analysis.Cn.Smart.Hhmm;
+using Lucene.Net.Attributes;
+using NUnit.Framework;
+using System;
+using System.IO;
+using System.Reflection;
+
+
+[TestFixture]
+[LuceneNetSpecific]
+public class DictionaryTests : LuceneTestCase
+{
+ private const string BigramResourceName =
"Lucene.Net.Tests.Analysis.SmartCn.Resources.bigramdict.dct";
+
+ [Test, Category("Dictionary")]
+ public void TestBigramDictionary()
+ {
+ using var resourceStream = GetResourceStream(BigramResourceName);
+
+ FileInfo _tempFile = CreateTempFile("bigramdict", ".dct");
+ CopyStreamToFile(resourceStream, _tempFile);
+
+ Assert.IsTrue(_tempFile.Length > 0, "Temp file is empty.");
+
+ BigramDictionary bigramDict = BigramDictionary.GetInstance();
+ bigramDict.LoadFromFile(_tempFile.FullName);
+
+ Assert.AreEqual(10, bigramDict.GetFrequency("啊hello".AsSpan()),
"Frequency for '啊hello' is incorrect.");
+ Assert.AreEqual(20, bigramDict.GetFrequency("阿world".AsSpan()),
"Frequency for '阿world' is incorrect.");
+ }
+
+ [Test, Category("Dictionary")]
+ public void TestWordDictionaryGetInstance()
+ {
+ WordDictionary wordDict = WordDictionary.GetInstance();
+
+ Assert.NotNull(wordDict, "WordDictionary.GetInstance() returned
null.");
+
Review Comment:
Please add additional assertions to ensure the data loaded correctly.
##########
src/Lucene.Net.Analysis.SmartCn/Hhmm/BigramDictionary.cs:
##########
@@ -254,80 +254,83 @@ private void Load(string dictRoot)
/// <summary>
/// Load the datafile into this <see cref="BigramDictionary"/>
/// </summary>
- /// <param name="dctFilePath">dctFilePath path to the Bigramdictionary
(bigramdict.dct)</param>
+ /// <param name="dctFilePath">Path to the Bigramdictionary
(bigramdict.dct)</param>
/// <exception cref="IOException">If there is a low-level I/O
error</exception>
public virtual void LoadFromFile(string dctFilePath)
{
- int i, cnt, length, total = 0;
// The file only counted 6763 Chinese characters plus 5 reserved
slots 3756~3760.
// The 3756th is used (as a header) to store information.
- int[]
- buffer = new int[3];
- byte[] intBuffer = new byte[4];
- string tmpword;
+
+ // LUCENENET: Removed buffer and intBuffer arrays since
BinaryReader handles reading values directly in a more type-safe and readable
way.
+ // LUCENENET specific - refactored constants for clarity
+ const int HEADER_POSITION = 3755;
+ const int MAX_VALID_LENGTH = 1000;
+
//using (RandomAccessFile dctFile = new
RandomAccessFile(dctFilePath, "r"))
using var dctFile = new FileStream(dctFilePath, FileMode.Open,
FileAccess.Read);
+ using var reader = new BinaryReader(dctFile);
// GB2312 characters 0 - 6768
- for (i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR +
CHAR_NUM_IN_FILE; i++)
+ for (int i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR +
CHAR_NUM_IN_FILE; i++)
{
- string currentStr = GetCCByGB2312Id(i);
- // if (i == 5231)
- // System.out.println(i);
- dctFile.Read(intBuffer, 0, intBuffer.Length);
- // the dictionary was developed for C, and byte order must be
converted to work with Java
- cnt =
ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian).GetInt32();
+ string currentStr = GetCCByGB2312Id(i);
+ int cnt;
+ try
+ {
+ cnt = reader.ReadInt32(); // LUCENENET: Use BinaryReader
methods instead of ByteBuffer
Review Comment:
Please include the reason why we changed to use BinaryReader.
```c#
// LUCENENET: Use BinaryReader to decode little endian instead of
ByteBuffer, since this is the default in .NET
```
##########
src/Lucene.Net.Analysis.SmartCn/Hhmm/BigramDictionary.cs:
##########
@@ -254,80 +254,83 @@ private void Load(string dictRoot)
/// <summary>
/// Load the datafile into this <see cref="BigramDictionary"/>
/// </summary>
- /// <param name="dctFilePath">dctFilePath path to the Bigramdictionary
(bigramdict.dct)</param>
+ /// <param name="dctFilePath">Path to the Bigramdictionary
(bigramdict.dct)</param>
/// <exception cref="IOException">If there is a low-level I/O
error</exception>
public virtual void LoadFromFile(string dctFilePath)
{
- int i, cnt, length, total = 0;
// The file only counted 6763 Chinese characters plus 5 reserved
slots 3756~3760.
// The 3756th is used (as a header) to store information.
- int[]
- buffer = new int[3];
- byte[] intBuffer = new byte[4];
- string tmpword;
+
+ // LUCENENET: Removed buffer and intBuffer arrays since
BinaryReader handles reading values directly in a more type-safe and readable
way.
+ // LUCENENET specific - refactored constants for clarity
+ const int HEADER_POSITION = 3755;
+ const int MAX_VALID_LENGTH = 1000;
+
//using (RandomAccessFile dctFile = new
RandomAccessFile(dctFilePath, "r"))
using var dctFile = new FileStream(dctFilePath, FileMode.Open,
FileAccess.Read);
+ using var reader = new BinaryReader(dctFile);
// GB2312 characters 0 - 6768
- for (i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR +
CHAR_NUM_IN_FILE; i++)
+ for (int i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR +
CHAR_NUM_IN_FILE; i++)
{
- string currentStr = GetCCByGB2312Id(i);
- // if (i == 5231)
- // System.out.println(i);
- dctFile.Read(intBuffer, 0, intBuffer.Length);
- // the dictionary was developed for C, and byte order must be
converted to work with Java
- cnt =
ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian).GetInt32();
+ string currentStr = GetCCByGB2312Id(i);
+ int cnt;
+ try
+ {
+ cnt = reader.ReadInt32(); // LUCENENET: Use BinaryReader
methods instead of ByteBuffer
+ }
+ catch (EndOfStreamException)
+ {
+ // Reached end of file
+ break;
+ }
+
if (cnt <= 0)
{
continue;
}
- total += cnt;
- int j = 0;
- while (j < cnt)
+
+ for (int j = 0; j < cnt; j++)
{
- dctFile.Read(intBuffer, 0, intBuffer.Length);
- buffer[0] =
ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian)
- .GetInt32();// frequency
- dctFile.Read(intBuffer, 0, intBuffer.Length);
- buffer[1] =
ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian)
- .GetInt32();// length
- dctFile.Read(intBuffer, 0, intBuffer.Length);
- // buffer[2] = ByteBuffer.wrap(intBuffer).order(
- // ByteOrder.LITTLE_ENDIAN).getInt();// handle
-
- length = buffer[1];
- if (length > 0)
+ // LUCENENET: Use BinaryReader methods instead of
ByteBuffer
+ int frequency = reader.ReadInt32();
+ int length = reader.ReadInt32();
+ reader.ReadInt32(); // Skip handle value (unused)
+
+ if (length > 0 && length <= MAX_VALID_LENGTH &&
dctFile.Position + length <= dctFile.Length)
{
- byte[] lchBuffer = new byte[length];
- dctFile.Read(lchBuffer, 0, lchBuffer.Length);
+ byte[] lchBuffer = reader.ReadBytes(length); //
LUCENENET: Use BinaryReader methods instead of ByteBuffer
Review Comment:
Please include the reason why we changed to use BinaryReader.
```c#
// LUCENENET: Use BinaryReader to decode little endian instead of
ByteBuffer, since this is the default in .NET
```
##########
src/Lucene.Net.Analysis.SmartCn/Hhmm/BigramDictionary.cs:
##########
@@ -254,80 +254,83 @@ private void Load(string dictRoot)
/// <summary>
/// Load the datafile into this <see cref="BigramDictionary"/>
/// </summary>
- /// <param name="dctFilePath">dctFilePath path to the Bigramdictionary
(bigramdict.dct)</param>
+ /// <param name="dctFilePath">Path to the Bigramdictionary
(bigramdict.dct)</param>
/// <exception cref="IOException">If there is a low-level I/O
error</exception>
public virtual void LoadFromFile(string dctFilePath)
{
- int i, cnt, length, total = 0;
// The file only counted 6763 Chinese characters plus 5 reserved
slots 3756~3760.
// The 3756th is used (as a header) to store information.
- int[]
- buffer = new int[3];
- byte[] intBuffer = new byte[4];
- string tmpword;
+
+ // LUCENENET: Removed buffer and intBuffer arrays since
BinaryReader handles reading values directly in a more type-safe and readable
way.
+ // LUCENENET specific - refactored constants for clarity
+ const int HEADER_POSITION = 3755;
Review Comment:
It looks like there is a problem here, since the original code used 3756
instead of 3755. I could be wrong, though. This might be due to the change in
loop format or it may be an incorrect comment in Java. We should ensure our
format exactly matches Lucene, as these files should be portable between .NET
and Java. Do note that there is a file in `analysis-data.zip`
[here](https://issues.apache.org/jira/browse/LUCENE-1629) that can be used to
check whether we still support the original format.
Whatever the case, this line should be updated either with the correct
number or a comment explaining why/how the upstream code was wrong.
##########
src/Lucene.Net.Tests.Analysis.SmartCn/Lucene.Net.Tests.Analysis.SmartCn.csproj:
##########
@@ -62,5 +62,10 @@
<ItemGroup Condition=" '$(TargetFramework)' == 'net472' ">
<PackageReference Include="System.Text.Encoding.CodePages"
Version="$(SystemTextEncodingCodePagesPackageVersion)" />
</ItemGroup>
+ <ItemGroup>
+ <EmbeddedResource Include="Resources\bigramdict.dct" />
Review Comment:
Please expand this to include all files with these extensions, as was done
in the other analysis packages. Also, please place this element above the other
`ItemGroup` elements as was done in the morfologik and kuromoji projects and
ensure it is properly indented (with 2 spaces per level).
```xml
<ItemGroup>
<EmbeddedResource Include="**/*.dct" Exclude="bin/**/*;obj/**/*"
Label="Dictionary Test Data" />
</ItemGroup>
```
> Note that these files will need to be moved into the `Support/Hhmm` folder
along with the tests.
##########
src/Lucene.Net.Analysis.SmartCn/Hhmm/BigramDictionary.cs:
##########
@@ -254,80 +254,83 @@ private void Load(string dictRoot)
/// <summary>
/// Load the datafile into this <see cref="BigramDictionary"/>
/// </summary>
- /// <param name="dctFilePath">dctFilePath path to the Bigramdictionary
(bigramdict.dct)</param>
+ /// <param name="dctFilePath">Path to the Bigramdictionary
(bigramdict.dct)</param>
/// <exception cref="IOException">If there is a low-level I/O
error</exception>
public virtual void LoadFromFile(string dctFilePath)
{
- int i, cnt, length, total = 0;
// The file only counted 6763 Chinese characters plus 5 reserved
slots 3756~3760.
// The 3756th is used (as a header) to store information.
- int[]
- buffer = new int[3];
- byte[] intBuffer = new byte[4];
- string tmpword;
+
+ // LUCENENET: Removed buffer and intBuffer arrays since
BinaryReader handles reading values directly in a more type-safe and readable
way.
+ // LUCENENET specific - refactored constants for clarity
+ const int HEADER_POSITION = 3755;
+ const int MAX_VALID_LENGTH = 1000;
+
//using (RandomAccessFile dctFile = new
RandomAccessFile(dctFilePath, "r"))
using var dctFile = new FileStream(dctFilePath, FileMode.Open,
FileAccess.Read);
+ using var reader = new BinaryReader(dctFile);
// GB2312 characters 0 - 6768
- for (i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR +
CHAR_NUM_IN_FILE; i++)
+ for (int i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR +
CHAR_NUM_IN_FILE; i++)
{
- string currentStr = GetCCByGB2312Id(i);
- // if (i == 5231)
- // System.out.println(i);
- dctFile.Read(intBuffer, 0, intBuffer.Length);
- // the dictionary was developed for C, and byte order must be
converted to work with Java
- cnt =
ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian).GetInt32();
+ string currentStr = GetCCByGB2312Id(i);
+ int cnt;
+ try
+ {
+ cnt = reader.ReadInt32(); // LUCENENET: Use BinaryReader
methods instead of ByteBuffer
+ }
+ catch (EndOfStreamException)
+ {
+ // Reached end of file
+ break;
+ }
+
if (cnt <= 0)
{
continue;
}
- total += cnt;
- int j = 0;
- while (j < cnt)
+
+ for (int j = 0; j < cnt; j++)
{
- dctFile.Read(intBuffer, 0, intBuffer.Length);
- buffer[0] =
ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian)
- .GetInt32();// frequency
- dctFile.Read(intBuffer, 0, intBuffer.Length);
- buffer[1] =
ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LittleEndian)
- .GetInt32();// length
- dctFile.Read(intBuffer, 0, intBuffer.Length);
- // buffer[2] = ByteBuffer.wrap(intBuffer).order(
- // ByteOrder.LITTLE_ENDIAN).getInt();// handle
-
- length = buffer[1];
- if (length > 0)
+ // LUCENENET: Use BinaryReader methods instead of
ByteBuffer
Review Comment:
Please include the reason why we changed to use BinaryReader.
```c#
// LUCENENET: Use BinaryReader to decode little endian instead of
ByteBuffer, since this is the default in .NET
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]