[ 
https://issues.apache.org/jira/browse/HDFS-8855?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=14654107#comment-14654107
 ] 

Bob Hansen commented on HDFS-8855:
----------------------------------

Reproducer script:

{code}
#!/bin/bash
# Reproducer for HDFS-8855: webhdfs client leaks active NameNode connections.
# Streams a large file into HDFS, then hammers it with concurrent random
# webhdfs reads while sampling socket state on the NameNode.

# Check that the hadoop command is available.
if ! hadoop fs -help > /dev/null 2>&1; then
        echo "The hadoop command must be in your path" >&2
        exit 1
fi

# Tunables — every one may be overridden from the environment.
# op=OPEN, offset and length query parameters are appended to url_base.
file_size=${file_size:-$(( 1024 * 1024 * 1024 ))}   # bytes in the test file
count=${count:-1000000}                             # total random reads
reads_per_pass=${reads_per_pass:-1000}              # reads per curl config file
webhdfs_namenode=${webhdfs_namenode:-"localhost:50070"}
read_size=${read_size:-64000}                       # bytes per read
concurrent_reads=${concurrent_reads:-50}            # parallel curl processes

# Derived values.
url_base="http://${webhdfs_namenode}/webhdfs/v1/tmp/bigfile_$$"
passes=$(( count / reads_per_pass ))
url_list_file="/tmp/file_list_$$.txt"
# Host to ssh into for socket-state sampling: the host part of url_base
# (strip the http:// scheme and the :port suffix) unless given explicitly.
namenode=${namenode:-$(echo "$url_base" | grep -Po "(?<=http://)[^:/]*")}

# Report the effective configuration so runs are reproducible.
echo "Environment settings:"
echo "  file_size=$file_size"
echo "  count=$count"
echo "  reads_per_pass=$reads_per_pass"
echo "  webhdfs_namenode=$webhdfs_namenode"
echo "  read_size=$read_size"
echo "  concurrent_reads=$concurrent_reads"
echo "Outputs in /tmp/curl_[out|err]_$$"
echo "Computed values:"
echo "  url_base=$url_base"
echo "  passes=$passes"
echo "  url_list_file=$url_list_file"
echo "  namenode=$namenode"
echo

echo "Copying temp data..."
# Round the file size up to whole 1 KiB blocks, then stream that many blocks
# of '+' characters into HDFS as a single pipeline.
blocks_to_copy=$(( (file_size + 1023) / 1024 ))
dd count="$blocks_to_copy" bs=1024 if=/dev/zero | tr "\0" "+" | \
  hadoop fs -copyFromLocal - "/tmp/bigfile_$$"

echo "Generating URL list..."
# Generate the load profile: a curl config file (-K format, one `url = "..."`
# line per read) holding $reads_per_pass random in-range reads of the file.
rm -f "$url_list_file"
for j in $(seq 1 "$reads_per_pass"); do
  # Four random bytes from /dev/urandom as an unsigned integer.
  rand=$(od -N 4 -t uL -An /dev/urandom | tr -d " ")
  # Pick a random read_size-aligned offset that keeps the read inside the file.
  offset=$(( (rand % (file_size / read_size)) * read_size ))
  url="$url_base?op=OPEN&user.name=$USER&offset=$offset&length=$read_size"
  echo "url = \"$url\"" >> "$url_list_file"
done

# Open $concurrent_reads connections per pass, each performing the
# $reads_per_pass random reads of $read_size bytes from the URL list.
for i in $(seq 1 "$passes"); do
  # Kick off concurrent random reads (-K: read the URL list as a curl config).
  for k in $(seq 1 "$concurrent_reads"); do
        curl -v -L -K "$url_list_file" \
          > "/tmp/curl_out_$$-$k.txt" 2> "/tmp/curl_err_$$-$k.txt" &
  done

  # Wait for all curl jobs to finish.
  while [ "$(jobs | grep -c "Running.*curl")" != 0 ]; do
    sleep 1s

    # Every second, summarize socket states on the namenode host.  Escaped
    # expansions (\$file, \`...\`) are evaluated remotely, not locally.
        ssh "$namenode" "file=/tmp/netstat.out_\$\$ ; netstat -an > \$file ; \
          echo -n 'ESTABLISHED: '; echo -n \`grep -c ESTABLISHED \$file\` ; \
          echo -n '  TIME_WAIT: '; echo -n \`grep -c TIME_WAIT \$file\` ; \
          echo -n '  CLOSE_WAIT: '; grep -c CLOSE_WAIT \$file; rm \$file" &
        # Running tally: non-2xx/3xx responses vs. HTTP 200s seen so far.
        echo "$(grep "HTTP/1.1 [^23]" /tmp/curl_err_$$-* | wc -l) errors," \
             "$(grep "HTTP/1.1 200" /tmp/curl_err_$$-* | wc -l) successes"
  done

  # Display the completion time of this pass.
  echo -n "Pass $i   "
  date +%H:%M:%S.%N
  echo "Total: $(grep "HTTP/1.1 [^23]" /tmp/curl_err_$$-* | wc -l) errors," \
       "$(grep "HTTP/1.1 200" /tmp/curl_err_$$-* | wc -l) successes"

#  sleep $delay
done
{code}

> Webhdfs client leaks active NameNode connections
> ------------------------------------------------
>
>                 Key: HDFS-8855
>                 URL: https://issues.apache.org/jira/browse/HDFS-8855
>             Project: Hadoop HDFS
>          Issue Type: Bug
>          Components: webhdfs
>         Environment: HDP 2.2
>            Reporter: Bob Hansen
>
> The attached script simulates a process opening ~50 files via webhdfs and 
> performing random reads.  Note that there are at most 50 concurrent reads, 
> and all webhdfs sessions are kept open.  Each read is ~64k at a random 
> position.  
> The script periodically (once per second) shells into the NameNode and 
> produces a summary of the socket states.  For my test cluster with 5 nodes, 
> it took ~30 seconds for the NameNode to reach ~25000 active connections and 
> fail.
> It appears that each request to the webhdfs client is opening a new 
> connection to the NameNode and keeping it open after the request is complete. 
>  If the process continues to run, eventually (~30-60 seconds), all of the 
> open connections are closed and the NameNode recovers.  
> This smells like SoftReference reaping.  Are we using SoftReferences in the 
> webhdfs client to cache NameNode connections but never re-using them?



--
This message was sent by Atlassian JIRA
(v6.3.4#6332)

Reply via email to