$ awk 'BEGIN{len=0;}{if(length($0)>len){len=length($0);printf("%i : %s\n",len,$0);}}' /usr/share/wordlists/rockyou.txt 6 : 123456 9 : 123456789 10 : 1234567890 11 : christopher 13 : tequieromucho 16 : manchesterunited 17 : mychemicalromance 18 : 123456789123456789 39 : Lets you update your FunNotes and more! 40 : 1111111111111111111111111111111111111111 42 : RockYou account is required for Voicemail. 49 : /* {--friendster-layouts.com css code start--} */ awk: cmd. line:1: (FILENAME=rockyou.txt FNR=602044) warning: Invalid multibyte data detected. There may be a mismatch between your data and your locale. 59 : http://www.rockyou.com/fxtext/fxtext-create.php?partner=hi5 77 : vabfdvfdlvhjibfedblsfndilvbgilebvgdlsbgvhbesghklhyubvuwklfbrebgfyurerebgyureb 165 : lllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllll 222 : <table style="border-collapse:collapse;"><tr><td colspan="2"><embed src="http://apps.rockyou.com/photofx.swf" quality="high" scale="noscale" salign="lt" width="325" height="260" wmode="transparent" flashvars="imgpath=http% 255 : <object width="206" height="224"><param name="movie" value="http://www.vivelatino.com.mx/contador.swf"></param><param name="wmode" value="transparent"></param><embed src="http://www.vivelatino.com.mx/contador.swf" type="application/x-shockwave-flash" wmod 257 : <style type=\\'text/css\\'>body{ background: url(http://recursos.fotocajon.com/enchulatupagina/img003/zxddXgCBLcTi.jpg) white center no-repeat fixed; } table, .heading_profile, .heading_profile_left, table td, #p_container, #p_nav_primary, #top_header, #p_n 262 : <style type=\\'text/css\\'>.bg_content{background-image:url(http://img360.imageshack.us/img360/5198/escanear00532wq9.jpg);}.bg_content{background-repeat:repeat;}</STYLE><a href=\\'http://hi5.enchulatupagina.com\\' target=\\'_top\\'><img src=\\'http://hi5.enchula 266 : <div id=\\'24813\\'><a 
href=\\'http://www.revistate.com\\'><img src=\\'http://www.revistate.com/uploads/20080218/rq/rqwpcf28o1pyb10yfzen53kmuipsi0_PAPARAZZI.jpg\\' border=0 alt=\\'Hazte famoso en www.revistate.com\\'></a></div><div id=\\'72891\\'><a href=\\'http://w 285 : <div align=\\\\\\'center\\\\\\' style=\\\\\\'font:bold 11px Verdana; width:310px\\\\\\'><a style=\\\\\\'background-color:#eeeeee;display:block;width:310px;border:solid 2px black; padding:5px\\\\\\' href=\\\\\\'http://www.musik-live.net\\\\\\' target=\\\\\\'_blank\\\\\\'>Playing/Tangga

$ ./wordlistcleanser.sh rockyou.txt wewillrockyou.txt
Cleaning rockyou.txt...
Output file : wewillrockyou.txt
Removing phrases...
grep: rockyou.txt: binary file matches
Extracting then removing websites...
Extracting then removing emails...
Getting stats on wewillrockyou.txt, extracted emails and extracted websites...
Cleansing completed.
$ wc -l rockyou.txt wewillrockyou.txt
14344392 rockyou.txt
14246262 wewillrockyou.txt
28590654 total
$ expr 14344392 - 14246262
98130

#!/bin/bash
#
# wordlistcleanser.sh       gerbil2018 [twitter: @gerbilByte]
#
# This file is used to clean 'rockyou.txt' from all the crap to leave just
# single words.
# It will also cleanse other wordlists too.
#
# Usage:
# wordlistcleanser.sh infile [outfile]
#
# WARNING: If an output file isn't specified, then the input will be
# overwritten (permissions allowing).
#
# Example:
# ./wordlistcleanser.sh /usr/share/wordlists/rockyou.txt ./wewillrockyou.txt
#

# Positional arguments: $1 is the wordlist to clean (required) and $2 is the
# optional destination file.
infile=$1
outfile=$2
version="1.0"
author="gerbil"

# Without at least one argument there is nothing to clean: print the banner
# and the usage text, then stop.
if [[ $# -lt 1 ]]; then
  printf "\nwordlistcleanser v%s - %s 2018 \n\nThis is a simple script that will remove \'phrases\', emails and websites from wordlist files.\nEmails and websites will be stored as files under the current directory.\n\n" "${version}" "${author}"
  # "$0" is quoted so a script path containing whitespace stays one printf
  # argument; unquoted it would split and printf would re-apply the format.
  printf "Usage:\n\t%s infile.txt [outfile.txt]\n\nWARNING: If an output file isn't specified, then the input will be overwritten (permissions allowing).\n\nExample:\n\t./wordlistcleanser.sh ./rockyou.txt ./wewillrockyou.txt\n\nHave fun! :) \n-%s\n" "$0" "${author}"
  exit
fi

# Prefix for the extracted-items files: the input's file name with its
# directory and last extension removed (e.g. /x/rockyou.txt -> rockyou).
# Everything is quoted so paths containing spaces or glob characters survive.
baseinfile=$(basename -- "${infile}")
baseinfile=${baseinfile%.*}
printf "Cleaning %s...\n " "${infile}"

# Check input file exists...
# -e replaces the obsolescent unary -a, which is easily confused with the
# binary "and" operator of the test command.
if [[ ! -e "${infile}" ]]; then
  # Input file doesn't exist: report on stderr and exit non-zero so that
  # callers can tell the run failed.
  printf " %s doesn't exist!\n" "${infile}" >&2
  exit 1
fi

# Decide where the cleaned wordlist goes. When no output file was given the
# input file itself becomes the destination (i.e. it will be overwritten).
if [[ -z "${outfile}" ]]; then
  outfile=${infile}
  printf " No output file specified, therefore output will be stored at %s\n" "${outfile}"
else
  # The file name is passed as an argument, never as the format string, so a
  # '%' or backslash in the name cannot corrupt the message.
  printf "Output file : %s\n" "${outfile}"
fi

#######################################
# Strip phrases, website URLs and e-mail addresses out of the wordlist.
# Globals:
#   infile           (read)    wordlist to clean
#   outfile          (read)    where the cleaned wordlist is written
#   baseinfile       (read)    prefix of the extracted-items files
#   cleanse_workdir  (written) private scratch directory, removed on exit
# Outputs:
#   Progress messages on stdout. Writes ./${baseinfile}_websites.txt,
#   ./${baseinfile}_emails.txt and ${outfile}.
#######################################
cleanse_wordlist() {
  # Quoted so the shell can never glob-expand the pattern.
  local website_regex='http[s]*://'
  # Inside a bracket expression a backslash is a literal character, so the
  # hyphen is listed last to be taken literally and the dot needs no escape.
  local email_regex='[a-zA-Z0-9.-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,5}'
  local no_phrases no_websites

  # Private scratch directory instead of fixed, predictable /tmp file names:
  # concurrent runs cannot clobber each other and the names cannot be
  # pre-created by another user.
  cleanse_workdir=$(mktemp -d) || {
    printf 'Unable to create a temporary directory\n' >&2
    exit 1
  }
  trap 'rm -rf -- "${cleanse_workdir}"' EXIT
  no_phrases="${cleanse_workdir}/no_phrases.txt"
  no_websites="${cleanse_workdir}/no_websites.txt"

  # -a on every grep: wordlists such as rockyou.txt contain bytes that are not
  # valid in the current locale; without -a grep classifies the file as
  # binary ("binary file matches") and drops the affected lines.

  # Removing phrases (any line containing a space)...
  printf "Removing phrases...\n"
  grep -a -v -e ' ' -- "${infile}" > "${no_phrases}"

  # Extracting then removing websites...
  printf "Extracting then removing websites...\n"
  grep -a -e "${website_regex}" -- "${no_phrases}" > "./${baseinfile}_websites.txt"
  grep -a -v -e "${website_regex}" -- "${no_phrases}" > "${no_websites}"
  rm -f -- "${no_phrases}" # just to save space

  # Extracting then removing emails...
  # grep -E replaces the obsolescent egrep. ${outfile} is only written here,
  # after the input has been fully read, so it may be the same file as
  # ${infile}.
  printf "Extracting then removing emails...\n"
  grep -a -E -e "${email_regex}" -- "${no_websites}" > "./${baseinfile}_emails.txt"
  grep -a -E -v -e "${email_regex}" -- "${no_websites}" > "${outfile}"

  rm -rf -- "${cleanse_workdir}"
}

cleanse_wordlist

#######################################
# Write a statistics report for the cleaned wordlist: the number of extracted
# e-mails and websites, then a "word length : count" table.
# Globals:
#   outfile     (read) cleaned wordlist to analyse
#   baseinfile  (read) prefix of the extracted-items files
# Outputs:
#   <outfile without its extension>_stats.txt, next to the output file.
#######################################
write_stats() {
  local stats_file

  # Only strip an extension that belongs to the file name itself; a dot in a
  # directory name (e.g. /tmp/tmp.abc/list) must not truncate the path. No
  # "./" is prepended, so absolute output paths work too.
  case "${outfile##*/}" in
    *.*) stats_file="${outfile%.*}_stats.txt" ;;
    *) stats_file="${outfile}_stats.txt" ;;
  esac

  {
    # wc output goes through %s so a '%' or backslash in a file name is never
    # interpreted as part of the format string.
    printf "Emails extracted: %s\n" "$(wc -l "./${baseinfile}_emails.txt")"
    printf "Websites extracted: %s\n" "$(wc -l "./${baseinfile}_websites.txt")"
    printf "\nStats on %s : \n\n" "${outfile}"
    # Count lines per length and print every length that occurs, in ascending
    # order. The longest line seen bounds the loop, so there is no fixed cap
    # on word length. The file is fed on stdin so awk never mistakes a name
    # containing '=' for a variable assignment.
    awk '
      BEGIN { printf("word length : count\n------------:-----\n") }
      {
        n = length($0)
        counts[n]++
        if (n > longest) longest = n
      }
      END {
        for (i = 0; i <= longest; i++)
          if (i in counts) printf("%11i : %i\n", i, counts[i])
      }
    ' < "${outfile}"
  } > "${stats_file}"
}

printf "Getting stats on %s, extracted emails and extracted websites...\n" "${outfile}"
write_stats

printf "Cleansing completed.\n\n"