#!/bin/bash
#
# wordlistcleanser.sh       gerbil2018 [twitter: @gerbilByte]
#
# This file is used to clean 'rockyou.txt' from all the crap to leave just
# single words.
# It will also cleanse other wordlists too.
#
# Usage:
# wordlistcleanser.sh infile [outfile]
#
# WARNING: If an output file isn't specified, then the input will be
# overwritten (permissions allowing).
#
# Example:
# ./wordlistcleanser.sh /usr/share/wordlists/rockyou.txt ./wewillrockyou.txt
#

# Positional arguments: the wordlist to clean and, optionally, where to write
# the cleaned result. outfile stays empty when only one argument is given.
infile=$1
outfile=$2
readonly version="1.0"
readonly author="gerbil"

# Print the banner, usage, warning and example to stdout.
# Every expansion handed to printf is quoted so that a script path containing
# whitespace is passed as a single argument instead of re-triggering the
# format string once per word.
usage() {
  printf "\nwordlistcleanser v%s - %s 2018 \n\n" "$version" "$author"
  printf "This is a simple script that will remove 'phrases', emails and websites from wordlist files.\n"
  printf "Emails and websites will be stored as files under the current directory.\n\n"
  printf "Usage:\n\t%s infile.txt [outfile.txt]\n\n" "$0"
  printf "WARNING: If an output file isn't specified, then the input will be overwritten (permissions allowing).\n\n"
  printf "Example:\n\t./wordlistcleanser.sh ./rockyou.txt ./wewillrockyou.txt\n\n"
  printf "Have fun! :) \n-%s\n" "$author"
}

# Without at least an input file there is nothing to do: show the help text
# and stop. The exit status stays 0, as before.
if (( $# < 1 )); then
  usage
  exit 0
fi

# Stem of the input file name (directory and last extension removed). It is
# used to name the side files ./<stem>_websites.txt and ./<stem>_emails.txt.
baseinfile=$(basename -- "$infile")
baseinfile=${baseinfile%.*}
printf "Cleaning %s...\n " "$infile"

#Check input file exists...
# Quoting keeps paths with spaces intact, and an empty path is now rejected
# instead of slipping through the test. The failure is reported on stderr and
# with a non-zero status so callers can detect it.
if [[ ! -e "$infile" ]]; then
  printf " %s doesn't exist!\n" "$infile" >&2
  exit 1
fi

#Check if inputfile is to be overwritten or not...
# With no output file given, the cleaned list replaces the input file. The
# input is only overwritten by the final filtering step, after its contents
# have already been copied through intermediate files.
# File names are passed to printf as %s arguments, never as part of the
# format, so '%' or '\' in a path is printed literally.
if [[ -z "$outfile" ]]; then
  outfile=$infile
  printf " No output file specified, therefore output will be stored at %s\n" "$outfile"
else
  printf "Output file : %s\n" "$outfile"
fi

# Filtering pipeline: infile -> (drop phrases) -> (split off websites)
# -> (split off emails) -> outfile.
#
# Patterns are kept in quoted variables so the shell never glob-expands them.
# website_regex is a basic regex; email_regex is an extended regex (grep -E).
# Inside a bracket expression a backslash is a literal character, so the
# former '[a-zA-Z0-9\-\.]' did not contain '-' at all and missed addresses
# such as foo@my-site.com; '-' is now listed last in the class, where it is
# taken literally.
website_regex='http[s]*://'
email_regex='[a-zA-Z0-9.-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,5}'

# Intermediate results live in a private, unpredictable directory rather than
# fixed names under /tmp, so concurrent runs cannot clobber each other. The
# directory is removed on every exit path.
tmpdir=$(mktemp -d) || {
  printf 'Unable to create a temporary directory\n' >&2
  exit 1
}
trap 'rm -rf -- "$tmpdir"' EXIT
stage1="$tmpdir/ry1.txt"
stage2="$tmpdir/ry2.txt"

#Removing phrases...
# A "phrase" is any line containing a space.
printf "Removing phrases...\n"
grep -v -- ' ' "$infile" > "$stage1"

#Extracting then removing websites...
printf "Extracting then removing websites...\n"
grep -- "$website_regex" "$stage1" > "./${baseinfile}_websites.txt"
grep -v -- "$website_regex" "$stage1" > "$stage2"
rm -f -- "$stage1" # just to save space

#Extracting then removing emails...
# grep -E replaces egrep, which recent GNU grep reports as obsolescent.
printf "Extracting then removing emails...\n"
grep -E -- "$email_regex" "$stage2" > "./${baseinfile}_emails.txt"
grep -E -v -- "$email_regex" "$stage2" > "$outfile"
rm -f -- "$stage2" # just to save space

#Get stats on leftover file (length of each word and count of each)...
#
# write_stats OUTFILE BASE
#   OUTFILE  path of the cleaned wordlist to analyse
#   BASE     stem of the side files ./BASE_emails.txt and ./BASE_websites.txt
#
# Writes <OUTFILE without its last extension>_stats.txt next to OUTFILE. The
# report holds the `wc -l` line counts of both side files followed by a
# "word length : count" table for OUTFILE.
#
# The extension is stripped from the file name only, and no "./" is prepended,
# so absolute paths and directories containing dots produce a valid stats path.
# The table covers every line length present in the file; there is no fixed
# upper bound on word length.
write_stats() {
  local out=$1 base=$2
  local name dir stem stats_file

  name=${out##*/}        # file name without directories
  dir=${out%"$name"}     # directory prefix, including the trailing slash
  stem=${name%.*}        # file name without its last extension
  [[ -n "$stem" ]] || stem=$name   # dotfiles: keep the whole name
  stats_file="${dir}${stem}_stats.txt"

  {
    # wc output is passed as a %s argument so '%' in a file name is harmless.
    printf 'Emails extracted: %s\n' "$(wc -l "./${base}_emails.txt")"
    printf 'Websites extracted: %s\n' "$(wc -l "./${base}_websites.txt")"
    printf '\nStats on %s : \n\n' "$out"
    awk '
      BEGIN { printf("word length : count\n------------:-----\n"); maxlen = 0 }
      { n = length($0); counts[n]++; if (n > maxlen) maxlen = n }
      END {
        for (i = 0; i <= maxlen; i++)
          if (i in counts) printf("%11d : %d\n", i, counts[i])
      }
    ' < "$out"
  } > "$stats_file"
}

printf "Getting stats on %s, extracted emails and extracted websites...\n" "$outfile"
write_stats "$outfile" "$baseinfile"

printf "Cleansing completed.\n\n"