# python version 3.6
# DGA feature building

import math
import re

# NOTE(review): `df` is assumed to be a pandas DataFrame with "domain", "tld"
# and "benign_dga" columns, created in the upstream data-collection step.
# `pd`, `train_test_split` and `accuracy_score` are likewise assumed to be
# imported elsewhere in the file (pandas / sklearn) -- confirm against the
# full source.


def entropy(string):
    """Return the Shannon entropy (in bits) of the characters in *string*.

    An empty string has zero entropy (guard added: the original raised
    ZeroDivisionError on "").
    """
    if not string:
        return 0.0
    # probability of each distinct character; dict.fromkeys keeps one entry
    # per unique char
    prob = [float(string.count(c)) / len(string)
            for c in dict.fromkeys(list(string))]
    # H = -sum(p * log2(p)); log(p)/log(2) converts natural log to base 2
    return -sum(p * math.log(p) / math.log(2.0) for p in prob)


# apply entropy to the domain
df["entropy"] = df["domain"].apply(entropy)

# Additional features
# hyphen count
df["hyphen_count"] = df.domain.str.count("-")
# dot count
df["dot_count"] = df.domain.str.count(r"\.")
# string length of the full domain
df["string_len_domain"] = df.domain.str.len()
# tld length
df["tld_len"] = df.tld.str.len()

# counts of vowels and consonants per domain
vowels = set("aeiou")
cons = set("bcdfghjklmnpqrstvwxyz")
df["Vowels"] = [sum(1 for c in x if c in vowels) for x in df["domain"]]
df["Consonents"] = [sum(1 for c in x if c in cons) for x in df["domain"]]

# vowel-to-consonant ratio (the original comment claimed consonants-to-vowels,
# but the code computes Vowels / Consonents -- comment fixed to match code;
# domains with zero consonants produce inf, which pandas tolerates)
df["consec_vowel_ratio"] = (df["Vowels"] / df["Consonents"]).round(5)


def syllables(word):
    """Approximate the syllable count of *word*.

    Lowercases the word, drops one trailing (assumed silent) 'e', then
    counts each maximal run of vowels as one syllable.
    """
    word = word.lower()
    if word.endswith("e"):
        word = word[:-1]
    return len(re.findall(r"[aeiou]+", word))


df["syllables"] = df["domain"].apply(syllables)

# prediction code
from xgboost import XGBClassifier

# BUG FIX: the original built `pred` from the nonexistent attribute `df.data`
# and an undefined `columns`, then passed the undefined name `dy` to
# train_test_split. Build the model input from the engineered feature
# columns instead.
feature_cols = [
    "entropy", "hyphen_count", "dot_count", "string_len_domain",
    "tld_len", "Vowels", "Consonents", "consec_vowel_ratio", "syllables",
]
X = df[feature_cols]
# binary target: 1 for DGA, 0 for benign (assigned during data collection)
y = df.benign_dga

# create training and testing sets (70/30 split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# fit model
model = XGBClassifier(objective="binary:logistic")
model.fit(X_train, y_train)

# make predictions for test data
y_pred = model.predict(X_test)
predictions = [round(value) for value in y_pred]

# evaluate predictions
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))