Author(s): Greg Postalian-Yrausquin

Originally published on Towards AI.

This is a good example of how more than one ML step can be combined to achieve a goal. In this exercise, I combine NLP (Doc2Vec) with binary classification to extract offensive and hate language from a set of tweets.

Doc2Vec is chosen in this case because it is not pretrained, so it does not rely on a previously provided vocabulary (who knows what we might find… and the tweets are filled with typos, etc.). Doc2Vec is a good tool because: 1) as mentioned, it does not rely on a pre-defined vocabulary, and 2) it is a "complete" model: it considers each word in the context of its sentence, which gives more accurate results than simpler vectorization tools like TF-IDF.

First, let's import the libraries:

import numpy as np
import pandas as pd
import json
pd.options.mode.chained_assignment = None
from io import StringIO
from html.parser import HTMLParser
import re
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
nltkstop = stopwords.words('english')
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer
nltk.download('punkt')
snow = SnowballStemmer(language='english')
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import seaborn as sns
import warnings
import tensorflow as tf
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report
from sklearn.utils import resample

I have uploaded two datasets: one with a list of possibly offensive tweets and another with generic tweets; together they form the dataset to study. I am also uploading several reference files that I use to clean the data of words that carry no meaning or only generic meaning, such as place names, personal names, etc. Many versions of these are available on the internet and can be found with a simple search. Before uploading them, I made sure they made sense and cleaned them.
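To make the replacement idea concrete before the real code, here is a toy version (the words and placeholders below are made up for illustration; the actual lookup tables are built next from the reference files):

# Hypothetical mini-lookup: known words map to generic placeholder tokens
toy_lookup = {"maria": "xfirstnamex", "germany": "xcountryx"}
sentence = "maria flew to germany yesterday"
print(" ".join(toy_lookup.get(w, w) for w in sentence.split()))
# -> xfirstnamex flew to xcountryx yesterday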
maindataset = pd.read_csv("labeled_data.csv")
maindataset2 = pd.read_csv("twitter_dataset.csv", encoding="ISO-8859-1")

# Helper: lowercase a column, split it on punctuation, and explode it into a
# word -> replacement lookup table
def make_lookup(series, replacement):
    words = series.str.lower().apply(lambda x: re.sub(r"[-.,:_]", " ", str(x)).split(" "))
    out = pd.DataFrame(words.explode())
    out.columns = ["word"]
    out["replacement"] = replacement
    return out

countries = pd.read_json("countries.json")
countries = make_lookup(countries["country"], "xcountryx")

provincies = pd.read_csv("countriesprovincies.csv")
provincies1 = make_lookup(provincies["name"], "xprovincex")
provincies2 = make_lookup(provincies["name_alt"], "xprovincex")
provincies3 = make_lookup(provincies["type_en"], "xsubdivisionx")
provincies4 = make_lookup(provincies["admin"], "xcountryx")
provincies5 = make_lookup(provincies["geonunit"], "xcountryx")
provincies6 = make_lookup(provincies["gn_name"], "xcountryx")
provincies = pd.concat([provincies1, provincies2, provincies3, provincies4, provincies5, provincies6], axis=0, ignore_index=True)

currencies = pd.read_json("country-by-currency-name.json")
currencies1 = make_lookup(currencies["country"], "xcountryx")
currencies2 = make_lookup(currencies["currency_name"], "xcurrencyx")
currencies = pd.concat([currencies1, currencies2], axis=0, ignore_index=True)
pd.readcsv("interall.csv", header=None)firstnames = firstnames[firstnames[1]>=10000]firstnames = firstnames[[0]]firstnames[0] = firstnames[0].str.lower()firstnames = pd.DataFrame(firstnames[0].apply(lambda x: str(x).replace('-',' ').replace('.',' ').replace('',' ').replace(',',' ').replace(':',' ').split(" ")).explode())firstnames.columns = ['word']firstnames["replacement"] = "xfirstnamex"lastnames = pd.readcsv("intersurnames.csv", header=None)lastnames = lastnames[lastnames[1]>=10000]lastnames = lastnames[[0]]lastnames[0] = lastnames[0].str.lower()lastnames = pd.DataFrame(lastnames[0].apply(lambda x: str(x).replace('-',' ').replace('.',' ').replace('',' ').replace(',',' ').replace(':',' ').split(" ")).explode())lastnames.columns = ['word']lastnames["replacement"] = "xlastnamex"temporaldata = pd.read_csv("temporal.csv")dictionary = pd.concat([lastnames,temporaldata,firstnames,currencies,provincies,countries], axis=0, ignore_index=True)dictionary = dictionary.groupby(["word"]).first().reset_index(drop=False)dictionary = dictionary.dropna()maindataset It might be necessary to understand the data a little. From Kaggle: “count number of CrowdFlower users who coded each tweet (min is 3, sometimes more users coded a tweet when judgments were determined to be unreliable by CF) hate_speech number of CF users who judged the tweet to be hate speech offensive_language number of CF users who judged the tweet to be offensive neither number of CF users who judged the tweet to be neither offensive nor non-offensive class class label for majority of CF users. 0 — hate speech 1 — offensive language 2 — neither” With that, I will filter out the column for class and keep only two, if at least one user flag the tweet as offensive or hate speech then it is. maindataset['hate_speech'] = np.where(maindataset['hate_speech']>0,1,0)maindataset['offensive_language'] = np.where(maindataset['offensive_language']>0,1,0)maindataset = maindataset[['hate_speech', 'offensive_language', 'tweet']]maindataset Now, I’ll prepare the other dataset (with the clean tweets), and join it to the original one maindataset2 = maindataset2[['text']]maindataset2.columns = ['tweet']maindataset2['hate_speech'] = 0maindataset2['offensive_language'] = 0maindataset2 = maindataset2[['hate_speech','offensive_language','tweet']]maindataset = pd.concat([maindataset,maindataset2], ignore_index=True) Here I use several functions to clean the text that I like to keep in my belt: Strip HTML tags Replace words using the dictionary crafted above Remove punctuation, double spaces, etc. class MLStripper(HTMLParser): def init(self): super().init() self.reset() self.strict = False self.convert_charrefs= True self.text = StringIO() def handle_data(self, d): self.text.write(d) def get_data(self): return self.text.getvalue()def strip_tags(html): s = MLStripper() s.feed(html) return s.get_data()def replace_words(tt, lookp_dict): temp = tt.split() res = [] for wrd in temp: res.append(lookp_dict.get(wrd, wrd)) res = ' '.join(res) return resdef preprepare(eingang): ausgang = striptags(eingang) ausgang = eingang.lower() ausgang = ausgang.replace(u'\xa0', u' ') ausgang = re.sub(r'^\s$',' ',str(ausgang)) ausgang = ausgang.replace('|', ' ') ausgang = ausgang.replace('ï', ' ') ausgang = ausgang.replace('»', ' ') ausgang = ausgang.replace('¿', '. 
The main cleanup routine ties these together:

def preprepare(eingang):
    ausgang = strip_tags(eingang)
    ausgang = ausgang.lower()
    ausgang = ausgang.replace(u'\xa0', u' ')
    ausgang = re.sub(r'^\s*$', ' ', str(ausgang))
    # drop stray byte-order-mark characters
    ausgang = ausgang.replace('ï', ' ')
    ausgang = ausgang.replace('»', ' ')
    ausgang = ausgang.replace('¿', '. ')
    # strip punctuation and special characters
    for ch in ['|', '"', "'", '?', '!', ',', ';', '.', '(', ')', '{', '}',
               '[', ']', '~', '@', '#', '$', '%', '^', '&', '*', '<', '>',
               '/', '\\', '`', '+', '=', '_', '-', ':']:
        ausgang = ausgang.replace(ch, ' ')
    ausgang = ausgang.replace('\n', ' ').replace('\r', ' ')
    # collapse repeated whitespace
    ausgang = re.sub(' +', ' ', ausgang)
    ausgang = […]
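The function is truncated here, but as laid out in the intro, the cleaned text is destined for Doc2Vec. As a preview, here is a minimal sketch of that step using the imports above, assuming preprepare() is completed and returns the cleaned string (the tokenization, stemming, and hyperparameters are illustrative assumptions, not necessarily the exact settings used later):

# Illustrative sketch only: tag each cleaned tweet and train a Doc2Vec model
tagged = [
    TaggedDocument(words=[snow.stem(w) for w in word_tokenize(preprepare(t))], tags=[str(i)])
    for i, t in enumerate(maindataset['tweet'])
]
d2v = Doc2Vec(tagged, vector_size=100, window=5, min_count=2, epochs=20)
# one vector per tweet, ready to feed a binary classifier
vectors = np.array([d2v.dv[str(i)] for i in range(len(tagged))])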