In [1]:
import pickle
import pandas as pd
import requests
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
In [2]:
# Load the trained clickbait classifier from disk
model_pkl_file = "model.pkl"

with open(model_pkl_file, 'rb') as file:  
    model = pickle.load(file)
In [3]:
# Load the fitted text vectorizer that was used to train the model
vectorizer_pkl_file = "vectorizer.pkl"

with open(vectorizer_pkl_file, 'rb') as file:  
    vectorizer = pickle.load(file)
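How these two pickles were produced is not shown in this notebook; a minimal sketch of a training step that would yield a compatible pair (the TF-IDF vectorizer, logistic regression model, and train_df are illustrative assumptions, not the notebook's actual setup):

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

# train_df is a hypothetical labeled set: 'headline' text, 0/1 'clickbait' labels
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(train_df['headline'])
model = LogisticRegression().fit(X_train, train_df['clickbait'])

with open('vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)
with open('model.pkl', 'wb') as f:
    pickle.dump(model, f)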
In [4]:
# Build a lemmatization dictionary from the ODM word list:
# each line holds a base form followed by its inflected forms, comma-separated
words = {}
with open('odm.txt', 'r', encoding='utf-8') as odm:
    for line in odm:
        forms = [form.strip().lower() for form in line.strip().split(',')]
        for form in forms[1:]:
            words[form] = forms[0]

def lematize(w):
    # Map an inflected form to its base form; unknown words pass through unchanged
    return words.get(w, w)
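For example, assuming odm.txt contains a line such as "dom, domu, domowi, domy, ...", the lookup maps any inflected form to its base form, while unknown words pass through unchanged:

lematize("domy")    # -> "dom"
lematize("python")  # not in the dictionary -> "python"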
In [5]:
# Scrape headline candidates from a page. `element` selects the lookup
# strategy: by tag name, by div class, or (default) by span class.
def get_data(url, element, name):
    response = requests.get(url)
    response.raise_for_status()

    # Parse the HTML content of the page
    soup = BeautifulSoup(response.content, "html.parser")

    # Find the headline elements on the page
    if element == "tag":
        elements = soup.find_all(name)
    elif element == "div":
        elements = soup.find_all("div", {"class": name})
    else:
        elements = soup.find_all("span", {"class": name})

    # Collect the text of each match into a DataFrame
    headlines = [el.text for el in elements]
    new_df = pd.DataFrame({"headline": headlines})

    # Remove non-headlines: very short matches are unlikely to be real headlines
    new_df = new_df[new_df['headline'].map(len) > 15].reset_index(drop=True)
    new_df['url'] = url

    return new_df
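Before wiring a site into the pipeline, it helps to confirm that a selector actually matches something; for example, with the tvn24 selector used further down:

sample = get_data("https://tvn24.pl/", "tag", "h2")
print(len(sample), "headline candidates found")
print(sample.head())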
In [6]:
# Prediction: lemmatize every headline, vectorize, and classify
def find_clickbaits(df):
    # Keep the original headline for reporting
    df['headline_old'] = df['headline']

    # Lemmatization: strip punctuation, lowercase, split into tokens,
    # map each token to its base form, and rejoin into one string
    df['headline'] = (
        df['headline']
        .str.replace(r'[^\w\s]+', '', regex=True)
        .str.lower()
        .str.split()
        .apply(lambda tokens: ' '.join(lematize(w) for w in tokens))
    )

    # Vectorization and prediction
    X = vectorizer.transform(df['headline'])
    df['prediction'] = model.predict(X)
    return df
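A quick smoke test on a toy frame (the Polish headlines below are invented for illustration; whether either is flagged depends entirely on the pickled model):

demo = pd.DataFrame({"headline": ["Nie uwierzysz, co się stało potem w studiu",
                                  "Sejm uchwalił ustawę budżetową"],
                     "url": "demo"})
find_clickbaits(demo)[["headline_old", "prediction"]]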
In [7]:
# Scrape current headlines from five Polish news portals
wp_data = get_data("https://www.wp.pl/", "div", "wp-teaser-text-shadow w-full text-white group-hover:underline text-xl/6")
onet_data = get_data("https://www.onet.pl/", "tag", "h3")
tvn24_data = get_data("https://tvn24.pl/", "tag", "h2")
interia_data = get_data("https://www.interia.pl/", "span", "title-text")
rmf_data = get_data("https://www.rmf24.pl/", "span", "indent")
data = pd.concat([wp_data, onet_data, tvn24_data, interia_data, rmf_data]).reset_index(drop=True)
In [8]:
prediction = find_clickbaits(data)
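To eyeball the individual headlines the model flagged (assuming a prediction of 1 means clickbait, as the percentage calculation below implies):

prediction.loc[prediction['prediction'] == 1, ['headline_old', 'url']].head(10)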
In [9]:
def create_chart(df, *urls):
    # Percentage of headlines classified as clickbait, per site
    chart = {url: df.loc[df['url'] == url, 'prediction'].mean() * 100 for url in urls}

    sites = list(chart.keys())
    values = list(chart.values())

    plt.figure(figsize=(10, 5))

    # creating the bar plot
    plt.bar(sites, values, color='maroon', width=0.4)

    plt.xlabel("Website")
    plt.ylabel("% of clickbait headlines")
    plt.title("Percentage of clickbait headlines per site")
    plt.xticks(rotation=15)
    plt.show()
In [10]:
create_chart(prediction, "https://www.wp.pl/", "https://www.onet.pl/", "https://tvn24.pl/", "https://www.interia.pl/", "https://www.rmf24.pl/")