import pickle
import pandas as pd
import requests
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
# Load the trained classifier and its matching text vectorizer from disk.
# NOTE(review): pickle.load executes arbitrary code — only load trusted files.
model_pkl_file = "model.pkl"
vectorizer_pkl_file = "vectorizer.pkl"
with open(model_pkl_file, 'rb') as fh:
    model = pickle.load(fh)
with open(vectorizer_pkl_file, 'rb') as fh:
    vectorizer = pickle.load(fh)
# Build a lemmatization lookup from the ODM dictionary file.
# Each line is "base_form, inflected1, inflected2, ..."; every inflected
# form maps back to its base form.
words = {}
# encoding='utf-8' is the fix: the ODM word list is Polish text, and the
# original relied on the locale default (e.g. cp1250 on Windows), which
# mis-decodes or raises on non-ASCII characters.
with open('odm.txt', 'r', encoding='utf-8') as odm:
    for line in odm:
        forms = [word.strip().lower() for word in line.strip().split(',')]
        for inflected in forms[1:]:
            words[inflected] = forms[0]
def lematize(w):
    """Return the base (dictionary) form of *w*, or *w* unchanged if unknown."""
    return words.get(w, w)
# def of data scraper
def get_data(url, element, name, timeout=10):
    """Scrape headline texts from *url* and return them as a DataFrame.

    Parameters:
        url: page to fetch.
        element: selector mode — "tag" matches by tag name, "div" matches
            a div with class *name*, anything else matches a span with
            class *name*.
        name: tag name or CSS class, depending on *element*.
        timeout: seconds to wait for the HTTP response (new parameter,
            default 10, backward compatible).

    Returns a DataFrame with columns 'headline' and 'url'. Strings of 15
    characters or fewer are dropped as non-headlines (menu items, labels).
    """
    # timeout so one unresponsive site cannot hang the whole scraper
    response = requests.get(url, timeout=timeout)
    # Parse the HTML content of the page
    soup = BeautifulSoup(response.content, "html.parser")
    # Find specific elements on the page
    if element == "tag":
        matches = soup.find_all(name)
    elif element == "div":
        matches = soup.find_all("div", {"class": name})
    else:
        matches = soup.find_all("span", {"class": name})
    # Build the frame directly (the original loop also shadowed the
    # `element` parameter with its loop variable).
    new_df = pd.DataFrame({"headline": [node.text for node in matches]})
    # remove non-headlines: very short strings are navigation text
    new_df = new_df[new_df['headline'].str.len() > 15].reset_index(drop=True)
    new_df['url'] = url
    return new_df
#prediction
def find_clickbaits(df):
    """Classify each headline in *df* as clickbait or not.

    Adds columns:
        headline_old -- the original headline text
        prediction   -- the model's output per headline
    Overwrites 'headline' with its normalized form (punctuation stripped,
    lower-cased, lemmatized). Mutates *df* in place and returns it.

    Uses the module-level `vectorizer`, `model` and `lematize`.
    """
    df['headline_old'] = df['headline']
    # Normalize: strip punctuation, lower-case, tokenize.
    tokens = df['headline'].str.replace(r'[^\w\s]+', '', regex=True).str.lower().str.split()
    # Lemmatize each token and rejoin. This replaces the original
    # chained-indexing loop (df['headline'][i][j] = ...), which triggers
    # SettingWithCopy hazards and silently assumed a default RangeIndex.
    df['headline'] = tokens.apply(lambda ws: ' '.join(lematize(w) for w in ws))
    #vectorization
    X = vectorizer.transform(df['headline'])
    #prediction
    df['prediction'] = model.predict(X)
    return df
# Scrape current headlines from five Polish news portals; the scraper
# mode and CSS hook differ per site.
wp_data = get_data("https://www.wp.pl/", "div",
                   "wp-teaser-text-shadow w-full text-white group-hover:underline text-xl/6")
onet_data = get_data("https://www.onet.pl/", "tag", "h3")
tvn24_data = get_data("https://tvn24.pl/", "tag", "h2")
interia_data = get_data("https://www.interia.pl/", "span", "title-text")
rmf_data = get_data("https://www.rmf24.pl/", "span", "indent")
# Stack everything into one table with a clean 0..n-1 index,
# then classify every headline.
data = pd.concat(
    [wp_data, onet_data, tvn24_data, interia_data, rmf_data],
    ignore_index=True,
)
prediction = find_clickbaits(data)
def create_chart(*args):
    """Plot, for each website URL in *args*, the percentage of its
    headlines the model flagged as clickbait.

    Reads the module-level `prediction` DataFrame (columns 'url' and
    'prediction'); shows a matplotlib bar chart, returns nothing.
    """
    chart = {}
    for url in args:
        # Select this site's predictions once (the original built the
        # same filtered Series twice per site).
        site = prediction.loc[prediction['url'] == url, 'prediction']
        # mean() of 0/1 predictions == clickbait fraction; unlike
        # sum()/len(), it yields NaN instead of ZeroDivisionError when a
        # site contributed no headlines.
        chart[url] = site.mean() * 100
    courses = list(chart.keys())
    values = list(chart.values())
    plt.figure(figsize=(10, 5))
    # creating the bar plot
    plt.bar(courses, values, color='maroon', width=0.4)
    plt.xlabel("Website")
    plt.ylabel("% of clickbaits")
    plt.title("Percentage of clickbaits in headlines")
    plt.show()
# Render the clickbait-percentage bar chart for all five scraped portals.
create_chart("https://www.wp.pl/", "https://www.onet.pl/", "https://tvn24.pl/", "https://www.interia.pl/", "https://www.rmf24.pl/")