import community
import os
import copy
import random
import pandas as pd
import polars as pl
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
from src import process_data, utils, stats
from tqdm.notebook import trange
from scipy.stats import norm
from pathlib import Path
from typing import Union
from netwulf import visualize, draw_netwulf
import nltk
import re
import wordcloud
from wordcloud import WordCloud
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import pipeline
import transformers
import pickle
import tqdm
from collections import defaultdict
plt.style.use("ggplot")
def display_louvain(graph: nx.Graph, config: dict):
graph = nx.Graph(graph)
partition = community.best_partition(graph, random_state=42)
print(f"The {graph.name} was partitioned into {len(list(set(partition.values())))} communities")
nx.set_node_attributes(graph, partition, "group")
network, _ = visualize(graph, config=config)
return network
def scale_nodes(graph: nx.Graph):
output = nx.get_node_attributes(graph, "output")
for n, data in graph.nodes(data=True):
data["size"] = output[n]
The data used for this project is a subset of All the News 2.0, which contains 2,677,878 news articles from American publications spanning January 1, 2016 to April 2, 2020.
We chose to work with articles from the two publications with the most articles in the dataset: Reuters with 840,094 articles and The New York Times (NYT) with 252,259 articles.
The dataset contains 10 features: authors, title, article text, url, section, publication, and four features describing the time of publication. The features used in this project are authors, article text, and section.
We chose this dataset because we wanted to work with news articles, which have an apparent connection between a collaboration network and text and therefore fit well with the tools we have used throughout this course. This particular dataset was chosen because of its size and because it covers American publications, which also makes the text analysis more feasible, as tools like stop word lists and language models are readily available for English.
Our goal was to show how authors collaborate within the two newspapers and how the newspapers portray different subjects. We condensed the most essential points onto the front page to make for an easy read, refer to the data, network, and text pages for more details, and refer to the explainer notebook for all the details.
The dataset has the following 10 features:
We chose to keep author, title, article, url, section, and publication.
The most important features are author, section, and article, as we cannot do the analysis without them.
Our preprocessing works in the following steps:
Step 4 is performed because we are mainly interested in the largest sections: the analysis will be more accurate, and a section in Reuters with many articles is likely to also be a section in The New York Times. This matters since we want to look at how the publications portray the same subjects. An added benefit of step 4 is that the data contained quite a few sections with only a single article; judging by their names, these are not real sections but must have been assigned by mistake during data gathering.
Step 5 also has a dual purpose: we want to look at authors that are representative of their publication, and if they have written very few articles, they are probably freelancers. Secondly, some of the articles in The New York Times mistakenly list celebrities, such as John Cena, as authors. By removing authors with fewer than 5 articles, we get rid of most of the celebrities that are clearly not authors.
Step 6 is taken because these articles do not contribute to the collaboration graph.
def remove_erroneous_samples(df: pl.LazyFrame, publication: str) -> pl.DataFrame:
# only select articles from the specific publication
articles = df.filter(pl.col("publication") == publication)
# remove articles that either don't have any authors, don't have any text, or don't have a section
articles = articles.drop_nulls(["author", "section", "article"])
# remove articles that have fewer than two authors
articles = articles.filter(pl.col("author").str.contains(","))
articles = articles.collect()
# remove articles whose section has fewer than 100 articles
counts_df = articles.groupby("section").agg(pl.count())
counts_df = counts_df.filter(pl.col("count") > 100)
articles = articles.filter(pl.col("section").is_in(counts_df["section"]))
return articles
# the authors are represented as a comma-separated string.
# we split them into a list
def split_authors(df: pl.DataFrame) -> pl.DataFrame:
df = df.with_columns(
(
pl.col("author").apply(
lambda x: list(set(list(map(str.strip, x.title().split(", ")))))
)
).alias("authors")
)
df = df.drop("author")
return df
def remove_low_output_authors(df: pd.DataFrame, min_articles: int) -> pd.DataFrame:
author_output = {}
for i, row in df.iterrows():
for author in row["authors"]:
author_output[author] = author_output.get(author, 0) + 1
rows_to_remove = []
for i, row in df.iterrows():
authors = row["authors"]
authors = [a for a in authors if author_output[a] >= min_articles]
if len(authors) < 2:
rows_to_remove.append(i)
else:
df["authors"][i] = authors
df = df.drop(rows_to_remove)
return df
# load the data
load_path = Path("data/raw", "all-the-news-2-1.csv")
df = pl.scan_csv(load_path).select(["date", "author", "article", "title", "url", "section", "publication"])
min_articles = 5
# process
reuters_df = remove_erroneous_samples(df, "Reuters")
reuters_df = split_authors(reuters_df)
reuters_df = reuters_df.to_pandas()
reuters_df = remove_low_output_authors(reuters_df, min_articles)
nyt_df = remove_erroneous_samples(df, "The New York Times")
nyt_df = split_authors(nyt_df)
nyt_df = nyt_df.to_pandas()
nyt_df = remove_low_output_authors(nyt_df, min_articles)
# since we have a list of authors, we can no longer use a csv
# and instead use the json format
reuters_df.to_json("data/processed/reuters.json")
nyt_df.to_json("data/processed/nyt.json")
print(f"Reuters articles after preprocessing: {len(reuters_df)}")
print(f"NYT articles after preprocessing: {len(nyt_df)}")
Reuters articles after preprocessing: 57129
NYT articles after preprocessing: 23796
After processing, our data now has the following form:
reuters_df.head(2)
date | article | title | url | section | publication | authors | |
---|---|---|---|---|---|---|---|
0 | 2019-06-23 00:00:00 | (Reuters) - The success of Hudson’s Bay Co Exe... | Hudson's Bay's chairman's buyout bid pits reta... | https://www.reuters.com/article/hudsons-bay-ma... | Business News | Reuters | [Harry Brumpton, Jessica Dinapoli] |
1 | 2019-02-05 00:00:00 | LONDON (Reuters) - Britain’s financial service... | Exclusive: Britain's financial heartland unbow... | https://www.reuters.com/article/us-britain-eu-... | Business News | Reuters | [Andrew Macaskill, Simon Jessop] |
nyt_df.head(2)
date | article | title | url | section | publication | authors | |
---|---|---|---|---|---|---|---|
0 | 2016-01-13 22:02:33 | Last Friday a group of 15 cancer researchers c... | ‘Moonshot’ to Cure Cancer, to Be Led by Biden,... | http://www.nytimes.com/2016/01/14/health/moons... | health | The New York Times | [Gardiner Harris, Gina Kolata] |
1 | 2016-01-17 21:20:09 | WASHINGTON — For a year, Obama administration ... | 14 Testy Months Behind U.S. Prisoner Swap With... | http://www.nytimes.com/2016/01/18/us/politics/... | us | The New York Times | [David E. Sanger, Peter Baker] |
The All the News 2.0 dataset contains 2,677,878 articles, totaling 8.8 GB of data.
Next we'll calculate some more interesting statistics.
# load dataframe
reuters_df = pd.read_json("data/processed/reuters.json")
nyt_df = pd.read_json("data/processed/nyt.json")
reuters_df = pl.from_pandas(reuters_df)
nyt_df = pl.from_pandas(nyt_df)
print(f"The processed Reuters dataset has {len(reuters_df['section'].unique())} sections")
print(f"The processed New York Times dataset has {len(nyt_df['section'].unique())} sections")
The processed Reuters dataset has 39 sections
The processed New York Times dataset has 21 sections
# find top 10 sections from each publication
def get_top_k_sections(df: pl.DataFrame, k=10) -> pl.DataFrame:
return df.groupby("section").agg(pl.count()).top_k(k, by="count")
def get_all_sections(df: pl.DataFrame):
return df.groupby("section").agg(pl.count()).sort("count")
def plot_top_sections(df: pl.DataFrame, title: str, figsize=(7, 3)):
n_sections = len(df)
fig, ax = plt.subplots(figsize=figsize)
plt.bar(range(n_sections), df["count"])
plt.xticks(range(n_sections), df["section"])
plt.title(title)
plt.show()
# store all the sections in a file for further inspection
reuters_sections = get_all_sections(reuters_df)
reuters_sections.write_csv("data/processed/reuters_sections.csv")
reuters_top_sections = get_top_k_sections(reuters_df, 5)
plot_top_sections(reuters_top_sections, "Reuters top sections")
nyt_sections = get_all_sections(nyt_df)
nyt_sections.write_csv("data/processed/nyt_sections.csv")
nyt_top_sections = get_top_k_sections(nyt_df, 5)
plot_top_sections(nyt_top_sections, "NYT top sections")
def get_stats(df: pd.DataFrame):
stats = {}
stats["#articles"] = len(df)
stats["#sections"] = len(df.groupby("section").count())
authors = set()
article_lenghts = []
for i, row in df.iterrows():
authors = authors.union(set(row["authors"]))
article_lenghts.append(len(row["article"].split()))
stats["#authors"] = len(authors)
stats["avg_words"] = np.mean(article_lenghts)
stats["avg_articles_per_author"] = len(df) / len(authors)
return stats
def print_stats(stats: dict):
for k, v in stats.items():
print(f"{k}: {v}")
print_stats(get_stats(reuters_df.to_pandas()))
#articles: 57129
#sections: 39
#authors: 1951
avg_words: 621.9692100334331
avg_articles_per_author: 29.281906714505382
print_stats(get_stats(nyt_df.to_pandas()))
#articles: 23796
#sections: 21
#authors: 1318
avg_words: 1203.7454193982182
avg_articles_per_author: 18.0546282245827
article_lenghts = []
for i, row in nyt_df.to_pandas().iterrows():
article_lenghts.append(len(row["article"].split()))
fig, ax = plt.subplots()
ax.hist(article_lenghts, 25, rwidth=0.9)
ax.set_yscale("log")
plt.title("NYT words per article")
plt.show()
article_lenghts = []
for i, row in reuters_df.to_pandas().iterrows():
article_lenghts.append(len(row["article"].split()))
fig, ax = plt.subplots()
ax.hist(article_lenghts, 25, rwidth=0.9)
ax.set_yscale("log")
plt.title("Reuters words per article")
plt.show()
For both The New York Times and Reuters we construct networks based on article collaborations. A node represents an author, and a connection between two nodes represents a collaboration. The size of a node is scaled by the number of articles the author has published. The networks are undirected and unweighted, as the collaborative connection is mutual. With the two networks we can discern differences in how authors collaborate at the two news agencies.
def graph_stats(graph: nx.Graph):
print(f"{graph.name} has {graph.number_of_nodes()} nodes and {graph.number_of_edges()} edges")
degrees = [degree for _, degree in graph.degree]
print(f"Average degree: {np.mean(degrees)}")
print(f"Median degree: {np.median(degrees)}")
print(f"Average clustering coefficient: {nx.average_clustering(graph)}")
print(f"Section assortativity coefficient: {nx.attribute_assortativity_coefficient(graph, 'section')}")
def modularity(graph: nx.Graph, partitioning: dict):
communities = set(partitioning.values())
partitions = {c: [] for c in communities}
for node, community in partitioning.items():
partitions[community].append(node)
edges = graph.number_of_edges()
mod = 0
for partition in partitions.values():
community = graph.subgraph(partition)
lc = community.number_of_edges()
kc = sum([node[1] for node in graph.degree(partition)])
mod += ((lc / edges) - (kc / (2 * edges)) ** 2)
return mod
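For reference, the helper above computes the standard modularity of a partition,
$$M = \sum_{c} \left[ \frac{L_c}{L} - \left( \frac{k_c}{2L} \right)^2 \right],$$
where $L$ is the total number of edges in the graph, $L_c$ is the number of edges inside community $c$, and $k_c$ is the sum of the degrees of the nodes in $c$.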
def plot_degree_distribution(graph: nx.Graph):
degrees = {}
for _, degree in graph.degree:
degrees[degree] = degrees.get(degree, 0) + 1
x = sorted(degrees.keys())
y = [degrees[degree] for degree in x]
plt.plot(x, y, color=(0.2, 0.4, 0.6, 0.6), marker="o", markersize=4)
plt.title(f"Degree distribution of {graph.name}")
plt.xlabel("Degree")
plt.ylabel("Count")
plt.show()
def largest_component(graph: nx.Graph):
return graph.subgraph(max(nx.connected_components(graph), key=len))
def build_graph(data: pl.DataFrame) -> nx.Graph:
graph = nx.Graph(name=f"{data['publication'][0]} graph")
graph_authors = set()
for authors in data["authors"]:
for author in authors:
graph_authors.add(author)
author_sections = {author: {} for author in graph_authors}
author_output = {}
for row in data.iter_rows(named=True):
for author in row["authors"]:
author_sections[author][row["section"]] = author_sections[author].get(row["section"], 0) + 1
author_output[author] = author_output.get(author, 0) + 1
for author in author_sections:
author_sections[author] = max(author_sections[author], key=author_sections[author].get)
graph.add_nodes_from(graph_authors)
# Automatic node coloring in netwulf based on the group attribute
nx.set_node_attributes(graph, author_sections, "section")
nx.set_node_attributes(graph, author_sections, "group")
nx.set_node_attributes(graph, author_output, "output")
for authors in data["authors"]:
for author_a in authors:
for author_b in authors:
if author_a != author_b:
graph.add_edge(author_a, author_b)
return graph
reuters_graph = build_graph(reuters_df)
reuters_gcc = largest_component(reuters_graph)
graph_stats(reuters_gcc)
plot_degree_distribution(reuters_gcc)
Reuters graph has 1947 nodes and 17131 edges
Average degree: 17.59732922444787
Median degree: 12.0
Average clustering coefficient: 0.28962767619131086
Section assortativity coefficient: 0.33304534289980026
The graph stats are computed for the largest connected component of the filtered graph. We see that the mode of the degree distribution is 3. The median and average degree are quite high, indicating that we may have a well-connected network. The assortativity coefficient based on section is not particularly large, meaning that there is collaboration within sections, but it is not limited to them.
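As a quick sanity check of the degree-distribution mode mentioned above, the degrees can be counted directly (a minimal sketch, assuming reuters_gcc from the cells above is still in memory):
from collections import Counter
# count how often each degree occurs in the largest connected component
degree_counts = Counter(degree for _, degree in reuters_gcc.degree)
# the first entry is the mode of the degree distribution
print(degree_counts.most_common(3))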
Below is the Reuters graph prior to removing authors who have written fewer than 5 articles.
In the 3 figures below the Reuters graph is visualized: the first figure shows the entire graph colored by author section, the second shows the largest connected component colored by section, and the last is colored by the Louvain communities.
random.seed(42)
scale_nodes(reuters_graph)
reuters_config = {"zoom": 0.8,
"node_gravity": 0.2,
"node_size": 25,
"link_width": 0.7,
"node_fill_color": "#123123",
"node_stroke_color": "#000000"}
_, _ = visualize(reuters_graph, config=reuters_config)
_, _ = visualize(reuters_gcc, config=reuters_config)
_ = display_louvain(reuters_gcc, reuters_config)
The Reuters graph was partitioned into 12 communities
reuters_sections = nx.get_node_attributes(reuters_gcc, "section")
print(reuters_sections["David Shepardson"])
print(reuters_sections["Steve Holland"])
print(reuters_sections["William James"])
print(reuters_sections["Arno Schuetze"])
Business News
Politics
World News
Deals
Comparing this graph to the unfiltered graph, there is a lot less noise, with far fewer authors branching off the main component. There does not seem to be a lot of structure in the largest component when colored by author section, whereas the Louvain partition creates a more structured partitioning.
nyt_graph = build_graph(nyt_df)
nyt_gcc = largest_component(nyt_graph)
graph_stats(nyt_gcc)
plot_degree_distribution(nyt_gcc)
The New York Times graph has 1285 nodes and 12731 edges
Average degree: 19.8147859922179
Median degree: 12.0
Average clustering coefficient: 0.3260379569410747
Section assortativity coefficient: 0.6091857885908596
As with the Reuters graph, the stats are computed for the largest connected component of the filtered graph. The average and median degree are again quite large, indicating that the network is well connected. The clustering coefficient is very similar to that of Reuters, but the assortativity with respect to the section attribute is much larger than for the Reuters graph, which could indicate that collaboration is more contained within sections at The New York Times.
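For reference, nx.attribute_assortativity_coefficient implements Newman's assortativity coefficient for a categorical attribute,
$$r = \frac{\sum_i e_{ii} - \sum_i a_i b_i}{1 - \sum_i a_i b_i},$$
where $e_{ij}$ is the fraction of edges joining a node with section $i$ to a node with section $j$, and $a_i$ and $b_i$ are the row and column sums of the mixing matrix. A value of 1 means authors only collaborate within their own section, while 0 corresponds to random mixing.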
As with Reuters, the unfiltered New York Times graph can be seen below.
In the 3 figures below the New York Times graph is visualized. The first figure is the entire graph colored by author section, the second figure is the largest connected component colored by section, and the last figure is colored by the Louvain communities.
random.seed(42)
scale_nodes(nyt_graph)
nyt_config = {"zoom": 0.85,
"node_gravity": 0.2,
"node_size": 30,
"link_width": 0.7,
"node_fill_color": "#123123",
"node_stroke_color": "#000000"}
_, _ = visualize(nyt_graph, config=nyt_config)
_, _ = visualize(nyt_gcc, config=nyt_config)
_ = display_louvain(nyt_gcc, config=nyt_config)
The The New York Times graph was partitioned into 10 communities
nyt_sections = nx.get_node_attributes(nyt_gcc, "section")
print(nyt_sections["Maggie Haberman"])
print(nyt_sections["Glenn Kenny"])
print(nyt_sections["Eric Schmitt"])
print(nyt_sections["Sandra Stevenson"])
print(nyt_sections["Jesse Mckinley"])
print(nyt_sections["Jon Caramanica"])
us
movies
world
briefing
nyregion
arts
By removing the authors with fewer than 5 articles from the New York Times graph, we remove a majority of the "authors" from the large blob in the center, which represents the movie section of the paper; many of these "authors" turned out to be actors or other celebrities who may just have been interviewed in the article. Comparing the section and Louvain partitions, there are only subtle differences, which could indicate that The New York Times' collaboration structure follows its sections closely.
Now we perform a randomization experiment based on the modularity of the two graphs and compare the modularity of the two partitions for each graph.
def randomization_experiment(graph: nx.Graph, edge_swaps: int, partition: dict, trials: int = 200):
louvain = community.best_partition(graph)
print(f"Calculating modularities of randomized versions of {graph.name}")
modularities = np.zeros(trials)
for i in trange(trials):
unfrozen_graph = nx.Graph(graph)
randomized_graph = nx.double_edge_swap(unfrozen_graph, edge_swaps, edge_swaps * 5)
modularities[i] = modularity(randomized_graph, partition)
mu, sigma = modularities.mean(), modularities.std()
section_modularity = stats.modularity(graph, partition)
louvain_modularity = stats.modularity(graph, louvain)
x = np.linspace(modularities.min()-3*sigma, modularities.max()+3*sigma, 2000)
plt.hist(modularities, bins=10, rwidth=0.9, density=True, label="Randomized modularities", color="lightblue")
plt.plot(x, norm.pdf(x, loc=mu, scale=sigma))
plt.axvline(section_modularity, color="red", label="Section modularity")
plt.axvline(louvain_modularity, color="darkgreen", label="Louvain modularity")
plt.legend()
plt.xlabel("Modularity")
plt.title(f"Modularity experiment of {graph.name}")
plt.savefig(f"images/{graph.name.replace(' ', '_').lower()}_mod.png")
plt.show()
print(f"Mean of randomized modularities: {mu:.3f}, standard deviation of randomized modularities: {sigma:.3f}")
print(f"Section partition modularity: {section_modularity:.3f}")
print(f"Louvain partition modularity: {louvain_modularity:.3f}")
randomization_experiment(reuters_gcc, 20_000, nx.get_node_attributes(reuters_gcc, "section"), 500)
randomization_experiment(nyt_gcc, 15_000, nx.get_node_attributes(nyt_gcc, "section"), 500)
Calculating modularities of randomized versions of Reuters graph
Mean of randomized modularities: 0.019, standard deviation of randomized modularities: 0.003
Section partition modularity: 0.233
Louvain partition modularity: 0.533
Calculating modularities of randomized versions of The New York Times graph
Mean of randomized modularities: 0.037, standard deviation of randomized modularities: 0.003
Section partition modularity: 0.494
Louvain partition modularity: 0.524
For both graphs the randomized modularities are only slightly above 0. For the Reuters graph the modularity of the section partition is $0.233$ versus $0.533$ for the Louvain partition, which was to be expected, as the section partition looked much more random than the Louvain partition. For the New York Times graph the section and Louvain modularities are almost identical, with the Louvain modularity slightly edging out the section partition.
An explanation for why the New York Times section partition is so similar to the Louvain partition is likely that the section partition consists of 21 sections while the Louvain partition consists of only 10 communities. The Reuters section partition, on the other hand, consists of 39 sections against 12 Louvain communities, meaning the Reuters sections were condensed much more than the New York Times sections.
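To make the comparison with the randomized baseline concrete, the observed modularities can be expressed as z-scores relative to the randomized distribution (a small sketch using the printed Reuters numbers above):
# z-scores of the observed modularities relative to the randomized baseline (Reuters numbers from the output above)
mu, sigma = 0.019, 0.003
for name, observed in [("Section", 0.233), ("Louvain", 0.533)]:
    print(f"{name} partition: z = {(observed - mu) / sigma:.0f}")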
For the text analysis, we used word clouds with tf-idf and sentiment analysis with a transformer model.
Construct the sentiment pipeline and load the data from the processed files. We drop the date, title, and publication columns, as they are not needed for the text analysis.
sentiment_pipeline = pipeline(model="distilbert-base-uncased-finetuned-sst-2-english", revision="af0f99b", truncation=True,device=0)
nyt_df = pd.read_json("data/processed/nyt.json")
reuters_df = pd.read_json("data/processed/reuters.json")
nyt_articles = nyt_df.drop(columns = ["date", "title", "publication"])
reuters_articles = reuters_df.drop(columns = ["date", "title", "publication"])
Gather the sections from the two agencies as well as the number of articles in each section. We sort the sections by the number of articles in them. We will use this order later.
Then we define the tokenizer used to split the articles into tokens. We use the stopwords from the wordcloud library and add a few words that are common in the articles but don't add any value to the analysis.
nyt_sections = {}
for section in nyt_df["section"].unique():
nyt_sections[section] = len(nyt_df[nyt_df['section'] == section])
nyt_sections = sorted(nyt_sections, key=nyt_sections.get, reverse=True)
reuters_sections = {}
for section in reuters_df["section"].unique():
reuters_sections[section] = len(reuters_df[reuters_df['section'] == section])
reuters_sections = sorted(reuters_sections, key=reuters_sections.get, reverse=True)
class Tokenizer:
stopwords = wordcloud.STOPWORDS
stopwords.update(["mr","dr", "ms", "said"])
pattern = re.compile("[^a-z ]+")
def tokenize(self, text: str) -> list[str]:
text = text.lower()
text = self.pattern.sub("", text)
words = text.split()
words = [word for word in words if word not in self.stopwords]
return words
Gather the articles from each section into a list of lists. Each list contains the tokens of the articles in the section. We use the tokenizer defined above to tokenize the articles.
nyt_section_articles = [[] for _ in range(len(nyt_sections))]
reuters_section_articles = [[] for _ in range(len(reuters_sections))]
tokenizer = Tokenizer()
for i, row in nyt_articles.iterrows():
section = row["section"]
article = row["article"]
if not (article is np.nan):
tokens = tokenizer.tokenize(article)
nyt_section_articles[nyt_sections.index(section)].extend(tokens)
for i, row in reuters_articles.iterrows():
section = row["section"]
article = row["article"]
if not (pd.isnull(article)):
tokens = tokenizer.tokenize(article)
reuters_section_articles[reuters_sections.index(section)].extend(tokens)
With the tokenized articles we can now find the top 5 terms for each section using nltk's FreqDist and print them.
nyt_top5_terms = []
print("-----New York Times-----")
for i, section in enumerate(nyt_sections):
print(f"Top 5 terms for {section}")
counts = nltk.FreqDist(nyt_section_articles[i])
top5 = counts.most_common(5)
nyt_top5_terms.append([term for term, _ in top5])
print(top5)
print("")
-----New York Times-----
Top 5 terms for us [('trump', 59005), ('president', 34654), ('house', 23602), ('people', 23415), ('one', 23156)]
Top 5 terms for world [('government', 15437), ('united', 14673), ('people', 14090), ('one', 13869), ('states', 12109)]
Top 5 terms for business [('company', 8178), ('new', 7115), ('will', 6521), ('one', 5721), ('trump', 5639)]
Top 5 terms for nyregion [('new', 11770), ('york', 7503), ('city', 6374), ('police', 5638), ('one', 4914)]
Top 5 terms for briefing [('new', 4928), ('briefing', 4610), ('us', 4574), ('trump', 4306), ('president', 4215)]
Top 5 terms for movies [('movie', 2767), ('one', 2187), ('film', 2118), ('movies', 1300), ('time', 1262)]
Top 5 terms for arts [('new', 3930), ('one', 2706), ('music', 1924), ('art', 1858), ('album', 1738)]
Top 5 terms for technology [('facebook', 3901), ('company', 3830), ('companies', 2517), ('new', 2477), ('people', 2177)]
Top 5 terms for sports [('game', 2815), ('first', 2346), ('one', 2310), ('will', 1925), ('team', 1914)]
Top 5 terms for well [('children', 1572), ('parents', 1033), ('child', 798), ('one', 668), ('may', 665)]
Top 5 terms for climate [('climate', 2315), ('change', 1113), ('new', 854), ('states', 775), ('trump', 768)]
Top 5 terms for nytnow [('briefing', 915), ('us', 694), ('new', 670), ('want', 649), ('one', 618)]
Top 5 terms for health [('health', 1704), ('people', 1113), ('new', 876), ('drug', 867), ('care', 842)]
Top 5 terms for opinion [('gail', 1425), ('trump', 1423), ('will', 967), ('bret', 930), ('think', 881)]
Top 5 terms for realestate [('slide', 360), ('show', 360), ('weeks', 270), ('properties', 266), ('house', 253)]
Top 5 terms for style [('people', 635), ('one', 613), ('will', 448), ('new', 447), ('time', 383)]
Top 5 terms for upshot [('health', 730), ('people', 643), ('trump', 619), ('states', 514), ('new', 492)]
Top 5 terms for espanol [('de', 11866), ('la', 5582), ('que', 5536), ('en', 5206), ('el', 4322)]
Top 5 terms for magazine [('patient', 550), ('one', 437), ('blood', 338), ('doctor', 331), ('back', 307)]
Top 5 terms for theater [('broadway', 452), ('theater', 426), ('play', 419), ('new', 353), ('show', 346)]
Top 5 terms for learning [('students', 675), ('times', 319), ('one', 295), ('think', 276), ('will', 255)]
reuters_top5_terms = []
print("-----Reuters-----")
for i, section in enumerate(reuters_sections):
print(f"Top 5 terms for {section}")
counts = nltk.FreqDist(reuters_section_articles[i])
top5 = counts.most_common(5)
reuters_top5_terms.append([term for term, _ in top5])
print(top5)
print("")
-----Reuters-----
Top 5 terms for Business News [('us', 41118), ('will', 36109), ('percent', 35902), ('year', 34127), ('billion', 32223)]
Top 5 terms for World News [('us', 35603), ('will', 30037), ('government', 26948), ('reuters', 26774), ('people', 21721)]
Top 5 terms for Politics [('trump', 19439), ('us', 12234), ('house', 8719), ('president', 8172), ('republican', 6170)]
Top 5 terms for Deals [('billion', 7179), ('deal', 4801), ('percent', 4798), ('company', 4627), ('reuters', 4599)]
Top 5 terms for Commodities [('oil', 9091), ('year', 4722), ('will', 4443), ('reuters', 4055), ('million', 4004)]
Top 5 terms for Technology News [('company', 3116), ('will', 2983), ('reuters', 2928), ('billion', 2867), ('year', 2404)]
Top 5 terms for Intel [('percent', 2770), ('reuters', 2576), ('will', 2363), ('year', 1963), ('new', 1785)]
Top 5 terms for Market News [('us', 2895), ('new', 2245), ('will', 1927), ('reuters', 1703), ('sp', 1526)]
Top 5 terms for Bonds News [('us', 2690), ('year', 2516), ('market', 2225), ('will', 2135), ('bank', 2026)]
Top 5 terms for Environment [('will', 1962), ('reuters', 1790), ('new', 1610), ('us', 1541), ('oil', 1491)]
Top 5 terms for U.S. Legal News [('us', 3092), ('new', 1539), ('will', 1507), ('reuters', 1491), ('trump', 1409)]
Top 5 terms for Japan [('japan', 2166), ('will', 1629), ('billion', 1560), ('japanese', 1552), ('reuters', 1380)]
Top 5 terms for Brexit [('brexit', 5292), ('eu', 5161), ('deal', 3953), ('will', 2870), ('britain', 2843)]
Top 5 terms for Davos [('us', 2398), ('percent', 2149), ('year', 1788), ('trade', 1721), ('will', 1578)]
Top 5 terms for Health News [('health', 1753), ('us', 1386), ('new', 1350), ('reuters', 1279), ('drug', 1240)]
Top 5 terms for Asia [('reuters', 1320), ('year', 1187), ('will', 1162), ('china', 1073), ('government', 925)]
Top 5 terms for U.S. [('us', 2178), ('new', 1324), ('reuters', 1060), ('people', 1053), ('york', 846)]
Top 5 terms for Financials [('bank', 1405), ('banks', 921), ('will', 904), ('reuters', 822), ('year', 770)]
Top 5 terms for Wealth [('percent', 1506), ('billion', 1191), ('bank', 1188), ('us', 1150), ('fund', 1081)]
Top 5 terms for Company News [('us', 1489), ('will', 1073), ('reuters', 1042), ('new', 824), ('year', 809)]
Top 5 terms for Hot Stocks [('ftse', 909), ('percent', 777), ('shares', 758), ('index', 600), ('year', 554)]
Top 5 terms for Entertainment News [('film', 714), ('new', 684), ('will', 652), ('reuters', 590), ('weinstein', 586)]
Top 5 terms for Sports News [('games', 868), ('olympic', 697), ('will', 652), ('reuters', 598), ('world', 573)]
Top 5 terms for Credit RSS [('us', 710), ('reuters', 698), ('will', 575), ('year', 457), ('new', 437)]
Top 5 terms for Sustainable Business [('will', 771), ('climate', 761), ('reuters', 663), ('companies', 638), ('new', 625)]
Top 5 terms for Funds News [('reuters', 570), ('us', 545), ('new', 524), ('billion', 522), ('will', 499)]
Top 5 terms for Foreign Exchange Analysis [('sterling', 1011), ('brexit', 853), ('pound', 764), ('currency', 619), ('percent', 603)]
Top 5 terms for Fintech [('bitcoin', 913), ('reuters', 698), ('cryptocurrency', 684), ('new', 628), ('will', 584)]
Top 5 terms for Healthcare [('people', 642), ('coronavirus', 619), ('reuters', 474), ('will', 431), ('health', 394)]
Top 5 terms for Cyber Risk [('us', 827), ('security', 823), ('cyber', 737), ('reuters', 536), ('data', 535)]
Top 5 terms for Cyclical Consumer Goods [('reuters', 455), ('will', 385), ('us', 301), ('people', 298), ('new', 272)]
Top 5 terms for Energy [('oil', 932), ('us', 570), ('will', 389), ('reuters', 373), ('crude', 355)]
Top 5 terms for Basic Materials [('gold', 559), ('week', 376), ('prices', 358), ('demand', 346), ('reuters', 342)]
Top 5 terms for Big Story 10 [('reuters', 429), ('year', 283), ('will', 275), ('people', 271), ('thomson', 217)]
Top 5 terms for Special Reports [('reuters', 1572), ('one', 1036), ('us', 931), ('according', 918), ('police', 884)]
Top 5 terms for Supreme Court [('court', 1209), ('trump', 656), ('supreme', 569), ('kavanaugh', 568), ('senate', 563)]
Top 5 terms for Breakingviews [('breakingviews', 455), ('reuters', 434), ('financial', 320), ('new', 290), ('york', 235)]
Top 5 terms for Lifestyle [('reuters', 185), ('percent', 183), ('year', 174), ('last', 142), ('will', 139)]
Top 5 terms for Corrections News [('us', 218), ('will', 178), ('reuters', 166), ('new', 146), ('oil', 118)]
We define our stopwords and tokenizer again so the same setup can be used with the scikit-learn vectorizer.
stopwords = wordcloud.STOPWORDS
stopwords.update(["mr","dr", "ms", "said"])
pattern = re.compile("[^a-z ]+")
cloud = WordCloud(background_color="white", width=1920, height=1080)
def tokenize(text: str) -> list[str]:
text = text.lower()
text = pattern.sub("", text)
words = text.split()
words = [word for word in words if word not in stopwords]
return words
We use scikit-learn's TfidfVectorizer to find the TF-IDF for each term in each section, and then use the wordcloud library to generate a word cloud for each section.
Comment out savefig and uncomment show to display the word clouds in the notebook.
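As a reminder, the TF-IDF weight of a term $t$ in a document $d$ is, in its basic form,
$$\text{tf-idf}(t, d) = \text{tf}(t, d) \cdot \log\frac{N}{\text{df}(t)},$$
where $\text{tf}(t, d)$ is how often $t$ occurs in $d$, $\text{df}(t)$ is the number of documents containing $t$, and $N$ is the total number of documents. TfidfVectorizer applies a smoothed and normalized variant of this formula.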
First for NYT.
for sec in nyt_sections:
nyt_section_articles = nyt_articles[nyt_articles["section"] == sec]["article"].tolist()
nyt_tfidfs_section_vec = TfidfVectorizer(tokenizer=tokenize)
nyt_tfidfs_section_vec.fit(nyt_section_articles)
nyt_tfidfs_section = nyt_tfidfs_section_vec.transform(nyt_section_articles).toarray()
nyt_tfidfs_section = pd.Series(nyt_tfidfs_section.mean(axis=0), index=nyt_tfidfs_section_vec.get_feature_names_out()).sort_values(ascending=False).to_dict()
CLOUD = cloud.generate_from_frequencies(nyt_tfidfs_section)
plt.imshow(CLOUD)
plt.axis("off")
plt.title(f"NYT TF-IDF {sec}")
plt.savefig(f"images/NYT_TFIDF_{sec}.png", bbox_inches="tight", dpi=300)
#plt.show()
Then for Reuters.
for sec in reuters_sections:
reuters_section_articles = reuters_articles[reuters_articles["section"] == sec]["article"].tolist()
reuters_tfidfs_section_vec = TfidfVectorizer(tokenizer=tokenize)
reuters_tfidfs_section_vec.fit(reuters_section_articles)
reuters_tfidfs_section = reuters_tfidfs_section_vec.transform(reuters_section_articles).toarray()
reuters_tfidfs_section = pd.Series(reuters_tfidfs_section.mean(axis=0), index=reuters_tfidfs_section_vec.get_feature_names_out()).sort_values(ascending=False).to_dict()
CLOUD = cloud.generate_from_frequencies(reuters_tfidfs_section)
plt.imshow(CLOUD)
plt.axis("off")
plt.title(f"Reuters TF-IDF {sec}")
plt.savefig(f"images/Reuters_TFIDF_{sec}.png", bbox_inches="tight", dpi=300)
#plt.show()
We now take a look at the TF-IDF for each agency as a whole.
# join with a space so the last word of one article doesn't merge with the first word of the next
nyt_all_articles = " ".join(nyt_articles["article"])
reuters_all_articles = " ".join(reuters_articles["article"])
# TF-IDF on all articles
nyt_tfidf_vectorizer = TfidfVectorizer(tokenizer=tokenize)
nyt_tfidf_vectorizer.fit([nyt_all_articles])
nyt_tfidf = nyt_tfidf_vectorizer.transform([nyt_all_articles])
nyt_tfidf = nyt_tfidf.toarray()[0]
nyt_tfidf = pd.Series(nyt_tfidf, index=nyt_tfidf_vectorizer.get_feature_names_out())
nyt_tfidf = nyt_tfidf.sort_values(ascending=False)
nyt_tfidf = nyt_tfidf.to_dict()
reuters_tfidf_vectorizer = TfidfVectorizer(tokenizer=tokenize)
reuters_tfidf_vectorizer.fit([reuters_all_articles])
reuters_tfidf = reuters_tfidf_vectorizer.transform([reuters_all_articles])
reuters_tfidf = reuters_tfidf.toarray()[0]
reuters_tfidf = pd.Series(reuters_tfidf, index=reuters_tfidf_vectorizer.get_feature_names_out())
reuters_tfidf = reuters_tfidf.sort_values(ascending=False)
reuters_tfidf = reuters_tfidf.to_dict()
CLOUD = cloud.generate_from_frequencies(nyt_tfidf)
plt.imshow(CLOUD)
plt.axis("off")
plt.title(f"NYT TF-IDF")
plt.savefig(f"images/NYT_TFIDF.png", bbox_inches="tight", dpi=300)
plt.show()
CLOUD = cloud.generate_from_frequencies(reuters_tfidf)
plt.imshow(CLOUD)
plt.axis("off")
plt.title(f"Reuters TF-IDF")
plt.savefig(f"images/Reuters_TFIDF.png", bbox_inches="tight", dpi=300)
plt.show()
Starting the sentiment analysis, we go over the articles, calculate the sentiment for each article, and add it to the agency dataframes. The sentiment is expressed as either a positive or negative label together with a score between 0 and 1 that gives the confidence of the label.
Before we can do the sentiment analysis we have to reconstruct the articles so they are in a format that fits the pipeline. The pipeline needs a list of strings, where each string is a sentence or article. However, we found that whole articles were too long for the pipeline to handle, so we decided to only analyze the first 512 words of each article.
The analysis pipeline has been set up to use the GPU, which speeds up the analysis significantly. However, a GPU is not required, and the pipeline will work without it; this can be modified in the first cell of the notebook. When done, the sentiment analysis is saved to a JSON file for later use.
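For instance, to run the same model on the CPU instead, the pipeline can be built with device=-1 (a minimal sketch mirroring the call in the first cell):
# CPU fallback: device=-1 tells the transformers pipeline to stay on the CPU
sentiment_pipeline = pipeline(
    model="distilbert-base-uncased-finetuned-sst-2-english",
    revision="af0f99b",
    truncation=True,
    device=-1,
)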
# reset index of dataframe
nyt_articles = nyt_articles.reset_index(drop=True)
reuters_articles = reuters_articles.reset_index(drop=True)
shortArticles = []
for article in nyt_articles["article"]:
articleShort = " ".join(article.split(" ")[:512])
shortArticles.append(articleShort)
sentiments = sentiment_pipeline(shortArticles, padding=True, truncation=True)
nyt_articles["Sentiment"] = ""
nyt_articles["Sen Score"] = ""
for i in range(len(sentiments)):
sentiment = sentiments[i]
nyt_articles["Sentiment"][i] = sentiment["label"]
nyt_articles["Sen Score"][i] = sentiment["score"]
nyt_articles.to_json("data/processed/nyt_language_sentiment.json")
shortArticles = []
for article in reuters_articles["article"]:
articleShort = " ".join(article.split(" ")[:512])
shortArticles.append(articleShort)
sentiments = sentiment_pipeline(shortArticles, padding=True, truncation=True)
reuters_articles["Sentiment"] = ""
reuters_articles["Sen Score"] = ""
for i in range(len(sentiments)):
sentiment = sentiments[i]
reuters_articles["Sentiment"][i] = sentiment["label"]
reuters_articles["Sen Score"][i] = sentiment["score"]
reuters_articles.to_json("data/processed/reuters_language_sentiment.json")
For this type of dataset it is not optimal that we only have positive and negative labels; we would like a neutral label as well. We therefore use the confidence score to decide whether an article is neutral: if the score is less than 0.9, we consider the article neutral. This is done in the following cell.
nyt_articles = pd.read_json("data/processed/nyt_language_sentiment.json")
reuters_articles = pd.read_json("data/processed/reuters_language_sentiment.json")
# all articles with sen score less than 0.9 are considered neutral
# all articles with a sentiment score below 0.9 are considered neutral
nyt_articles["Sen Score"] = nyt_articles["Sen Score"].astype(float)
nyt_articles["Sentiment"] = nyt_articles["Sentiment"].astype(str)
nyt_articles.loc[nyt_articles["Sen Score"] < 0.9, "Sentiment"] = "NEUTRAL"
nyt_articles["Sentiment"] = nyt_articles["Sentiment"].astype("category")
reuters_articles["Sen Score"] = reuters_articles["Sen Score"].astype(float)
reuters_articles["Sentiment"] = reuters_articles["Sentiment"].astype(str)
reuters_articles.loc[reuters_articles["Sen Score"] < 0.9, "Sentiment"] = "NEUTRAL"
reuters_articles["Sentiment"] = reuters_articles["Sentiment"].astype("category")
The following are the results of the sentiment analysis for the agencies as a whole. We have calculated the average sentiment score, the max and min score, and the distribution of the sentiment labels.
reuters_sentiments_score = np.asarray(reuters_articles["Sen Score"].tolist())
reuters_sentiment_avg_score = reuters_sentiments_score.mean()
reuters_sentiment_max = reuters_sentiments_score.max()
reuters_sentiment_min = reuters_sentiments_score.min()
print(f"Reuters Sentiment Average Score: {reuters_sentiment_avg_score}")
print(f"Reuters Sentiment Max: {reuters_sentiment_max}")
print(f"Reuters Sentiment Min: {reuters_sentiment_min}")
print("")
reuters_sentiments = reuters_articles["Sentiment"]
print(f"Reuters Sentiment Distribution: {reuters_sentiments.value_counts()}")
print("\n")
nyt_sentiments_score = np.asarray(nyt_articles["Sen Score"].tolist())
nyt_sentiment_avg_score = nyt_sentiments_score.mean()
nyt_sentiment_max = nyt_sentiments_score.max()
nyt_sentiment_min = nyt_sentiments_score.min()
print(f"NYT Sentiment Average Score: {nyt_sentiment_avg_score}")
print(f"NYT Sentiment Max: {nyt_sentiment_max}")
print(f"NYT Sentiment Min: {nyt_sentiment_min}")
print("")
nyt_sentiments = nyt_articles["Sentiment"]
print(f"NYT Sentiment Distribution: {nyt_sentiments.value_counts()}")
Reuters Sentiment Average Score: 0.9631492433844178
Reuters Sentiment Max: 0.9997518659000001
Reuters Sentiment Min: 0.5003492236
Reuters Sentiment Distribution:
Sentiment
NEGATIVE    47651
NEUTRAL      5812
POSITIVE     3666
Name: count, dtype: int64

NYT Sentiment Average Score: 0.950676645385258
NYT Sentiment Max: 0.9997653365
NYT Sentiment Min: 0.5000764728
NYT Sentiment Distribution:
Sentiment
NEGATIVE    17376
NEUTRAL      3384
POSITIVE     3036
Name: count, dtype: int64
Earlier we found that the TF-IDF analyses of the NYT us section and the Reuters Politics section had a high similarity. We therefore decided to look at the sentiment analysis of these two sections as well, using the same principle as before.
shortArticles = []
# all articles in section US from NYT
for article in nyt_articles[nyt_articles["section"] == "us"]["article"]:
articleShort = " ".join(article.split(" ")[:512])
shortArticles.append(articleShort)
sentiments = sentiment_pipeline(shortArticles, padding=True, truncation=True)
# move the sentiments to np.array
nyt_score_us = np.asarray([sentiment["score"] for sentiment in sentiments])
nyt_label_us = np.asarray([sentiment["label"] for sentiment in sentiments])
# every score less than 0.9 is considered neutral
nyt_label_us[nyt_score_us < 0.9] = "NEUTRAL"
nyt_label_us = pd.DataFrame(nyt_label_us)
shortArticles = []
# all articles in section Politics from Reuters
for article in reuters_articles[reuters_articles["section"] == "Politics"]["article"]:
articleShort = " ".join(article.split(" ")[:512])
shortArticles.append(articleShort)
sentiments = sentiment_pipeline(shortArticles, padding=True, truncation=True)
# move the sentiments to np.array
reuters_score_poli = np.asarray([sentiment["score"] for sentiment in sentiments])
reuters_label_poli = pd.DataFrame(np.asarray([sentiment["label"] for sentiment in sentiments]))
# every score less than 0.9 is considered neutral
reuters_label_poli[reuters_score_poli < 0.9] = "NEUTRAL"
reuters_label_poli = pd.DataFrame(reuters_label_poli)
With the sentiment analysis of the two sections we can now compare the results.
print(f"NYT_US Sentiment Average Score: {nyt_score_us.mean()}")
print(f"NYT_US Sentiment Max: {nyt_score_us.max()}")
print(f"NYT_US Sentiment Min: {nyt_score_us.min()}")
print("")
print(f"NYT_US Sentiment Distribution: \n{nyt_label_us.value_counts()}")
print("\n")
print(f"Reuters_Politics Sentiment Average Score: {reuters_score_poli.mean()}")
print(f"Reuters_Politics Sentiment Max: {reuters_score_poli.max()}")
print(f"Reuters_Politics Sentiment Min: {reuters_score_poli.min()}")
print("")
print(f"Reuters_Politics Sentiment Distribution: \n{reuters_label_poli.value_counts()}")
NYT_US Sentiment Average Score: 0.9508135075220069
NYT_US Sentiment Max: 0.9997653365135193
NYT_US Sentiment Min: 0.5001206398010254
NYT_US Sentiment Distribution:
NEGATIVE    5938
NEUTRAL     1118
POSITIVE     771
Name: count, dtype: int64

Reuters_Politics Sentiment Average Score: 0.9583817098263969
Reuters_Politics Sentiment Max: 0.9996844530105591
Reuters_Politics Sentiment Min: 0.5024705529212952
Reuters_Politics Sentiment Distribution:
NEGATIVE    2472
NEUTRAL      353
POSITIVE     212
Name: count, dtype: int64
We find that the sentiment distributions for the two sections are quite similar. The most interesting observation is that the scores follow the overall sentiment of the respective agencies.
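To make that comparison concrete, the label proportions (rather than raw counts) can be put side by side (a sketch assuming the variables from the cells above are still in memory):
def label_proportions(labels):
    # accepts a Series, a one-column DataFrame, or a numpy array of sentiment labels
    return pd.Series(np.asarray(labels).ravel()).value_counts(normalize=True).round(3).to_dict()

print("NYT us section:  ", label_proportions(nyt_label_us))
print("NYT overall:     ", label_proportions(nyt_articles["Sentiment"]))
print("Reuters Politics:", label_proportions(reuters_label_poli))
print("Reuters overall: ", label_proportions(reuters_articles["Sentiment"]))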
To compute the sentiments on a per-author basis, we start by computing the sentiments for each article in the Reuters Politics section and the NYT us section.
nyt_df = pd.read_json("data/processed/nyt.json")
reuters_df = pd.read_json("data/processed/reuters.json")
nyt_us = nyt_df[nyt_df["section"] == "us"]
reuters_politics = reuters_df[reuters_df["section"] == "Politics"]
classifier = transformers.pipeline(model="distilbert-base-uncased-finetuned-sst-2-english", truncation=True, device=0)
def get_sentiments(df):
sentiments = []
for i, row in tqdm.tqdm(df.iterrows()):
s = classifier(row["article"])[0]
if s["score"] < 0.9:
s["label"] = "NEUTRAL"
sentiments.append(s)
return sentiments
nyt_sentiments = get_sentiments(nyt_us)
7827it [01:23, 94.15it/s]
reuters_sentiments = get_sentiments(reuters_politics)
3037it [00:25, 117.63it/s]
Saving the sentiments so we don't have to compute them again
with open("nyt_sentiments.pkl", "wb") as f:
pickle.dump(nyt_sentiments, f)
with open("reuters_sentiments.pkl", "wb") as f:
pickle.dump(reuters_sentiments, f)
sentiment_map = {"NEGATIVE": 0, "NEUTRAL": 1, "POSITIVE": 2}
With the sentiments for all the articles, we count the number of times an author has co-authored an article with each sentiment. With these counts, we can then determine the sentiment in which the author has written the majority of their articles.
def get_author_sentiments(df, sentiments):
authors = defaultdict(lambda: [0, 0, 0])
c = 0
for i, row in df.iterrows():
s = sentiment_map[sentiments[c]["label"]]
c += 1
for author in row["authors"]:
authors[author][s] += 1
return authors
nyt_author_sentiments = get_author_sentiments(nyt_us, nyt_sentiments)
reuters_author_sentiments = get_author_sentiments(reuters_politics, reuters_sentiments)
def plot_major_sentiment(author_sentiments):
sentiment_counts = [0, 0, 0]
for sentiments in author_sentiments.values():
sentiment_counts[np.argmax(sentiments)] += 1
fig, ax = plt.subplots()
bars = ax.bar(range(3), sentiment_counts)
ax.set_xticks(range(3), ["Negative", "Neutral", "Positive"])
ax.bar_label(bars)
plot_major_sentiment(nyt_author_sentiments)
plt.title("NYT most frequent author sentiment")
plt.show()
plot_major_sentiment(reuters_author_sentiments)
plt.title("Reuters most frequent author sentiment")
plt.show()
Originally we wanted to explore the possible differences between more opposing news agencies, for example Fox News and The New York Times instead of Reuters and The New York Times. Investigating the dataset revealed that there were very few collaborations within the right-leaning news agencies, and therefore not much of a network to build.
We thought that comparing newspapers of different political orientations would have made for a more interesting text analysis; as it turned out, we found very little difference between Reuters and the NYT when comparing texts, at least with the tools available to us.
We think that the network analysis went really well, with a lot of interesting plots, and it was cool to see how the NYT network splits into a larger and a smaller component.
What we think is lacking is the connection between the network analysis and the text analysis. In the text analysis we ended up comparing sections between newspapers, because we wanted to look at how different newspapers write about the same subject. With more time, it would have been interesting to compare more sections within each newspaper, and also to compare the writing of the different communities, which would have provided a more explicit connection between the network analysis and the text analysis.