import community
import os
import copy
import random
import pandas as pd
import polars as pl
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
from src import process_data, utils, stats
from tqdm.notebook import trange
from scipy.stats import norm
from pathlib import Path
from typing import Union
from netwulf import visualize, draw_netwulf
import nltk
import re
import wordcloud
from wordcloud import WordCloud
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import pipeline
import transformers
import pickle
import tqdm
from collections import defaultdict
plt.style.use("ggplot")
def display_louvain(graph: nx.Graph, config: dict):
graph = nx.Graph(graph)
partition = community.best_partition(graph, random_state=42)
print(f"The {graph.name} was partitioned into {len(list(set(partition.values())))} communities")
nx.set_node_attributes(graph, partition, "group")
network, _ = visualize(graph, config=config)
return network
def scale_nodes(graph: nx.Graph):
output = nx.get_node_attributes(graph, "output")
for n, data in graph.nodes(data=True):
data["size"] = output[n]
The data used for this project is a subset of All the News 2.0, which contains 2,677,878 news articles from American publications spanning January 1, 2016 to April 2, 2020.
We chose to work with articles from the two publications with the most articles in the dataset: Reuters with 840,094 articles and The New York Times (NYT) with 252,259 articles.
The dataset contains 10 features: authors, title, article text, url, section, publication, and four features describing the time of publication. The features used in this project are authors, article text, and section.
We chose this dataset because we wanted to work with news articles, which have an apparent connection between a collaboration network and text and therefore fit well with the tools we have used throughout this course. This particular dataset was chosen because of its size and because it covers American publications, which also makes the text analysis more feasible, as tools like stop word lists and language models are readily available for English.
Our goal was to show how authors collaborate within the two newspapers and how the newspapers portray different subjects. We condensed the most essential points onto the front page to make for an easy read, refer to the data, network, and text pages for more details, and refer to the explainer notebook for all the details.
The dataset has the following 10 features:
We chose to keep author, title, article, url, section, and publication.
The most important features are author, section, and article, as we cannot do the analysis without them.
Our preprocessing works in the following steps:
Step 4 is performed because we are mainly interested in the largest sections: the analysis will be more accurate, and a section in Reuters with many articles is likely to also be a section in The New York Times. This matters since we want to look at how the publications portray the same subjects. An added benefit of step 4 is that the data contained quite a few sections with only a single article; judging by their names, these are not real sections but must have been assigned by mistake during data gathering.
Step 5 also has a dual purpose: we want to look at authors that are representative of their publication, and if they have written very few articles, they are probably freelancers. Secondly, some of the articles in The New York Times mistakenly list celebrities, such as John Cena, as authors. By removing authors with fewer than 5 articles, we get rid of most of the celebrities that are clearly not authors.
Step 6 is taken because these articles do not contribute to the collaboration graph.
def remove_erroneous_samples(df: pl.LazyFrame, publication: str) -> pl.DataFrame:
# only select articles from the specific publication
articles = df.filter(pl.col("publication") == publication)
# remove articles that either don't have any authors, don't have any text, or don't have a section
articles = articles.drop_nulls(["author", "section", "article"])
# remove articles that have fewer than two authors
articles = articles.filter(pl.col("author").str.contains(","))
articles = articles.collect()
# remove articles whose section has fewer than 100 articles
counts_df = articles.groupby("section").agg(pl.count())
counts_df = counts_df.filter(pl.col("count") > 100)
articles = articles.filter(pl.col("section").is_in(counts_df["section"]))
return articles
# the authors are represented as a comma-separated string.
# we split them into a list
def split_authors(df: pl.DataFrame) -> pl.DataFrame:
df = df.with_columns(
(
pl.col("author").apply(
lambda x: list(set(list(map(str.strip, x.title().split(", ")))))
)
).alias("authors")
)
df = df.drop("author")
return df
def remove_low_output_authors(df: pd.DataFrame, min_articles: int) -> pd.DataFrame:
author_output = {}
for i, row in df.iterrows():
for author in row["authors"]:
author_output[author] = author_output.get(author, 0) + 1
rows_to_remove = []
for i, row in df.iterrows():
authors = row["authors"]
authors = [a for a in authors if author_output[a] >= min_articles]
if len(authors) < 2:
rows_to_remove.append(i)
else:
df["authors"][i] = authors
df = df.drop(rows_to_remove)
return df
# load the data
load_path = Path("data/raw", "all-the-news-2-1.csv")
df = pl.scan_csv(load_path).select(["date", "author", "article", "title", "url", "section", "publication"])
min_articles = 5
# process
reuters_df = remove_erroneous_samples(df, "Reuters")
reuters_df = split_authors(reuters_df)
reuters_df = reuters_df.to_pandas()
reuters_df = remove_low_output_authors(reuters_df, min_articles)
nyt_df = remove_erroneous_samples(df, "The New York Times")
nyt_df = split_authors(nyt_df)
nyt_df = nyt_df.to_pandas()
nyt_df = remove_low_output_authors(nyt_df, min_articles)
# since we have a list of authors, we can no longer use a csv
# and instead use the json format
reuters_df.to_json("data/processed/reuters.json")
nyt_df.to_json("data/processed/nyt.json")
print(f"Reuters articles after preprocessing: {len(reuters_df)}")
print(f"NYT articles after preprocessing: {len(nyt_df)}")
Reuters articles after preprocessing: 57129
NYT articles after preprocessing: 23796
After processing, our data now has the following form:
reuters_df.head(2)
date | article | title | url | section | publication | authors | |
---|---|---|---|---|---|---|---|
0 | 2019-06-23 00:00:00 | (Reuters) - The success of Hudson’s Bay Co Exe... | Hudson's Bay's chairman's buyout bid pits reta... | https://www.reuters.com/article/hudsons-bay-ma... | Business News | Reuters | [Harry Brumpton, Jessica Dinapoli] |
1 | 2019-02-05 00:00:00 | LONDON (Reuters) - Britain’s financial service... | Exclusive: Britain's financial heartland unbow... | https://www.reuters.com/article/us-britain-eu-... | Business News | Reuters | [Andrew Macaskill, Simon Jessop] |
nyt_df.head(2)
date | article | title | url | section | publication | authors | |
---|---|---|---|---|---|---|---|
0 | 2016-01-13 22:02:33 | Last Friday a group of 15 cancer researchers c... | ‘Moonshot’ to Cure Cancer, to Be Led by Biden,... | http://www.nytimes.com/2016/01/14/health/moons... | health | The New York Times | [Gardiner Harris, Gina Kolata] |
1 | 2016-01-17 21:20:09 | WASHINGTON — For a year, Obama administration ... | 14 Testy Months Behind U.S. Prisoner Swap With... | http://www.nytimes.com/2016/01/18/us/politics/... | us | The New York Times | [David E. Sanger, Peter Baker] |
The All the News 2.0 dataset contains 2,677,878 articles, totaling 8.8 GB of data.
Next we'll calculate some more interesting statistics.
# load dataframe
reuters_df = pd.read_json("data/processed/reuters.json")
nyt_df = pd.read_json("data/processed/nyt.json")
reuters_df = pl.from_pandas(reuters_df)
nyt_df = pl.from_pandas(nyt_df)
print(f"The processed Reuters dataset has {len(reuters_df['section'].unique())} sections")
print(f"The processed New York Times dataset has {len(nyt_df['section'].unique())} sections")
The processed Reuters dataset has 39 sections
The processed New York Times dataset has 21 sections
# find top 10 sections from each publication
def get_top_k_sections(df: pl.DataFrame, k=10) -> pl.DataFrame:
return df.groupby("section").agg(pl.count()).top_k(k, by="count")
def get_all_sections(df: pl.DataFrame):
return df.groupby("section").agg(pl.count()).sort("count")
def plot_top_sections(df: pl.DataFrame, title: str, figsize=(7, 3)):
n_sections = len(df)
fig, ax = plt.subplots(figsize=figsize)
plt.bar(range(n_sections), df["count"])
plt.xticks(range(n_sections), df["section"])
plt.title(title)
plt.show()
# store all the sections in a file for further inspection
reuters_sections = get_all_sections(reuters_df)
reuters_sections.write_csv("data/processed/reuters_sections.csv")
reuters_top_sections = get_top_k_sections(reuters_df, 5)
plot_top_sections(reuters_top_sections, "Reuters top sections")
nyt_sections = get_all_sections(nyt_df)
nyt_sections.write_csv("data/processed/nyt_sections.csv")
nyt_top_sections = get_top_k_sections(nyt_df, 5)
plot_top_sections(nyt_top_sections, "NYT top sections")
def get_stats(df: pd.DataFrame):
stats = {}
stats["#articles"] = len(df)
stats["#sections"] = len(df.groupby("section").count())
authors = set()
article_lenghts = []
for i, row in df.iterrows():
authors = authors.union(set(row["authors"]))
article_lenghts.append(len(row["article"].split()))
stats["#authors"] = len(authors)
stats["avg_words"] = np.mean(article_lenghts)
stats["avg_articles_per_author"] = len(df) / len(authors)
return stats
def print_stats(stats: dict):
for k, v in stats.items():
print(f"{k}: {v}")
print_stats(get_stats(reuters_df.to_pandas()))
#articles: 57129
#sections: 39
#authors: 1951
avg_words: 621.9692100334331
avg_articles_per_author: 29.281906714505382
print_stats(get_stats(nyt_df.to_pandas()))
#articles: 23796
#sections: 21
#authors: 1318
avg_words: 1203.7454193982182
avg_articles_per_author: 18.0546282245827
article_lenghts = []
for i, row in nyt_df.to_pandas().iterrows():
article_lenghts.append(len(row["article"].split()))
fig, ax = plt.subplots()
ax.hist(article_lenghts, 25, rwidth=0.9)
ax.set_yscale("log")
plt.title("NYT words per article")
plt.show()
article_lenghts = []
for i, row in reuters_df.to_pandas().iterrows():
article_lenghts.append(len(row["article"].split()))
fig, ax = plt.subplots()
ax.hist(article_lenghts, 25, rwidth=0.9)
ax.set_yscale("log")
plt.title("Reuters words per article")
plt.show()
For both The New York Times and Reuters we construct networks based on article collaborations. A node represents an author, and a connection between two nodes represents a collaboration. The size of a node is scaled by the number of articles the author has published. The networks are undirected and unweighted, as the collaborative connection is mutual. With the two networks we can discern differences in how authors collaborate at the two news agencies.
def graph_stats(graph: nx.Graph):
print(f"{graph.name} has {graph.number_of_nodes()} nodes and {graph.number_of_edges()} edges")
degrees = [degree for _, degree in graph.degree]
print(f"Average degree: {np.mean(degrees)}")
print(f"Median degree: {np.median(degrees)}")
print(f"Average clustering coefficient: {nx.average_clustering(graph)}")
print(f"Section assortativity coefficient: {nx.attribute_assortativity_coefficient(graph, 'section')}")
def modularity(graph: nx.Graph, partitioning: dict):
communities = set(partitioning.values())
partitions = {c: [] for c in communities}
for node, community in partitioning.items():
partitions[community].append(node)
edges = graph.number_of_edges()
mod = 0
for partition in partitions.values():
community = graph.subgraph(partition)
lc = community.number_of_edges()
kc = sum([node[1] for node in graph.degree(partition)])
mod += ((lc / edges) - (kc / (2 * edges)) ** 2)
return mod
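For reference, the helper above computes the standard modularity of a partition,
$$M = \sum_{c} \left[ \frac{L_c}{L} - \left( \frac{k_c}{2L} \right)^2 \right],$$
where $L$ is the total number of edges in the graph, $L_c$ is the number of edges inside community $c$, and $k_c$ is the sum of the degrees of the nodes in $c$.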
def plot_degree_distribution(graph: nx.Graph):
degrees = {}
for _, degree in graph.degree:
degrees[degree] = degrees.get(degree, 0) + 1
x = sorted(degrees.keys())
y = [degrees[degree] for degree in x]
plt.plot(x, y, color=(0.2, 0.4, 0.6, 0.6), marker="o", markersize=4)
plt.title(f"Degree distribution of {graph.name}")
plt.xlabel("Degree")
plt.ylabel("Count")
plt.show()
def largest_component(graph: nx.Graph):
return graph.subgraph(max(nx.connected_components(graph), key=len))
def build_graph(data: pl.DataFrame) -> nx.Graph:
graph = nx.Graph(name=f"{data['publication'][0]} graph")
graph_authors = set()
for authors in data["authors"]:
for author in authors:
graph_authors.add(author)
author_sections = {author: {} for author in graph_authors}
author_output = {}
for row in data.iter_rows(named=True):
for author in row["authors"]:
author_sections[author][row["section"]] = author_sections[author].get(row["section"], 0) + 1
author_output[author] = author_output.get(author, 0) + 1
for author in author_sections:
author_sections[author] = max(author_sections[author], key=author_sections[author].get)
graph.add_nodes_from(graph_authors)
# Automatic node coloring in netwulf based on the group attribute
nx.set_node_attributes(graph, author_sections, "section")
nx.set_node_attributes(graph, author_sections, "group")
nx.set_node_attributes(graph, author_output, "output")
for authors in data["authors"]:
for author_a in authors:
for author_b in authors:
if author_a != author_b:
graph.add_edge(author_a, author_b)
return graph
reuters_graph = build_graph(reuters_df)
reuters_gcc = largest_component(reuters_graph)
graph_stats(reuters_gcc)
plot_degree_distribution(reuters_gcc)
Reuters graph has 1947 nodes and 17131 edges
Average degree: 17.59732922444787
Median degree: 12.0
Average clustering coefficient: 0.28962767619131086
Section assortativity coefficient: 0.33304534289980026
The graph stats are computed for the largest connected component of the filtered graph. We see that the mode of the degree distribution is 3. The median and average degree are quite high, indicating that we may have a well-connected network. The assortativity coefficient based on section is not particularly large, meaning that there is collaboration within sections, but it is not limited to them.
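As a quick sanity check of the degree-distribution mode mentioned above, the degrees can be counted directly (a minimal sketch, assuming reuters_gcc from the cells above is still in memory):
from collections import Counter
# count how often each degree occurs in the largest connected component
degree_counts = Counter(degree for _, degree in reuters_gcc.degree)
# the first entry is the mode of the degree distribution
print(degree_counts.most_common(3))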
Below is the Reuters graph prior to removing authors who have written fewer than 5 articles.
In the 3 figures below the Reuters graph is visualized: the first figure shows the entire graph colored by author section, the second shows the largest connected component colored by section, and the last is colored by the Louvain communities.
random.seed(42)
scale_nodes(reuters_graph)
reuters_config = {"zoom": 0.8,
"node_gravity": 0.2,
"node_size": 25,
"link_width": 0.7,
"node_fill_color": "#123123",
"node_stroke_color": "#000000"}
_, _ = visualize(reuters_graph, config=reuters_config)
_, _ = visualize(reuters_gcc, config=reuters_config)
_ = display_louvain(reuters_gcc, reuters_config)
The Reuters graph was partitioned into 12 communities
reuters_sections = nx.get_node_attributes(reuters_gcc, "section")
print(reuters_sections["David Shepardson"])
print(reuters_sections["Steve Holland"])
print(reuters_sections["William James"])
print(reuters_sections["Arno Schuetze"])
Business News
Politics
World News
Deals
Comparing this graph to the unfiltered graph, there is a lot less noise, with far fewer authors branching off the main component. There does not seem to be a lot of structure in the largest component when colored by author section, whereas the Louvain partition creates a more structured partitioning.
nyt_graph = build_graph(nyt_df)
nyt_gcc = largest_component(nyt_graph)
graph_stats(nyt_gcc)
plot_degree_distribution(nyt_gcc)
The New York Times graph has 1285 nodes and 12731 edges
Average degree: 19.8147859922179
Median degree: 12.0
Average clustering coefficient: 0.3260379569410747
Section assortativity coefficient: 0.6091857885908596
As with the Reuters graph, the stats are computed for the largest connected component of the filtered graph. The average and median degree are again quite large, indicating that the network is well connected. The clustering coefficient is very similar to that of Reuters, but the assortativity with respect to the section attribute is much larger than for the Reuters graph, which could indicate that collaboration is more contained within sections at The New York Times.
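For reference, nx.attribute_assortativity_coefficient implements Newman's assortativity coefficient for a categorical attribute,
$$r = \frac{\sum_i e_{ii} - \sum_i a_i b_i}{1 - \sum_i a_i b_i},$$
where $e_{ij}$ is the fraction of edges joining a node with section $i$ to a node with section $j$, and $a_i$ and $b_i$ are the row and column sums of the mixing matrix. A value of 1 means authors only collaborate within their own section, while 0 corresponds to random mixing.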
As with Reuters, the unfiltered New York Times graph can be seen below.
In the 3 figures below the New York Times graph is visualized. The first figure is the entire graph colored by author section, the second figure is the largest connected component colored by section, and the last figure is colored by the Louvain communities.
random.seed(42)
scale_nodes(nyt_graph)
nyt_config = {"zoom": 0.85,
"node_gravity": 0.2,
"node_size": 30,
"link_width": 0.7,
"node_fill_color": "#123123",
"node_stroke_color": "#000000"}
_, _ = visualize(nyt_graph, config=nyt_config)
_, _ = visualize(nyt_gcc, config=nyt_config)
_ = display_louvain(nyt_gcc, config=nyt_config)
The The New York Times graph was partitioned into 10 communities
nyt_sections = nx.get_node_attributes(nyt_gcc, "section")
print(nyt_sections["Maggie Haberman"])
print(nyt_sections["Glenn Kenny"])
print(nyt_sections["Eric Schmitt"])
print(nyt_sections["Sandra Stevenson"])
print(nyt_sections["Jesse Mckinley"])
print(nyt_sections["Jon Caramanica"])
us
movies
world
briefing
nyregion
arts
By removing the authors with fewer than 5 articles from the New York Times graph, we remove a majority of the "authors" from the large blob in the center, which represents the movie section of the paper; many of these "authors" turned out to be actors or other celebrities who may just have been interviewed in the article. Comparing the section and Louvain partitions, there are only subtle differences, which could indicate that The New York Times' collaboration structure follows its sections closely.
Now we perform a randomization experiment based on the modularity of the two graphs and compare the modularity of the two partitions for each graph.
def randomization_experiment(graph: nx.Graph, edge_swaps: int, partition: dict, trials: int = 200):
louvain = community.best_partition(graph)
print(f"Calculating modularities of randomized versions of {graph.name}")
modularities = np.zeros(trials)
for i in trange(trials):
unfrozen_graph = nx.Graph(graph)
randomized_graph = nx.double_edge_swap(unfrozen_graph, edge_swaps, edge_swaps * 5)
modularities[i] = modularity(randomized_graph, partition)
mu, sigma = modularities.mean(), modularities.std()
section_modularity = stats.modularity(graph, partition)
louvain_modularity = stats.modularity(graph, louvain)
x = np.linspace(modularities.min()-3*sigma, modularities.max()+3*sigma, 2000)
plt.hist(modularities, bins=10, rwidth=0.9, density=True, label="Randomized modularities", color="lightblue")
plt.plot(x, norm.pdf(x, loc=mu, scale=sigma))
plt.axvline(section_modularity, color="red", label="Section modularity")
plt.axvline(louvain_modularity, color="darkgreen", label="Louvain modularity")
plt.legend()
plt.xlabel("Modularity")
plt.title(f"Modularity experiment of {graph.name}")
plt.savefig(f"images/{graph.name.replace(' ', '_').lower()}_mod.png")
plt.show()
print(f"Mean of randomized modularities: {mu:.3f}, standard deviation of randomized modularities: {sigma:.3f}")
print(f"Section partition modularity: {section_modularity:.3f}")
print(f"Louvain partition modularity: {louvain_modularity:.3f}")
randomization_experiment(reuters_gcc, 20_000, nx.get_node_attributes(reuters_gcc, "section"), 500)
randomization_experiment(nyt_gcc, 15_000, nx.get_node_attributes(nyt_gcc, "section"), 500)
Calculating modularities of randomized versions of Reuters graph
Mean of randomized modularities: 0.019, standard deviation of randomized modularities: 0.003
Section partition modularity: 0.233
Louvain partition modularity: 0.533
Calculating modularities of randomized versions of The New York Times graph
Mean of randomized modularities: 0.037, standard deviation of randomized modularities: 0.003
Section partition modularity: 0.494
Louvain partition modularity: 0.524
For both graphs the randomized modularities are only slightly above 0. For the Reuters graph the modularity of the section partition is $0.233$ versus $0.533$ for the Louvain partition, which was to be expected, as the section partition looked much more random than the Louvain partition. For the New York Times graph the section and Louvain modularities are almost identical, with the Louvain modularity slightly edging out the section partition.
An explanation for why the New York Times section partition is so similar to the Louvain partition is likely that the section partition consists of 21 sections while the Louvain partition consists of only 10 communities. The Reuters section partition, on the other hand, consists of 39 sections against 12 Louvain communities, meaning the Reuters sections were condensed much more than the New York Times sections.
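To make the comparison with the randomized baseline concrete, the observed modularities can be expressed as z-scores relative to the randomized distribution (a small sketch using the printed Reuters numbers above):
# z-scores of the observed modularities relative to the randomized baseline (Reuters numbers from the output above)
mu, sigma = 0.019, 0.003
for name, observed in [("Section", 0.233), ("Louvain", 0.533)]:
    print(f"{name} partition: z = {(observed - mu) / sigma:.0f}")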
For the text analysis, we used word clouds with tf-idf and sentiment analysis with a transformer model.
Construct the sentiment pipeline and load the data from the processed files. We drop the date, title, and publication columns, as they are not needed for the text analysis.
sentiment_pipeline = pipeline(model="distilbert-base-uncased-finetuned-sst-2-english", revision="af0f99b", truncation=True,device=0)
nyt_df = pd.read_json("data/processed/nyt.json")
reuters_df = pd.read_json("data/processed/reuters.json")
nyt_articles = nyt_df.drop(columns = ["date", "title", "publication"])
reuters_articles = reuters_df.drop(columns = ["date", "title", "publication"])
Gather the sections from the two agencies as well as the number of articles in each section. We sort the sections by the number of articles in them. We will use this order later.
Then we define the tokenizer used to split the articles into tokens. We use the stopwords from the wordcloud library and add a few words that are common in the articles but don't add any value to the analysis.
nyt_sections = {}
for section in nyt_df["section"].unique():
nyt_sections[section] = len(nyt_df[nyt_df['section'] == section])
nyt_sections = sorted(nyt_sections, key=nyt_sections.get, reverse=True)
reuters_sections = {}
for section in reuters_df["section"].unique():
reuters_sections[section] = len(reuters_df[reuters_df['section'] == section])
reuters_sections = sorted(reuters_sections, key=reuters_sections.get, reverse=True)
class Tokenizer:
stopwords = wordcloud.STOPWORDS
stopwords.update(["mr","dr", "ms", "said"])
pattern = re.compile("[^a-z ]+")
def tokenize(self, text: str) -> list[str]:
text = text.lower()
text = self.pattern.sub("", text)
words = text.split()
words = [word for word in words if word not in self.stopwords]
return words
Gather the articles from each section into a list of lists. Each list contains the tokens of the articles in the section. We use the tokenizer defined above to tokenize the articles.
nyt_section_articles = [[] for _ in range(len(nyt_sections))]
reuters_section_articles = [[] for _ in range(len(reuters_sections))]
tokenizer = Tokenizer()
for i, row in nyt_articles.iterrows():
section = row["section"]
article = row["article"]
if not (article is np.nan):
tokens = tokenizer.tokenize(article)
nyt_section_articles[nyt_sections.index(section)].extend(tokens)
for i, row in reuters_articles.iterrows():
section = row["section"]
article = row["article"]
if not (pd.isnull(article)):
tokens = tokenizer.tokenize(article)
reuters_section_articles[reuters_sections.index(section)].extend(tokens)
With the tokenized articles we can now find the top 5 terms for each section using nltk's FreqDist and print them.
nyt_top5_terms = []
print("-----New York Times-----")
for i, section in enumerate(nyt_sections):
print(f"Top 5 terms for {section}")
counts = nltk.FreqDist(nyt_section_articles[i])
top5 = counts.most_common(5)
nyt_top5_terms.append([term for term, _ in top5])
print(top5)
print("")
-----New York Times-----
Top 5 terms for us [('trump', 59005), ('president', 34654), ('house', 23602), ('people', 23415), ('one', 23156)]
Top 5 terms for world [('government', 15437), ('united', 14673), ('people', 14090), ('one', 13869), ('states', 12109)]
Top 5 terms for business [('company', 8178), ('new', 7115), ('will', 6521), ('one', 5721), ('trump', 5639)]
Top 5 terms for nyregion [('new', 11770), ('york', 7503), ('city', 6374), ('police', 5638), ('one', 4914)]
Top 5 terms for briefing [('new', 4928), ('briefing', 4610), ('us', 4574), ('trump', 4306), ('president', 4215)]
Top 5 terms for movies [('movie', 2767), ('one', 2187), ('film', 2118), ('movies', 1300), ('time', 1262)]
Top 5 terms for arts [('new', 3930), ('one', 2706), ('music', 1924), ('art', 1858), ('album', 1738)]
Top 5 terms for technology [('facebook', 3901), ('company', 3830), ('companies', 2517), ('new', 2477), ('people', 2177)]
Top 5 terms for sports [('game', 2815), ('first', 2346), ('one', 2310), ('will', 1925), ('team', 1914)]
Top 5 terms for well [('children', 1572), ('parents', 1033), ('child', 798), ('one', 668), ('may', 665)]
Top 5 terms for climate [('climate', 2315), ('change', 1113), ('new', 854), ('states', 775), ('trump', 768)]
Top 5 terms for nytnow [('briefing', 915), ('us', 694), ('new', 670), ('want', 649), ('one', 618)]
Top 5 terms for health [('health', 1704), ('people', 1113), ('new', 876), ('drug', 867), ('care', 842)]
Top 5 terms for opinion [('gail', 1425), ('trump', 1423), ('will', 967), ('bret', 930), ('think', 881)]
Top 5 terms for realestate [('slide', 360), ('show', 360), ('weeks', 270), ('properties', 266), ('house', 253)]
Top 5 terms for style [('people', 635), ('one', 613), ('will', 448), ('new', 447), ('time', 383)]
Top 5 terms for upshot [('health', 730), ('people', 643), ('trump', 619), ('states', 514), ('new', 492)]
Top 5 terms for espanol [('de', 11866), ('la', 5582), ('que', 5536), ('en', 5206), ('el', 4322)]
Top 5 terms for magazine [('patient', 550), ('one', 437), ('blood', 338), ('doctor', 331), ('back', 307)]
Top 5 terms for theater [('broadway', 452), ('theater', 426), ('play', 419), ('new', 353), ('show', 346)]
Top 5 terms for learning [('students', 675), ('times', 319), ('one', 295), ('think', 276), ('will', 255)]
reuters_top5_terms = []
print("-----Reuters-----")
for i, section in enumerate(reuters_sections):
print(f"Top 5 terms for {section}")
counts = nltk.FreqDist(reuters_section_articles[i])
top5 = counts.most_common(5)
reuters_top5_terms.append([term for term, _ in top5])
print(top5)
print("")
-----Reuters-----
Top 5 terms for Business News [('us', 41118), ('will', 36109), ('percent', 35902), ('year', 34127), ('billion', 32223)]
Top 5 terms for World News [('us', 35603), ('will', 30037), ('government', 26948), ('reuters', 26774), ('people', 21721)]
Top 5 terms for Politics [('trump', 19439), ('us', 12234), ('house', 8719), ('president', 8172), ('republican', 6170)]
Top 5 terms for Deals [('billion', 7179), ('deal', 4801), ('percent', 4798), ('company', 4627), ('reuters', 4599)]
Top 5 terms for Commodities [('oil', 9091), ('year', 4722), ('will', 4443), ('reuters', 4055), ('million', 4004)]
Top 5 terms for Technology News [('company', 3116), ('will', 2983), ('reuters', 2928), ('billion', 2867), ('year', 2404)]
Top 5 terms for Intel [('percent', 2770), ('reuters', 2576), ('will', 2363), ('year', 1963), ('new', 1785)]
Top 5 terms for Market News [('us', 2895), ('new', 2245), ('will', 1927), ('reuters', 1703), ('sp', 1526)]
Top 5 terms for Bonds News [('us', 2690), ('year', 2516), ('market', 2225), ('will', 2135), ('bank', 2026)]
Top 5 terms for Environment [('will', 1962), ('reuters', 1790), ('new', 1610), ('us', 1541), ('oil', 1491)]
Top 5 terms for U.S. Legal News [('us', 3092), ('new', 1539), ('will', 1507), ('reuters', 1491), ('trump', 1409)]
Top 5 terms for Japan [('japan', 2166), ('will', 1629), ('billion', 1560), ('japanese', 1552), ('reuters', 1380)]
Top 5 terms for Brexit [('brexit', 5292), ('eu', 5161), ('deal', 3953), ('will', 2870), ('britain', 2843)]
Top 5 terms for Davos [('us', 2398), ('percent', 2149), ('year', 1788), ('trade', 1721), ('will', 1578)]
Top 5 terms for Health News [('health', 1753), ('us', 1386), ('new', 1350), ('reuters', 1279), ('drug', 1240)]
Top 5 terms for Asia [('reuters', 1320), ('year', 1187), ('will', 1162), ('china', 1073), ('government', 925)]
Top 5 terms for U.S. [('us', 2178), ('new', 1324), ('reuters', 1060), ('people', 1053), ('york', 846)]
Top 5 terms for Financials [('bank', 1405), ('banks', 921), ('will', 904), ('reuters', 822), ('year', 770)]
Top 5 terms for Wealth [('percent', 1506), ('billion', 1191), ('bank', 1188), ('us', 1150), ('fund', 1081)]
Top 5 terms for Company News [('us', 1489), ('will', 1073), ('reuters', 1042), ('new', 824), ('year', 809)]
Top 5 terms for Hot Stocks [('ftse', 909), ('percent', 777), ('shares', 758), ('index', 600), ('year', 554)]
Top 5 terms for Entertainment News [('film', 714), ('new', 684), ('will', 652), ('reuters', 590), ('weinstein', 586)]
Top 5 terms for Sports News [('games', 868), ('olympic', 697), ('will', 652), ('reuters', 598), ('world', 573)]
Top 5 terms for Credit RSS [('us', 710), ('reuters', 698), ('will', 575), ('year', 457), ('new', 437)]
Top 5 terms for Sustainable Business [('will', 771), ('climate', 761), ('reuters', 663), ('companies', 638), ('new', 625)]
Top 5 terms for Funds News [('reuters', 570), ('us', 545), ('new', 524), ('billion', 522), ('will', 499)]
Top 5 terms for Foreign Exchange Analysis [('sterling', 1011), ('brexit', 853), ('pound', 764), ('currency', 619), ('percent', 603)]
Top 5 terms for Fintech [('bitcoin', 913), ('reuters', 698), ('cryptocurrency', 684), ('new', 628), ('will', 584)]
Top 5 terms for Healthcare [('people', 642), ('coronavirus', 619), ('reuters', 474), ('will', 431), ('health', 394)]
Top 5 terms for Cyber Risk [('us', 827), ('security', 823), ('cyber', 737), ('reuters', 536), ('data', 535)]
Top 5 terms for Cyclical Consumer Goods [('reuters', 455), ('will', 385), ('us', 301), ('people', 298), ('new', 272)]
Top 5 terms for Energy [('oil', 932), ('us', 570), ('will', 389), ('reuters', 373), ('crude', 355)]
Top 5 terms for Basic Materials [('gold', 559), ('week', 376), ('prices', 358), ('demand', 346), ('reuters', 342)]
Top 5 terms for Big Story 10 [('reuters', 429), ('year', 283), ('will', 275), ('people', 271), ('thomson', 217)]
Top 5 terms for Special Reports [('reuters', 1572), ('one', 1036), ('us', 931), ('according', 918), ('police', 884)]
Top 5 terms for Supreme Court [('court', 1209), ('trump', 656), ('supreme', 569), ('kavanaugh', 568), ('senate', 563)]
Top 5 terms for Breakingviews [('breakingviews', 455), ('reuters', 434), ('financial', 320), ('new', 290), ('york', 235)]
Top 5 terms for Lifestyle [('reuters', 185), ('percent', 183), ('year', 174), ('last', 142), ('will', 139)]
Top 5 terms for Corrections News [('us', 218), ('will', 178), ('reuters', 166), ('new', 146), ('oil', 118)]
We define our stopwords and tokenizer again so the same setup can be used with the scikit-learn vectorizer.
stopwords = wordcloud.STOPWORDS
stopwords.update(["mr","dr", "ms", "said"])
pattern = re.compile("[^a-z ]+")
cloud = WordCloud(background_color="white", width=1920, height=1080)
def tokenize(text: str) -> list[str]:
text = text.lower()
text = pattern.sub("", text)
words = text.split()
words = [word for word in words if word not in stopwords]
return words
We use scikit-learn's TfidfVectorizer to find the TF-IDF for each term in each section, and then use the wordcloud library to generate a word cloud for each section.
Comment out savefig and uncomment show to display the word clouds in the notebook.
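As a reminder, the TF-IDF weight of a term $t$ in a document $d$ is, in its basic form,
$$\text{tf-idf}(t, d) = \text{tf}(t, d) \cdot \log\frac{N}{\text{df}(t)},$$
where $\text{tf}(t, d)$ is how often $t$ occurs in $d$, $\text{df}(t)$ is the number of documents containing $t$, and $N$ is the total number of documents. TfidfVectorizer applies a smoothed and normalized variant of this formula.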
First for NYT.
for sec in nyt_sections:
nyt_section_articles = nyt_articles[nyt_articles["section"] == sec]["article"].tolist()
nyt_tfidfs_section_vec = TfidfVectorizer(tokenizer=tokenize)
nyt_tfidfs_section_vec.fit(nyt_section_articles)
nyt_tfidfs_section = nyt_tfidfs_section_vec.transform(nyt_section_articles).toarray()
nyt_tfidfs_section = pd.Series(nyt_tfidfs_section.mean(axis=0), index=nyt_tfidfs_section_vec.get_feature_names_out()).sort_values(ascending=False).to_dict()
CLOUD = cloud.generate_from_frequencies(nyt_tfidfs_section)
plt.imshow(CLOUD)
plt.axis("off")
plt.title(f"NYT TF-IDF {sec}")
plt.savefig(f"images/NYT_TFIDF_{sec}.png", bbox_inches="tight", dpi=300)
#plt.show()
Then for Reuters.
for sec in reuters_sections:
reuters_section_articles = reuters_articles[reuters_articles["section"] == sec]["article"].tolist()
reuters_tfidfs_section_vec = TfidfVectorizer(tokenizer=tokenize)
reuters_tfidfs_section_vec.fit(reuters_section_articles)
reuters_tfidfs_section = reuters_tfidfs_section_vec.transform(reuters_section_articles).toarray()
reuters_tfidfs_section = pd.Series(reuters_tfidfs_section.mean(axis=0), index=reuters_tfidfs_section_vec.get_feature_names_out()).sort_values(ascending=False).to_dict()
CLOUD = cloud.generate_from_frequencies(reuters_tfidfs_section)
plt.imshow(CLOUD)
plt.axis("off")
plt.title(f"Reuters TF-IDF {sec}")
plt.savefig(f"images/Reuters_TFIDF_{sec}.png", bbox_inches="tight", dpi=300)
#plt.show()
We now take a look at the TF-IDF for each agency as a whole.
# join with a space so the last word of one article doesn't merge with the first word of the next
nyt_all_articles = " ".join(nyt_articles["article"])
reuters_all_articles = " ".join(reuters_articles["article"])
# TF-IDF on all articles
nyt_tfidf_vectorizer = TfidfVectorizer(tokenizer=tokenize)
nyt_tfidf_vectorizer.fit([nyt_all_articles])
nyt_tfidf = nyt_tfidf_vectorizer.transform([nyt_all_articles])
nyt_tfidf = nyt_tfidf.toarray()[0]
nyt_tfidf = pd.Series(nyt_tfidf, index=nyt_tfidf_vectorizer.get_feature_names_out())
nyt_tfidf = nyt_tfidf.sort_values(ascending=False)
nyt_tfidf = nyt_tfidf.to_dict()
reuters_tfidf_vectorizer = TfidfVectorizer(tokenizer=tokenize)
reuters_tfidf_vectorizer.fit([reuters_all_articles])
reuters_tfidf = reuters_tfidf_vectorizer.transform([reuters_all_articles])
reuters_tfidf = reuters_tfidf.toarray()[0]
reuters_tfidf = pd.Series(reuters_tfidf, index=reuters_tfidf_vectorizer.get_feature_names_out())
reuters_tfidf = reuters_tfidf.sort_values(ascending=False)
reuters_tfidf = reuters_tfidf.to_dict()
CLOUD = cloud.generate_from_frequencies(nyt_tfidf)
plt.imshow(CLOUD)
plt.axis("off")
plt.title(f"NYT TF-IDF")
plt.savefig(f"images/NYT_TFIDF.png", bbox_inches="tight", dpi=300)
plt.show()
CLOUD = cloud.generate_from_frequencies(reuters_tfidf)
plt.imshow(CLOUD)
plt.axis("off")
plt.title(f"Reuters TF-IDF")
plt.savefig(f"images/Reuters_TFIDF.png", bbox_inches="tight", dpi=300)
plt.show()
Starting the sentiment analysis, we go over the articles, calculate the sentiment for each article, and add it to the agency dataframes. The sentiment is expressed as either a positive or negative label together with a score between 0 and 1 that gives the confidence of the label.
Before we can do the sentiment analysis we have to reconstruct the articles so they are in a format that fits the pipeline. The pipeline needs a list of strings, where each string is a sentence or article. However, we found that whole articles were too long for the pipeline to handle, so we decided to only analyze the first 512 words of each article.
The analysis pipeline has been set up to use the GPU, which speeds up the analysis significantly. However, a GPU is not required, and the pipeline will work without it; this can be modified in the first cell of the notebook. When done, the sentiment analysis is saved to a JSON file for later use.
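For instance, to run the same model on the CPU instead, the pipeline can be built with device=-1 (a minimal sketch mirroring the call in the first cell):
# CPU fallback: device=-1 tells the transformers pipeline to stay on the CPU
sentiment_pipeline = pipeline(
    model="distilbert-base-uncased-finetuned-sst-2-english",
    revision="af0f99b",
    truncation=True,
    device=-1,
)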
# reset index of dataframe
nyt_articles = nyt_articles.reset_index(drop=True)
reuters_articles = reuters_articles.reset_index(drop=True)
shortArticles = []
for article in nyt_articles["article"]:
articleShort = " ".join(article.split(" ")[:512])
shortArticles.append(articleShort)
sentiments = sentiment_pipeline(shortArticles, padding=True, truncation=True)
nyt_articles["Sentiment"] = ""
nyt_articles["Sen Score"] = ""
for i in range(len(sentiments)):
sentiment = sentiments[i]
nyt_articles["Sentiment"][i] = sentiment["label"]
nyt_articles["Sen Score"][i] = sentiment["score"]
nyt_articles.to_json("data/processed/nyt_language_sentiment.json")
shortArticles = []
for article in reuters_articles["article"]:
articleShort = " ".join(article.split(" ")[:512])
shortArticles.append(articleShort)
sentiments = sentiment_pipeline(shortArticles, padding=True, truncation=True)
reuters_articles["Sentiment"] = ""
reuters_articles["Sen Score"] = ""
for i in range(len(sentiments)):
sentiment = sentiments[i]
reuters_articles["Sentiment"][i] = sentiment["label"]
reuters_articles["Sen Score"][i] = sentiment["score"]
reuters_articles.to_json("data/processed/reuters_language_sentiment.json")
For this type of dataset it is not optimal that we only have positive and negative labels; we would like a neutral label as well. We therefore use the confidence score to decide whether an article is neutral: if the score is less than 0.9, we consider the article neutral. This is done in the following cell.
nyt_articles = pd.read_json("data/processed/nyt_language_sentiment.json")
reuters_articles = pd.read_json("data/processed/reuters_language_sentiment.json")
# all articles with sen score less than 0.9 are considered neutral
# all articles with a sentiment score below 0.9 are considered neutral
nyt_articles["Sen Score"] = nyt_articles["Sen Score"].astype(float)
nyt_articles["Sentiment"] = nyt_articles["Sentiment"].astype(str)
nyt_articles.loc[nyt_articles["Sen Score"] < 0.9, "Sentiment"] = "NEUTRAL"
nyt_articles["Sentiment"] = nyt_articles["Sentiment"].astype("category")
reuters_articles["Sen Score"] = reuters_articles["Sen Score"].astype(float)
reuters_articles["Sentiment"] = reuters_articles["Sentiment"].astype(str)
reuters_articles.loc[reuters_articles["Sen Score"] < 0.9, "Sentiment"] = "NEUTRAL"
reuters_articles["Sentiment"] = reuters_articles["Sentiment"].astype("category")
The following are the results of the sentiment analysis for the agencies as a whole. We have calculated the average sentiment score, the max and min score, and the distribution of the sentiment labels.
reuters_sentiments_score = np.asarray(reuters_articles["Sen Score"].tolist())
reuters_sentiment_avg_score = reuters_sentiments_score.mean()
reuters_sentiment_max = reuters_sentiments_score.max()
reuters_sentiment_min = reuters_sentiments_score.min()
print(f"Reuters Sentiment Average Score: {reuters_sentiment_avg_score}")
print(f"Reuters Sentiment Max: {reuters_sentiment_max}")
print(f"Reuters Sentiment Min: {reuters_sentiment_min}")
print("")
reuters_sentiments = reuters_articles["Sentiment"]
print(f"Reuters Sentiment Distribution: {reuters_sentiments.value_counts()}")
print("\n")
nyt_sentiments_score = np.asarray(nyt_articles["Sen Score"].tolist())
nyt_sentiment_avg_score = nyt_sentiments_score.mean()
nyt_sentiment_max = nyt_sentiments_score.max()
nyt_sentiment_min = nyt_sentiments_score.min()
print(f"NYT Sentiment Average Score: {nyt_sentiment_avg_score}")
print(f"NYT Sentiment Max: {nyt_sentiment_max}")
print(f"NYT Sentiment Min: {nyt_sentiment_min}")
print("")
nyt_sentiments = nyt_articles["Sentiment"]
print(f"NYT Sentiment Distribution: {nyt_sentiments.value_counts()}")
Reuters Sentiment Average Score: 0.9631492433844178
Reuters Sentiment Max: 0.9997518659000001
Reuters Sentiment Min: 0.5003492236
Reuters Sentiment Distribution:
Sentiment
NEGATIVE    47651
NEUTRAL      5812
POSITIVE     3666
Name: count, dtype: int64

NYT Sentiment Average Score: 0.950676645385258
NYT Sentiment Max: 0.9997653365
NYT Sentiment Min: 0.5000764728
NYT Sentiment Distribution:
Sentiment
NEGATIVE    17376
NEUTRAL      3384
POSITIVE     3036
Name: count, dtype: int64
Earlier we found that the TF-IDF analyses of the NYT us section and the Reuters Politics section had a high similarity. We therefore decided to look at the sentiment analysis of these two sections as well, using the same principle as before.
shortArticles = []
# all articles in section US from NYT
for article in nyt_articles[nyt_articles["section"] == "us"]["article"]:
articleShort = " ".join(article.split(" ")[:512])
shortArticles.append(articleShort)
sentiments = sentiment_pipeline(shortArticles, padding=True, truncation=True)
# move the sentiments to np.array
nyt_score_us = np.asarray([sentiment["score"] for sentiment in sentiments])
nyt_label_us = np.asarray([sentiment["label"] for sentiment in sentiments])
# every score less than 0.9 is considered neutral
nyt_label_us[nyt_score_us < 0.9] = "NEUTRAL"
nyt_label_us = pd.DataFrame(nyt_label_us)
shortArticles = []
# all articles in section Politics from Reuters
for article in reuters_articles[reuters_articles["section"] == "Politics"]["article"]:
articleShort = " ".join(article.split(" ")[:512])
shortArticles.append(articleShort)
sentiments = sentiment_pipeline(shortArticles, padding=True, truncation=True)
# move the sentiments to np.array
reuters_score_poli = np.asarray([sentiment["score"] for sentiment in sentiments])
reuters_label_poli = pd.DataFrame(np.asarray([sentiment["label"] for sentiment in sentiments]))
# every score less than 0.9 is considered neutral
reuters_label_poli[reuters_score_poli < 0.9] = "NEUTRAL"
reuters_label_poli = pd.DataFrame(reuters_label_poli)
With the sentiment analysis of the two sections we can now compare the results.
print(f"NYT_US Sentiment Average Score: {nyt_score_us.mean()}")
print(f"NYT_US Sentiment Max: {nyt_score_us.max()}")
print(f"NYT_US Sentiment Min: {nyt_score_us.min()}")
print("")
print(f"NYT_US Sentiment Distribution: \n{nyt_label_us.value_counts()}")
print("\n")
print(f"Reuters_Politics Sentiment Average Score: {reuters_score_poli.mean()}")
print(f"Reuters_Politics Sentiment Max: {reuters_score_poli.max()}")
print(f"Reuters_Politics Sentiment Min: {reuters_score_poli.min()}")
print("")
print(f"Reuters_Politics Sentiment Distribution: \n{reuters_label_poli.value_counts()}")
NYT_US Sentiment Average Score: 0.9508135075220069
NYT_US Sentiment Max: 0.9997653365135193
NYT_US Sentiment Min: 0.5001206398010254
NYT_US Sentiment Distribution:
NEGATIVE    5938
NEUTRAL     1118
POSITIVE     771
Name: count, dtype: int64

Reuters_Politics Sentiment Average Score: 0.9583817098263969
Reuters_Politics Sentiment Max: 0.9996844530105591
Reuters_Politics Sentiment Min: 0.5024705529212952
Reuters_Politics Sentiment Distribution:
NEGATIVE    2472
NEUTRAL      353
POSITIVE     212
Name: count, dtype: int64
We find that the sentiment distributions for the two sections are quite similar. The most interesting observation is that the scores follow the overall sentiment of the respective agencies.
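To make that comparison concrete, the label proportions (rather than raw counts) can be put side by side (a sketch assuming the variables from the cells above are still in memory):
def label_proportions(labels):
    # accepts a Series, a one-column DataFrame, or a numpy array of sentiment labels
    return pd.Series(np.asarray(labels).ravel()).value_counts(normalize=True).round(3).to_dict()

print("NYT us section:  ", label_proportions(nyt_label_us))
print("NYT overall:     ", label_proportions(nyt_articles["Sentiment"]))
print("Reuters Politics:", label_proportions(reuters_label_poli))
print("Reuters overall: ", label_proportions(reuters_articles["Sentiment"]))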
To compute the sentiments on a per-author basis, we start by computing the sentiments for each article in the Reuters Politics section and the NYT us section.
nyt_df = pd.read_json("data/processed/nyt.json")
reuters_df = pd.read_json("data/processed/reuters.json")
nyt_us = nyt_df[nyt_df["section"] == "us"]
reuters_politics = reuters_df[reuters_df["section"] == "Politics"]
classifier = transformers.pipeline(model="distilbert-base-uncased-finetuned-sst-2-english", truncation=True, device=0)
def get_sentiments(df):
sentiments = []
for i, row in tqdm.tqdm(df.iterrows()):
s = classifier(row["article"])[0]
if s["score"] < 0.9:
s["label"] = "NEUTRAL"
sentiments.append(s)
return sentiments
nyt_sentiments = get_sentiments(nyt_us)
7827it [01:23, 94.15it/s]
reuters_sentiments = get_sentiments(reuters_politics)
3037it [00:25, 117.63it/s]
Saving the sentiments so we don't have to compute them again
with open("nyt_sentiments.pkl", "wb") as f:
pickle.dump(nyt_sentiments, f)
with open("reuters_sentiments.pkl", "wb") as f:
pickle.dump(reuters_sentiments, f)
sentiment_map = {"NEGATIVE": 0, "NEUTRAL": 1, "POSITIVE": 2}
With the sentiments for all the articles, we count the number of times an author has co-authored an article with each sentiment. With these counts, we can then determine the sentiment in which the author has written the majority of their articles.
def get_author_sentiments(df, sentiments):
authors = defaultdict(lambda: [0, 0, 0])
c = 0
for i, row in df.iterrows():
s = sentiment_map[sentiments[c]["label"]]
c += 1
for author in row["authors"]:
authors[author][s] += 1
return authors
nyt_author_sentiments = get_author_sentiments(nyt_us, nyt_sentiments)
reuters_author_sentiments = get_author_sentiments(reuters_politics, reuters_sentiments)
def plot_major_sentiment(author_sentiments):
sentiment_counts = [0, 0, 0]
for sentiments in author_sentiments.values():
sentiment_counts[np.argmax(sentiments)] += 1
fig, ax = plt.subplots()
bars = ax.bar(range(3), sentiment_counts)
ax.set_xticks(range(3), ["Negative", "Neutral", "Positive"])
ax.bar_label(bars)
plot_major_sentiment(nyt_author_sentiments)
plt.title("NYT most frequent author sentiment")
plt.show()
plot_major_sentiment(reuters_author_sentiments)
plt.title("Reuters most frequent author sentiment")
plt.show()
Originally we wanted to explore the possible differences between more opposing news agencies, for example Fox News and The New York Times instead of Reuters and The New York Times. Investigating the dataset revealed that there were very few collaborations within the right-leaning news agencies, and therefore not much of a network to build.
We thought that comparing newspapers of different political orientations would have made for a more interesting text analysis; as it turned out, we found very little difference between Reuters and the NYT when comparing texts, at least with the tools available to us.
We think that the network analysis went really well, with a lot of interesting plots, and it was cool to see how the NYT network splits into a larger and a smaller component.
What we think is lacking is the connection between the network analysis and the text analysis. In the text analysis we ended up comparing sections between newspapers, because we wanted to look at how different newspapers write about the same subject. With more time, it would have been interesting to compare more sections within each newspaper, and also to compare the writing of the different communities, which would have provided a more explicit connection between the network analysis and the text analysis.