Topic Modeling with Top2Vec: Dreyfus, AI, and Wordclouds

Extracting Insights from PDFs with Python: A Comprehensive Guide

This script demonstrates a workflow for processing PDFs: extracting their text, normalizing it, tokenizing it into sentences, running Top2Vec topic modeling, and visualizing the resulting topics as word clouds.

Libraries Overview

os: Provides functions to interact with the operating system.

matplotlib.pyplot: Used for creating static, animated, and interactive visualizations in Python.

nltk: Natural Language Toolkit, a suite of libraries and programs for natural language processing.

pandas: Data manipulation and analysis library.

pdftotext: Library for converting PDF documents to plain text.

re: Provides regular expression matching operations.

seaborn: Statistical data visualization library based on matplotlib.

nltk.tokenize.sent_tokenize: NLTK function to tokenize a string into sentences.

top2vec: Library for topic modeling and semantic search.

wordcloud: Library for creating word clouds from text data.

cleantext: Library for cleaning and normalizing text (provides the clean function used below).
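
The script does not install its own dependencies. A minimal install sketch, assuming the usual PyPI package names; pdftotext additionally needs the poppler C++ headers (e.g. libpoppler-cpp-dev on Debian/Ubuntu), and the sentence_transformers extra of top2vec is assumed to cover the distiluse embedding model used later:

sudo apt-get install build-essential libpoppler-cpp-dev pkg-config   # pdftotext prerequisite (Debian/Ubuntu)
pip install nltk pandas matplotlib seaborn wordcloud clean-text pdftotext "top2vec[sentence_transformers]"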

Initial Setup

Import Modules

import os
import matplotlib.pyplot as plt
import nltk
import pandas as pd
import pdftotext
import re
import seaborn as sns
from nltk.tokenize import sent_tokenize
from top2vec import Top2Vec
from wordcloud import WordCloud
from cleantext import clean

Next, ensure the punkt tokenizer is downloaded:

nltk.download("punkt")

Text Normalization

def normalize_text(text):
    """Normalize text by removing special characters and extra spaces,
    and applying various other cleaning options.
    """
    # Apply the clean function with specified parameters
    cleaned_text = clean(
        text,
        fix_unicode=True,           # fix various unicode errors
        to_ascii=True,              # transliterate to closest ASCII representation
        lower=True,                 # lowercase text
        no_line_breaks=False,       # only normalize line breaks instead of stripping them fully
        no_urls=True,               # replace all URLs with a special token
        no_emails=True,             # replace all email addresses with a special token
        no_phone_numbers=True,      # replace all phone numbers with a special token
        no_numbers=True,            # replace all numbers with a special token
        no_digits=True,             # replace all digits with a special token
        no_currency_symbols=True,   # replace all currency symbols with a special token
        no_punct=False,             # keep punctuation (set True to remove it)
        lang="en",                  # set to "de" for German-specific handling
    )

    # Remove any remaining special characters except word characters, whitespace, and periods/commas
    cleaned_text = re.sub(r"[^\w\s.,]", "", cleaned_text)
    # Collapse runs of whitespace into a single space and strip leading/trailing spaces
    cleaned_text = re.sub(r"\s+", " ", cleaned_text).strip()

    return cleaned_text
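
A quick, illustrative check of the cleaner (the sample string is made up; the exact output depends on clean-text's replacement tokens):

sample = "Dreyfus (1972) argued   against GOFAI; see https://example.org for details."
print(normalize_text(sample))
# URLs and numbers are replaced by clean-text's placeholder tokens, stray symbols are
# stripped by the regex pass, and runs of whitespace collapse to single spaces.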

PDF Text Extraction

def extract_text_from_pdf(pdf_path):
    with open(pdf_path, "rb") as f:
        pdf = pdftotext.PDF(f)
    # Join the pages with blank lines before normalizing
    all_text = "\n\n".join(pdf)
    return normalize_text(all_text)
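
Applied to a single file (the filename below is illustrative):

text = extract_text_from_pdf("/home/roomal/Desktop/Dreyfus-Project/Dreyfus/sample.pdf")
print(text[:200])  # preview the first 200 characters of the normalized text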

Sentence Tokenization

def split_into_sentences(text):
    return sent_tokenize(text)
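
For example:

sentences = split_into_sentences("Dreyfus drew on Heidegger. He questioned symbolic AI.")
print(sentences)
# ['Dreyfus drew on Heidegger.', 'He questioned symbolic AI.']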

Processing Multiple Files

def process_files(file_paths):
    authors, titles, all_sentences = [], [], []
    for file_path in file_paths:
        file_name = os.path.basename(file_path)
        # Filenames are assumed to follow "year_author_title.pdf"; the underscore
        # delimiter is an assumption, so adjust it to match your naming scheme.
        parts = file_name.split("_", 2)
        if len(parts) != 3 or not file_name.endswith(".pdf"):
            print(f"Skipping file with incorrect format: {file_name}")
            continue

        year, author, title = parts
        author, title = author.strip(), title.replace(".pdf", "").strip()

        try:
            text = extract_text_from_pdf(file_path)
        except Exception as e:
            print(f"Error extracting text from {file_name}: {e}")
            continue

        sentences = split_into_sentences(text)
        authors.append(author)
        titles.append(title)
        all_sentences.extend(sentences)
        print(f"Number of sentences for {file_name}: {len(sentences)}")

    return authors, titles, all_sentences

Saving Data to CSV

def save_data_to_csv(authors, titles, file_paths, output_file):
    # Assumes every file in file_paths was parsed successfully by process_files,
    # so the three lists line up row for row.
    texts = []
    for fp in file_paths:
        try:
            text = extract_text_from_pdf(fp)
            sentences = split_into_sentences(text)
            texts.append(" ".join(sentences))
        except Exception as e:
            print(f"Error processing file {fp}: {e}")
            texts.append("")

    data = pd.DataFrame({
        "Author": authors,
        "Title": titles,
        "Text": texts
    })
    data.to_csv(output_file, index=False, quoting=1, encoding="utf-8")
    print(f"Data has been written to {output_file}")

Loading Stopwords

def load_stopwords(filepath):
    with open(filepath, "r") as f:
        stopwords = f.read().splitlines()
    additional_stopwords = [
        "able", "according", "act", "actually", "after", "again", "age", "agree", "al", "all", "already", "also", "am", "among", "an", "and", "another", "any", "appropriate", "are", "argue", "as", "at", "avoid", "based", "basic", "basis", "be", "been", "begin",
        "best", "book", "both", "build", "but", "by", "call", "can", "cant", "case", "cases", "claim", "claims", "class", "clear", "clearly", "cope", "could", "course", "data", "de", "deal", "dec", "did", "do", "doesnt", "done", "dont", "each", "early",
        "ed", "either", "end", "etc", "even", "ever", "every", "far", "feel", "few", "field", "find", "first", "follow", "follows", "for", "found", "free", "fri", "fully", "get", "had", "hand", "has", "have", "he", "help", "her", "here", "him",
        "his", "how", "however", "httpsabout", "ibid", "if", "im", "in", "is", "it", "its", "jstor", "june", "large", "lead", "least", "less", "like", "long", "look", "man", "many", "may", "me", "money", "more", "most", "move", "moves", "my",
        "neither", "net", "never", "new", "no", "nor", "not", "notes", "notion", "now", "of", "on", "once", "one", "ones", "only", "open", "or", "order", "orgterms", "other", "our", "out", "own", "paper", "past", "place", "plan", "play", "point",
        "pp", "precisely", "press", "put", "rather", "real", "require", "right", "risk", "role", "said", "same", "says", "search", "second", "see", "seem", "seems", "seen", "sees", "set", "shall", "she", "should", "show", "shows", "since", "so", "step", "strange",
        "style", "such", "suggests", "talk", "tell", "tells", "term", "terms", "than", "that", "the", "their", "them", "then", "there", "therefore", "these", "they", "this", "those", "three", "thus", "to", "todes", "together", "too", "tradition", "trans", "true", "try",
        "trying", "turn", "turns", "two", "up", "us", "use", "used", "uses", "using", "very", "view", "vol", "was", "way", "ways", "we", "web", "well", "were", "what", "when", "whether", "which", "who", "why", "with", "within", "works", "would",
        "years", "york", "you", "your", "without",
    ]
    stopwords.extend(additional_stopwords)
    return set(stopwords)

Filtering Stopwords from Topics

def filter_stopwords_from_topics(topic_words, stopwords):
    filtered_topics = []
    for words in topic_words:
        filtered_topics.append([word for word in words if word.lower() not in stopwords])
    return filtered_topics
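
A small illustration with made-up topic words and stopwords:

example_topics = [["heidegger", "the", "embodiment"], ["intentionality", "also", "skill"]]
example_stopwords = {"the", "also"}
print(filter_stopwords_from_topics(example_topics, example_stopwords))
# [['heidegger', 'embodiment'], ['intentionality', 'skill']]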

Word Cloud Generation

def generate_wordcloud(topic_words, topic_num, palette="inferno"):
    colors = sns.color_palette(palette, n_colors=256).as_hex()

    def color_func(word, font_size, position, orientation, random_state=None, **kwargs):
        # Pick a random color from the seaborn palette for each word
        return colors[random_state.randint(0, len(colors) - 1)]

    wordcloud = WordCloud(width=800, height=400, background_color="black",
                          color_func=color_func).generate(" ".join(topic_words))
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.axis("off")
    plt.title(f"Topic {topic_num} Word Cloud")
    plt.show()
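
The function takes a plain list of words, so it can be called directly on any filtered topic (the words below are placeholders):

generate_wordcloud(["embodiment", "skill", "coping", "expertise", "intuition"], topic_num=0)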

Main Execution

file_paths = [f"/home/roomal/Desktop/Dreyfus-Project/Dreyfus/{fname}" for fname in os.listdir("/home/roomal/Desktop/Dreyfus-Project/Dreyfus/") if fname.endswith(".pdf")]

authors, titles, all_sentences = process_files(file_paths)

output_file = "/home/roomal/Desktop/Dreyfus-Project/Dreyfus_Papers.csv"
save_data_to_csv(authors, titles, file_paths, output_file)

stopwords_filepath = "/home/roomal/Documents/Lists/stopwords.txt"
stopwords = load_stopwords(stopwords_filepath)

try:
    topic_model = Top2Vec(
        all_sentences,
        embedding_model="distiluse-base-multilingual-cased",
        speed="deep-learn",
        workers=6
    )
    print("Top2Vec model created successfully.")
except ValueError as e:
    print(f"Error initializing Top2Vec: {e}")
except Exception as e:
    print(f"Unexpected error: {e}")

num_topics = topic_model.get_num_topics()
topic_words, word_scores, topic_nums = topic_model.get_topics(num_topics)
filtered_topic_words = filter_stopwords_from_topics(topic_words, stopwords)

for i, words in enumerate(filtered_topic_words):
    print(f"Topic {i}: {', '.join(words)}")

keywords = ["heidegger"]
topic_words, word_scores, topic_scores, topic_nums = topic_model.search_topics(keywords=keywords, num_topics=num_topics)
filtered_search_topic_words = filter_stopwords_from_topics(topic_words, stopwords)

for i, words in enumerate(filtered_search_topic_words):
    generate_wordcloud(words, topic_nums[i])

Reducing the Number of Topics

reduced_num_topics = 5
topic_mapping = topic_model.hierarchical_topic_reduction(num_topics=reduced_num_topics)

# Print reduced topics and generate word clouds
for i in range(reduced_num_topics):
    topic_words = topic_model.topic_words_reduced[i]
    filtered_words = [word for word in topic_words if word.lower() not in stopwords]
    print(f"Reduced Topic {i}: {', '.join(filtered_words)}")
    generate_wordcloud(filtered_words, i)
