Natural Language Processing with Sets and Sequences
CS 5001/5002 - Strings, Sequences & Sets
Code
#!/usr/bin/env python3
"""
Filename: nlp_analysis.py
Description: Natural Language Processing with Sets and Sequences
CS 5001/5002 - Strings, Sequences & Sets
This script demonstrates natural language processing techniques using
sets and sequences for document analysis and text mining.
"""
def analyze_documents():
    """Analyze a fixed set of sample documents using set operations.

    Each hard-coded sentence is lowercased, stripped of commas and periods,
    and split on whitespace. The function then prints:

    * every document with its sorted word list and unique-word count,
    * the words shared by all documents (set intersection),
    * the combined vocabulary and its size (set union),
    * the words that occur in exactly one document (set difference), and
    * for each target word, the 1-based numbers of the documents that
      contain it.

    Returns:
        None. All results are written to standard output.
    """
    documents = [
        "Python is a powerful programming language for data science",
        "Data science requires strong programming and mathematical skills",
        "Mathematical foundations are essential for computer science",
    ]
    print("Document Analysis:")
    print("================")

    # Process each document: one set of unique words per document.
    doc_words = []
    for i, doc in enumerate(documents):
        # Convert to lowercase and split into words
        words = doc.lower().replace(",", "").replace(".", "").split()
        doc_words.append(set(words))  # Convert to set for analysis
        print(f"Document {i+1}: '{doc}'")
        print(f" Words: {sorted(words)}")
        print(f" Unique words: {len(doc_words[i])}")
        print()

    # Set operations on documents
    print("Cross-Document Analysis:")
    print("=======================")

    # Words in all documents (intersection).
    # Build a NEW set here. Seeding with `doc_words[0]` and applying `&=`
    # would shrink document 1's own set in place and corrupt the union,
    # uniqueness and target-word steps below.
    common_words = set.intersection(*doc_words)
    print(f"Words in ALL documents: {sorted(common_words)}")

    # Words in any document (union)
    all_words = set().union(*doc_words)
    print(f"ALL unique words: {sorted(all_words)}")
    print(f"Total vocabulary size: {len(all_words)}")

    # Words unique to each document: remove everything seen in the others.
    print("\nWords unique to each document:")
    for i, word_set in enumerate(doc_words):
        others = set().union(
            *(other for j, other in enumerate(doc_words) if j != i)
        )
        unique = word_set - others
        print(f" Document {i+1} only: {sorted(unique)}")

    # Find documents sharing specific words
    target_words = {"programming", "science", "data"}
    print("\nDocuments containing each target word:")
    # Sort so the report order is stable; raw set iteration order for
    # strings varies between interpreter runs.
    for word in sorted(target_words):
        containing_docs = [
            i + 1 for i, word_set in enumerate(doc_words) if word in word_set
        ]
        print(f" '{word}': Documents {containing_docs}")
def main():
    """Script entry point: run the document-analysis demonstration."""
    analyze_documents()
# Run the demonstration only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
How to Use
- Copy the code above
- Save it as a `.py` file (e.g., `nlp_analysis.py`)
- Run it with: `python nlp_analysis.py`
Part of CS 5001/5002 - Strings, Sequences & Sets materials