Natural Language Processing with Sets and Sequences

CS 5001/5002 - Strings, Sequences & Sets

Code

#!/usr/bin/env python3
"""
Filename: nlp_analysis.py
Description: Natural Language Processing with Sets and Sequences
CS 5001/5002 - Strings, Sequences & Sets

This script demonstrates natural language processing techniques using
sets and sequences for document analysis and text mining.
"""

def analyze_documents():
    """Analyze text documents using sets and sequences"""

    documents = [
        "Python is a powerful programming language for data science",
        "Data science requires strong programming and mathematical skills",
        "Mathematical foundations are essential for computer science"
    ]

    print("Document Analysis:")
    print("================")

    # Process each document
    doc_words = []
    for i, doc in enumerate(documents):
        # Lowercase, strip commas and periods, then split into words
        words = doc.lower().replace(",", "").replace(".", "").split()
        doc_words.append(set(words))  # Convert to set for analysis

        print(f"Document {i+1}: '{doc}'")
        print(f"  Words: {sorted(words)}")
        print(f"  Unique words: {len(doc_words[i])}")
        print()
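
    # A possible sequence-oriented addition: the ordered word lists (not just
    # the word sets) also support n-gram analysis. As a small sketch, print
    # the first three word bigrams of each document.
    print("Word bigrams (order-preserving view):")
    for i, doc in enumerate(documents):
        ordered_words = doc.lower().split()
        bigrams = list(zip(ordered_words, ordered_words[1:]))
        print(f"  Document {i+1}: {bigrams[:3]}")
    print()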

    # Set operations on documents
    print("Cross-Document Analysis:")
    print("=======================")

    # Words in all documents (intersection)
    common_words = set(doc_words[0])  # copy first so &= does not mutate doc_words[0]
    for word_set in doc_words[1:]:
        common_words &= word_set
    print(f"Words in ALL documents: {sorted(common_words)}")

    # Words in any document (union)
    all_words = set()
    for word_set in doc_words:
        all_words |= word_set
    print(f"ALL unique words: {sorted(all_words)}")
    print(f"Total vocabulary size: {len(all_words)}")

    # Words unique to each document
    print(f"\nWords unique to each document:")
    for i, word_set in enumerate(doc_words):
        others = set()
        for j, other_set in enumerate(doc_words):
            if i != j:
                others |= other_set
        unique = word_set - others
        print(f"  Document {i+1} only: {sorted(unique)}")

    # Find documents sharing specific words
    target_words = {"programming", "science", "data"}
    print(f"\nDocuments containing each target word:")
    for word in sorted(target_words):  # sort for a stable, repeatable output order
        containing_docs = []
        for i, word_set in enumerate(doc_words):
            if word in word_set:
                containing_docs.append(i + 1)
        print(f"  '{word}': Documents {containing_docs}")

def main():
    analyze_documents()

if __name__ == "__main__":
    main()

How to Use

  1. Copy the code above
  2. Save it as a .py file (e.g., nlp_analysis.py)
  3. Run it with: python nlp_analysis.py

Part of CS 5001/5002 - Strings, Sequences & Sets materials