Skip to content

Data Processing - Sequences to Sets and Back

CS 5001/5002 - Strings, Sequences & Sets

Code

#!/usr/bin/env python3
"""
Filename: data_processing.py
Description: Data Processing - Sequences to Sets and Back
CS 5001/5002 - Strings, Sequences & Sets

This script demonstrates practical data processing using sequences and sets
for enrollment analysis and deduplication.
"""

def process_student_data(cs5001_students=None, cs5002_students=None):
    """Demonstrate data processing using sequences and sets.

    Prints the raw enrollment sequences, their de-duplicated set forms,
    the results of the set operations between the two classes, summary
    statistics, and finally alphabetically sorted lists for reporting.

    Args:
        cs5001_students: Iterable of student names enrolled in CS 5001.
            Duplicates are allowed. Names must be hashable and mutually
            comparable (e.g. strings). Defaults to the built-in sample
            roster when omitted or None.
        cs5002_students: Iterable of student names enrolled in CS 5002,
            with the same requirements. Defaults to the built-in sample
            roster when omitted or None.

    Returns:
        None. All results are written to standard output.
    """
    # None is used as the default (instead of a list literal) so that no
    # mutable object is shared between calls. Caller-supplied iterables are
    # copied into lists so they can be printed and traversed more than once.
    if cs5001_students is None:
        cs5001_students = ["Alice", "Bob", "Charlie", "Diana", "Alice", "Eve"]
    else:
        cs5001_students = list(cs5001_students)

    if cs5002_students is None:
        cs5002_students = ["Bob", "Charlie", "Frank", "Grace", "Alice"]
    else:
        cs5002_students = list(cs5002_students)

    print("Original enrollment lists (sequences):")
    print(f"CS 5001: {cs5001_students}")
    print(f"CS 5002: {cs5002_students}")

    # Convert to sets for analysis: duplicates within a class collapse.
    cs5001_set = set(cs5001_students)
    cs5002_set = set(cs5002_students)

    # Sets are unordered, so the order in which they are displayed is
    # arbitrary; the sorted lists printed at the end are the stable form.
    print("\nUnique students per class (sets):")
    print(f"CS 5001: {cs5001_set}")
    print(f"CS 5002: {cs5002_set}")

    # Set operations for analysis
    both_classes = cs5001_set & cs5002_set   # intersection
    only_5001 = cs5001_set - cs5002_set      # difference
    only_5002 = cs5002_set - cs5001_set      # difference (other direction)
    all_students = cs5001_set | cs5002_set   # union

    print("\nEnrollment analysis:")
    print(f"Taking both classes: {both_classes}")
    print(f"Only CS 5001: {only_5001}")
    print(f"Only CS 5002: {only_5002}")
    print(f"All students: {all_students}")

    # Statistics. With two empty rosters there is nobody to take a share
    # of, so report 0.0% instead of dividing by zero.
    if all_students:
        percent_both = len(both_classes) / len(all_students) * 100
    else:
        percent_both = 0.0

    print("\nStatistics:")
    print(f"Total unique students: {len(all_students)}")
    print(f"Students in both classes: {len(both_classes)}")
    print(f"Percentage taking both: {percent_both:.1f}%")

    # Convert back to sorted lists for reporting
    print("\nSorted lists for reports:")
    print(f"All students (alphabetical): {sorted(all_students)}")
    print(f"Both classes (alphabetical): {sorted(both_classes)}")

def main():
    """Script entry point: run the enrollment-analysis demonstration."""
    process_student_data()

# Run the demo only when this file is executed as a script; importing it
# as a module defines the functions without producing any output.
if __name__ == "__main__":
    main()

How to Use

  1. Copy the code above
  2. Save it as a .py file (e.g., data_processing.py)
  3. Run it with: python3 data_processing.py (requires Python 3.6 or newer)

Part of CS 5001/5002 - Strings, Sequences & Sets materials