# Data-Analytics-Problems/problem6.py at main · Abudidayo/Data-Analytics-Problems · GitHub

# (Line-number gutter left over from the web-page extraction removed.)
# Submission metadata: the problem identifier plus the student's name and number.
problem = "problem6"
student_name = "Abudi Alshamam"
# NOTE(review): "student_numer" looks like a misspelling of "student_number";
# the identifier is left untouched in case external tooling reads it — confirm.
student_numer = "N1212353"

import os
import random
from collections import Counter, defaultdict, deque

import pandas as pd


# Characters stripped from both ends of every whitespace-separated token.
_EDGE_PUNCTUATION = ".,!?;:\"'()[]{}"

# Upper bound on the number of words in a generated sentence (option 8).
_MAX_SENTENCE_WORDS = 20

_MENU = '''
Options:
1) Return a count of the number of distinct words in the text
2) Return the most frequent word in the text, along with its frequency
3) Return the word that has the largest number of unique neighbours
4) Return the word that has the smallest number of unique neighbours
5) Other descriptive statistics using Pandas DataFrames or Series
6) Find the shortest path between two words
7) Exit
8) Generate a random sentence using DFS from the word graph
'''


def _tokenise(text):
    """Return the lowercase words of *text* with edge punctuation stripped.

    Tokens that consist solely of punctuation strip down to the empty string
    and are dropped, so "" never shows up as a word.
    """
    stripped = (token.strip(_EDGE_PUNCTUATION) for token in text.lower().split())
    return [word for word in stripped if word]


def _build_graph(words):
    """Map every word to the set of words that directly follow it.

    Every word gets a key (possibly with an empty set), so membership tests
    and lookups never depend on defaultdict auto-insertion.
    """
    graph = {word: set() for word in words}
    for current, following in zip(words, words[1:]):
        graph[current].add(following)
    return graph


def _shortest_path(graph, start, goal):
    """Return the shortest path from *start* to *goal* as a list, or None.

    Breadth-first search. Nodes are recorded in ``came_from`` when they are
    first discovered, so each word is queued at most once, and the path is
    rebuilt from the parent links instead of copying a list per queue entry.
    """
    came_from = {start: None}
    queue = deque([start])
    while queue:
        current = queue.popleft()
        if current == goal:
            path = []
            while current is not None:
                path.append(current)
                current = came_from[current]
            path.reverse()
            return path
        for neighbour in graph[current]:
            if neighbour not in came_from:
                came_from[neighbour] = current
                queue.append(neighbour)
    return None


def _random_walk(graph, start, max_words=_MAX_SENTENCE_WORDS):
    """Follow random successors from *start* and return the visited words.

    The walk ends when it would revisit a word, when the current word has no
    successor, or when *max_words* words have been collected.
    """
    sentence = []
    seen = set()
    word = start
    while word not in seen and len(sentence) < max_words:
        seen.add(word)
        sentence.append(word)
        successors = graph[word]
        if not successors:
            break
        # Sorting removes the dependence on set iteration order, so a seeded
        # random generator reproduces the same sentence across runs.
        word = random.choice(sorted(successors))
    return sentence


def text_analyser(user_text: str) -> None:
    """Run an interactive analysis session on a text from ``gutenberg20``.

    The file ``gutenberg20/<user_text>`` (relative to the current working
    directory) is read, split into lowercase words, and turned into a
    word-successor graph plus a DataFrame of per-word frequency and number
    of unique successors. A menu is then shown repeatedly until the user
    picks option 7.

    Args:
        user_text: File name of the text inside the ``gutenberg20`` directory.

    Returns:
        None. All results are printed. If the file does not exist, a message
        is printed and the function returns without showing the menu.
    """
    file_path = os.path.join("gutenberg20", user_text)
    try:
        # "utf-8-sig" also accepts plain UTF-8 but drops a leading BOM, which
        # would otherwise stick to the first word of the text.
        with open(file_path, "r", encoding="utf-8-sig") as file:
            text = file.read()
    except FileNotFoundError:
        print(f"File {user_text} not found in directory 'gutenberg20'.")
        return

    words = _tokenise(text)
    graph = _build_graph(words)

    # One row per distinct word, in order of first appearance.
    word_counts = Counter(words)
    df = pd.DataFrame({
        "Word": list(word_counts),
        "Frequency": list(word_counts.values()),
        "Unique Neighbours": [len(graph[word]) for word in word_counts],
    })

    while True:
        print(_MENU)
        option = input("Enter the number corresponding to your choice: ").strip()

        # These options need at least one word: idxmax/idxmin fail on an
        # empty column and random.choice fails on an empty list.
        if option in ("2", "3", "4", "8") and not words:
            print("The text contains no words, so this option is unavailable.")
            continue

        if option == "1":
            print(f"Number of distinct words: {len(word_counts)}")

        elif option == "2":
            most_frequent = df.loc[df["Frequency"].idxmax()]
            print(f"Most frequent word: '{most_frequent['Word']}' with frequency {most_frequent['Frequency']}")

        elif option == "3":
            largest_neighbours = df.loc[df["Unique Neighbours"].idxmax()]
            print(f"Word with the largest number of unique neighbours: '{largest_neighbours['Word']}' ({largest_neighbours['Unique Neighbours']} neighbours)")

        elif option == "4":
            smallest_neighbours = df.loc[df["Unique Neighbours"].idxmin()]
            print(f"Word with the smallest number of unique neighbours: '{smallest_neighbours['Word']}' ({smallest_neighbours['Unique Neighbours']} neighbours)")

        elif option == "5":
            print("Descriptive statistics:")
            print(df.describe())

        elif option == "6":
            word1 = input("Enter the first word: ").strip().lower()
            word2 = input("Enter the second word: ").strip().lower()

            if word1 not in graph or word2 not in graph:
                print("One or both words are not in the text.")
                continue

            path = _shortest_path(graph, word1, word2)
            if path is None:
                print("No such path exists.")
            else:
                print(f"Shortest path: {' -> '.join(path)}")

        elif option == "7":
            print("Exiting the program.")
            break

        elif option == "8":
            start_word = input("Enter a start word (or press Enter to choose randomly): ").strip().lower()

            if not start_word:
                start_word = random.choice(list(graph))
                print(f"Randomly chosen start word: {start_word}")
            elif start_word not in graph:
                print("The chosen start word is not in the text.")
                continue

            sentence = _random_walk(graph, start_word)
            print(f"Generated sentence: {' '.join(sentence)}")

        else:
            print("Invalid option. Please try again.")

# Starts the interactive session for "RobinHood.txt", which is looked up inside
# the "gutenberg20" directory relative to the current working directory.
# NOTE(review): this call is not behind an `if __name__ == "__main__":` guard,
# so importing this module also starts the session.
text_analyser("RobinHood.txt")