-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathforward_algorithm.py
More file actions
128 lines (99 loc) · 4.48 KB
/
forward_algorithm.py
File metadata and controls
128 lines (99 loc) · 4.48 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
import json
def read_json(fn):
"""Convert a json corpus to a dictionary
Args:
fn (str): Name of the json file to open
Returns:
dict: Converted json to dictionary consisting word counts
"""
with open(fn) as json_file:
data = json.load(json_file)
return data
def compute_start_probability(s_word, category):
"""Generate the start probability of the sentence
Args:
s_word (str): First word of the sentence
category (str): Lexical category of the word to check
Returns:
int: Starting probabilities
"""
start_prob = lex_prob(s_word, category) * \
compute_bigram_prob("empty", category)
show_results([(s_word, category, start_prob)])
return start_prob
def lex_prob(word, category):
"""Compute lexical generation probabilities from corpus
Args:
word (str): A word from the sentence
category (str): Lexical category of the word to compute
Returns:
int: Lexical generation probability of the given word
"""
return corpus["category_word_counts"][word][category] / \
corpus["category_word_counts"]["total"][category]
def compute_bigram_prob(pre_cat, current_cat):
"""Bigram probability from the sentence
Args:
pre_cat (str): Lexical category of the preceding word
current_cat (str): Lexical category of the curren word in the sentence
Returns:
int: Bigram probability of two words, occuring together
"""
if pre_cat not in corpus["word_pair_count"][current_cat]:
return 0
return corpus["word_pair_count"][current_cat][pre_cat] / corpus["total_counts"][pre_cat]
def normalize(word_probs, total_prob):
"""Normalize the probability of lexical category of the word in the sentence by dividing computed probabilities over the overall probability
Args:
word_probs (list): Probability of each possibile combination of lexical categories occuring after another
total_prob (int): Overall probability of the sequence at each full iteration
Returns:
int: Normalized probability of the word, i.e. the computed lexical category probabilities of the word
"""
probs_by_category = []
for i in word_probs:
probs_by_category.append((i[0], i[1], i[2] / total_prob))
return probs_by_category
def show_results(res):
for i in res:
print(f"{i[0]} ({i[1]}): {round(i[2], 3)}")
def est_forward(sentence, verbose=False):
"""Run forward algorithm. Thorough two iterations, it'll first go over each word then the word preceding it and then computes their probabilities.
Then it'll
Args:
sentence ([type]): [description]
verbose (bool, optional): [description]. Defaults to False.
"""
sequence_prob = 0
for word in sentence[1:]:
word_probs = []
lex_seq_probs = []
for i in corpus["category_word_counts"][word]:
seq_sum = 0
for j in corpus["category_word_counts"][sentence[sentence.index(word)-1]]:
# Compute probability of the sentence to begin with a lexical category. Will run only for the first word
s_prob = compute_start_probability(sentence[0], j) if sentence.index(
word) <= 1 else sequence_prob
lexical_gen_prob = lex_prob(
sentence[sentence.index(word)], i)
bigram_prob = compute_bigram_prob(j, i)
seq_prob = bigram_prob * s_prob * lexical_gen_prob
if verbose:
print("\n"
f"P({word}|{i} {sentence[sentence.index(word)-1]}|{j})=P({i}|{j})*P({' '.join(sentence[:sentence.index(word)])})*P({word}|{i})")
print(
f"{round(bigram_prob, 5)}*{round(seq_sum, 5)}*{round(lexical_gen_prob, 5)} = {sequence_prob}")
seq_sum += seq_prob
word_probs.append((word, i, seq_sum))
lex_seq_probs.append(seq_sum)
sequence_prob = sum(lex_seq_probs)
res = normalize(word_probs, sequence_prob)
show_results(res)
# We can calculate lexical probabilities for other words as well, if we expand our corpus enough to cover those words.
# Words that are not in the corpus would cause the program to crash!
s1 = "the flies like flower".split()
s2 = "the flies flower like".split()
# This will be used mostly for lexical generation probabilities
corpus = read_json("corpus.json")
# Check to True to see all the hidden calculations
est_forward(s1, False)