-
-
Notifications
You must be signed in to change notification settings - Fork 50.2k
Add tfidf #14406
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add tfidf #14406
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,97 @@ | ||
| import numpy as np | ||
|
Check failure on line 1 in machine_learning/feature_extraction/tf-idf.py
|
||
| import re | ||
|
Check failure on line 2 in machine_learning/feature_extraction/tf-idf.py
|
||
| # Separate the text into words and normalize it | ||
|
|
||
|
|
||
def decompose(text: str) -> list[str]:
    """
    Normalize ``text`` and split it into a list of word tokens.

    The text is lower-cased, every character other than ``a-z``, ``0-9`` and
    whitespace is removed (so ``"don't"`` becomes ``"dont"``), and the result
    is split on runs of whitespace.

    Args:
        text: The raw document to tokenize.

    Returns:
        The tokens in their original order; an empty list when ``text``
        contains no letters or digits.

    >>> decompose("The cat, the DOG!")
    ['the', 'cat', 'the', 'dog']
    >>> decompose("  wide    gaps   ")
    ['wide', 'gaps']
    >>> decompose("don't stop")
    ['dont', 'stop']
    >>> decompose("")
    []
    """
    cleaned = re.sub(r"[^a-z0-9\s]", "", text.lower())
    # str.split() with no separator already collapses whitespace runs and
    # ignores leading/trailing whitespace, so no extra normalization is needed.
    return cleaned.split()
|
|
||
|
|
||
| # TF-IDF vectorizer class | ||
class TfIdfVectorizer:
    """
    Minimal TF-IDF vectorizer built on top of ``decompose`` tokenization.

    Term frequency is a word's count divided by the number of tokens in its
    document. Inverse document frequency uses the smoothed form
    ``ln((n + 1) / (df + 1)) + 1`` where ``n`` is the number of documents and
    ``df`` the number of documents containing the word.

    Attributes set by ``compute_tf``:
        vocab: Words in first-seen order; column ``i`` of every returned
            matrix corresponds to ``vocab[i]``.
        idf: Mapping from each vocabulary word to its IDF weight.

    >>> vectorizer = TfIdfVectorizer()
    >>> matrix = vectorizer.compute_tf(["the cat sat", "the dog"])
    >>> vectorizer.vocab
    ['the', 'cat', 'sat', 'dog']
    >>> matrix.shape
    (2, 4)
    >>> round(float(matrix[0][0]), 4)  # "the" is in every document: idf == 1
    0.3333
    """

    def __init__(self) -> None:
        """Create an unfitted vectorizer; ``vocab`` and ``idf`` start as None."""
        self.vocab: list[str] | None = None
        self.idf: dict[str, float] | None = None

    @staticmethod
    def _term_frequencies(words: list[str]) -> dict[str, float]:
        """
        Return each distinct word's count divided by ``len(words)``.

        Keys keep first-seen order. An empty token list yields an empty dict,
        so no division by zero can occur.
        """
        counts: dict[str, int] = {}
        for word in words:
            counts[word] = counts.get(word, 0) + 1
        total = len(words)
        return {word: count / total for word, count in counts.items()}

    def compute_tf(self, data: list[str]) -> np.ndarray:
        """
        Fit the vocabulary and IDF weights on ``data`` and return its TF-IDF.

        Despite its name, this method does the whole fit: it stores
        ``self.vocab`` and ``self.idf`` and returns the TF-IDF matrix of the
        documents it was fitted on.

        Args:
            data: The documents (raw strings) to fit on.

        Returns:
            A float array with one row per document and one column per
            vocabulary word. For an empty ``data`` the array has shape
            ``(0,)``.

        >>> vectorizer = TfIdfVectorizer()
        >>> vectorizer.compute_tf(["a b", "a"]).shape
        (2, 2)
        >>> vectorizer.vocab
        ['a', 'b']
        """
        tf = [self._term_frequencies(decompose(document)) for document in data]

        # Count, in one pass, how many documents contain each word. The dict
        # doubles as the vocabulary: its keys are in first-seen order and
        # membership is O(1), unlike scanning a growing list.
        doc_freq: dict[str, int] = {}
        for doc_tf in tf:
            for word in doc_tf:
                doc_freq[word] = doc_freq.get(word, 0) + 1

        n = len(data)
        # Smoothed IDF: the +1 terms avoid division by zero and keep words
        # that occur in every document from being weighted down to zero.
        idf = {
            word: np.log((n + 1) / (1 + count)) + 1
            for word, count in doc_freq.items()
        }
        vocab = list(doc_freq)

        self.idf = idf
        self.vocab = vocab

        tfidf = [[doc_tf.get(word, 0) * idf[word] for word in vocab] for doc_tf in tf]
        return np.array(tfidf, dtype=float)

    def encode(self, data: list[str]) -> np.ndarray:
        """
        Transform ``data`` into TF-IDF vectors using the fitted vocabulary.

        Words outside the vocabulary contribute no column, but they still
        count towards the document length used to normalize term frequency.

        Args:
            data: The documents (raw strings) to transform.

        Returns:
            A float array with one row per document and one column per
            vocabulary word.

        Raises:
            ValueError: If ``compute_tf`` has not been called yet.

        >>> vectorizer = TfIdfVectorizer()
        >>> _ = vectorizer.compute_tf(["the cat sat", "the dog"])
        >>> vectorizer.encode(["the bird"]).shape
        (1, 4)
        >>> TfIdfVectorizer().encode(["the bird"])
        Traceback (most recent call last):
            ...
        ValueError: You should fit the model first
        """
        if self.vocab is None or self.idf is None:
            raise ValueError("You should fit the model first")

        tfidf_matrix = []
        for doc in data:
            doc_tf = self._term_frequencies(decompose(doc))
            # Align the vector with the vocabulary order and weight by IDF.
            tfidf_matrix.append(
                [doc_tf.get(word, 0) * self.idf[word] for word in self.vocab]
            )

        return np.array(tfidf_matrix, dtype=float)
|
|
||
|
|
||
if __name__ == "__main__":
    # Small demo: fit the vectorizer on two sentences and print the result.
    vectorizer = TfIdfVectorizer()
    tfidf_matrix = vectorizer.compute_tf(
        ["the cat sat on the mat", "the dog chased the cat"]
    )
    print("Vocabulary:", vectorizer.vocab)
    print("TF-IDF Matrix:\n", tfidf_matrix)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
As there is no test file in this pull request nor any test function or class in the file
machine_learning/feature_extraction/tf-idf.py, please provide a doctest for the function decompose. Please provide a return type hint for the function:
decompose. If the function does not return a value, please provide the type hint as: def function() -> None: Please provide a type hint for the parameter:
text