mirea-projects/Third term/Discrete math/1.ipynb

246 lines
6.8 KiB
Plaintext
Raw Permalink Normal View History

2024-09-23 23:22:33 +00:00
{
"cells": [
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [],
"source": [
"import math\n",
"import string\n",
"import unicodedata\n",
"from collections import Counter"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Этап 1: Чтение текста из файла"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [],
"source": [
"# Read the entire input file into one string; every later stage works on `text`.\n",
"with open('some_text.txt', mode='r', encoding='utf-8') as source_file:\n",
"    text = source_file.read()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Этап 2: Привести все слова к нижнему регистру, удалить знаки препинания и пробелы"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [],
"source": [
"text = text.lower()  # Convert everything to lower case\n",
"\n",
"# Remove punctuation and whitespace so that only the symbols of the text remain.\n",
"# string.punctuation covers ASCII only, so Unicode punctuation (guillemets, dashes,\n",
"# ellipsis, typographic quotes: general category 'P*') is removed explicitly, and\n",
"# str.isspace() catches every kind of whitespace (tabs, non-breaking spaces, line\n",
"# breaks), not just the ordinary space.\n",
"ascii_punctuation = set(string.punctuation)\n",
"text = ''.join(\n",
"    char\n",
"    for char in text\n",
"    if not (\n",
"        char in ascii_punctuation\n",
"        or char.isspace()\n",
"        or unicodedata.category(char).startswith('P')\n",
"    )\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [],
"source": [
"# Persist the normalised text so the cleaned input can be inspected or reused.\n",
"with open('new_text.txt', mode='w', encoding='utf-8') as output_file:\n",
"    output_file.write(text)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Этап 3: Подсчитать частоту появления однобуквенных и двухбуквенных сочетаний"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [],
"source": [
"def calculate_frequencies(text):\n",
"    \"\"\"Count single characters and overlapping two-character sequences.\n",
"\n",
"    Args:\n",
"        text: The string to analyse.\n",
"\n",
"    Returns:\n",
"        A ``(single_char_freq, bigram_freq)`` tuple of ``Counter`` objects. The\n",
"        first maps each character to its number of occurrences, the second maps\n",
"        each pair of adjacent characters to its number of occurrences.\n",
"    \"\"\"\n",
"    char_counts = Counter(text)\n",
"\n",
"    # Slide a window of width two over the text; a text of length n yields\n",
"    # n - 1 overlapping bigrams (none at all when n < 2).\n",
"    pair_counts = Counter()\n",
"    for start in range(len(text) - 1):\n",
"        pair_counts[text[start:start + 2]] += 1\n",
"\n",
"    return char_counts, pair_counts"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [],
"source": [
"# Frequency tables of the normalised text, used by the stages that follow.\n",
"single_char_freq, bigram_freq = calculate_frequencies(text=text)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Этап 4: Определение энтропии"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [],
"source": [
"def calculate_entropy(freq_dict, total_count=None):\n",
"    \"\"\"Return the Shannon entropy, in bits, of an empirical distribution.\n",
"\n",
"    Args:\n",
"        freq_dict: Mapping from symbol to its number of occurrences.\n",
"        total_count: Denominator used to turn counts into probabilities.\n",
"            Defaults to the sum of the positive counts in ``freq_dict``.\n",
"\n",
"    Returns:\n",
"        The entropy ``-sum(p * log2(p))`` as a float; ``0.0`` when there are\n",
"        no positive counts.\n",
"    \"\"\"\n",
"    # Non-positive counts contribute nothing (p * log2(p) tends to 0 as p -> 0),\n",
"    # whereas math.log2(0) itself raises ValueError, so they are skipped.\n",
"    counts = [count for count in freq_dict.values() if count > 0]\n",
"    if total_count is None:\n",
"        total_count = sum(counts)\n",
"\n",
"    entropy = 0.0\n",
"    for count in counts:\n",
"        probability = count / total_count\n",
"        entropy -= probability * math.log2(probability)\n",
"    return entropy"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Энтропия на одну букву: 3.9961505474490147\n",
"Энтропия на двухбуквенное сочетание: 7.389736513198184\n"
]
}
],
"source": [
"# Unigram probabilities are taken over all characters of the text, bigram\n",
"# probabilities over its len(text) - 1 overlapping pairs.\n",
"total_chars = len(text)\n",
"total_bigrams = total_chars - 1\n",
"single_char_entropy = calculate_entropy(single_char_freq, total_chars)\n",
"bigram_entropy = calculate_entropy(bigram_freq, total_bigrams)\n",
"\n",
"print(f\"Энтропия на одну букву: {single_char_entropy}\")\n",
"print(f\"Энтропия на двухбуквенное сочетание: {bigram_entropy}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Этап 5: Длина кода и избыточность"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Длина кода при равномерном кодировании: 4.459431618637297\n",
"Избыточность: 0.46328107118828266\n"
]
}
],
"source": [
"# With N distinct symbols a uniform (fixed-length) code spends log2(N) bits per\n",
"# symbol; redundancy is computed as the gap between that and the entropy.\n",
"alphabet_size = len(single_char_freq)\n",
"uniform_code_length = math.log2(alphabet_size)\n",
"redundancy = uniform_code_length - single_char_entropy\n",
"\n",
"print(\n",
"    f\"Длина кода при равномерном кодировании: {uniform_code_length}\",\n",
"    f\"Избыточность: {redundancy}\",\n",
"    sep=\"\\n\",\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Этап 6: Удаление 20% наиболее частых символов"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Энтропия после удаления 20% самых частых символов: 3.7022435891926633\n"
]
}
],
"source": [
"# Number of distinct symbols that make up 20% of the alphabet (rounded down).\n",
"num_to_remove = int(len(single_char_freq) * 0.2)\n",
"most_common = [char for char, _ in single_char_freq.most_common(num_to_remove)]\n",
"\n",
"# Filter the text against a set of the selected symbols.\n",
"frequent_chars = set(most_common)\n",
"text_without_frequent = ''.join(char for char in text if char not in frequent_chars)\n",
"\n",
"# Recompute the per-symbol entropy on the reduced text.\n",
"new_single_char_freq, _ = calculate_frequencies(text_without_frequent)\n",
"remaining_length = len(text_without_frequent)\n",
"new_single_char_entropy = calculate_entropy(new_single_char_freq, remaining_length)\n",
"\n",
"print(f\"Энтропия после удаления 20% самых частых символов: {new_single_char_entropy}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Этап 7: Удаление 20% наиболее редких символов"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Энтропия после удаления 20% самых редких символов: 3.9045244598725413\n"
]
}
],
"source": [
"# Number of distinct symbols that make up 20% of the alphabet (rounded down).\n",
"num_to_remove = int(len(single_char_freq) * 0.2)\n",
"\n",
"# most_common() sorts by descending count, so walking it backwards from the end\n",
"# yields the rarest symbols first; the slice is empty when num_to_remove is 0.\n",
"least_common = [char for char, _ in single_char_freq.most_common()[:-num_to_remove-1:-1]]\n",
"\n",
"# Filter the text against a set of the selected symbols.\n",
"rare_chars = set(least_common)\n",
"text_without_rare = ''.join(char for char in text if char not in rare_chars)\n",
"\n",
"# Recompute the per-symbol entropy on the reduced text.\n",
"new_single_char_freq, _ = calculate_frequencies(text_without_rare)\n",
"remaining_length = len(text_without_rare)\n",
"new_single_char_entropy = calculate_entropy(new_single_char_freq, remaining_length)\n",
"\n",
"print(f\"Энтропия после удаления 20% самых редких символов: {new_single_char_entropy}\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.0"
}
},
"nbformat": 4,
"nbformat_minor": 2
}