{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import math\n",
    "import string\n",
    "import unicodedata\n",
    "from collections import Counter"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Этап 1: Чтение текста из файла"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep the untouched file contents under their own name so the cleaning step can be re-run.\n",
    "with open('some_text.txt', 'r', encoding='utf-8') as file:\n",
    "    text_raw = file.read()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Этап 2: Привести все слова к нижнему регистру, удалить знаки препинания и пробелы"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "ASCII_PUNCTUATION = frozenset(string.punctuation)\n",
    "\n",
    "\n",
    "def clean_text(raw_text):\n",
    "    \"\"\"Lowercase raw_text and drop all punctuation and whitespace characters.\n",
    "\n",
    "    A character is removed when it is whitespace (str.isspace), is listed in\n",
    "    string.punctuation, or belongs to a Unicode punctuation category (P*).\n",
    "    The Unicode check is needed because string.punctuation is ASCII-only, so\n",
    "    characters such as « » — … would otherwise be counted as letters.\n",
    "\n",
    "    Returns the cleaned text as a new string; digits and letters are kept.\n",
    "    \"\"\"\n",
    "    return ''.join(\n",
    "        char\n",
    "        for char in raw_text.lower()\n",
    "        if not char.isspace()\n",
    "        and char not in ASCII_PUNCTUATION\n",
    "        and not unicodedata.category(char).startswith('P')\n",
    "    )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "text = clean_text(text_raw)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save the cleaned text next to the notebook.\n",
    "with open('new_text.txt', 'w', encoding='utf-8') as file:\n",
    "    file.write(text)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Этап 3: Подсчитать частоту появления однобуквенных и двухбуквенных сочетаний"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def calculate_frequencies(text):\n",
    "    \"\"\"Count single characters and overlapping two-character sequences in text.\n",
    "\n",
    "    Returns a (single_char_freq, bigram_freq) pair of Counter objects.\n",
    "    \"\"\"\n",
    "    single_char_freq = Counter(text)\n",
    "    # Pairing every character with its successor yields the len(text) - 1 overlapping bigrams.\n",
    "    bigram_freq = Counter(first + second for first, second in zip(text, text[1:]))\n",
    "    return single_char_freq, bigram_freq"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "single_char_freq, bigram_freq = calculate_frequencies(text)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Этап 4: Определение энтропии"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def calculate_entropy(freq_dict, total_count=None):\n",
    "    \"\"\"Return the Shannon entropy, in bits, of the distribution given by freq_dict.\n",
    "\n",
    "    freq_dict: mapping from a symbol to its number of occurrences.\n",
    "    total_count: denominator used to turn counts into probabilities;\n",
    "        defaults to the sum of all counts in freq_dict.\n",
    "\n",
    "    An empty freq_dict gives 0.0.\n",
    "    \"\"\"\n",
    "    if total_count is None:\n",
    "        total_count = sum(freq_dict.values())\n",
    "    entropy = 0.0\n",
    "    for freq in freq_dict.values():\n",
    "        if freq == 0:\n",
    "            # A symbol that never occurs contributes nothing, and log2(0) would raise.\n",
    "            continue\n",
    "        p = freq / total_count\n",
    "        entropy -= p * math.log2(p)\n",
    "    return entropy"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "single_char_entropy = calculate_entropy(single_char_freq)\n",
    "bigram_entropy = calculate_entropy(bigram_freq)\n",
    "\n",
    "print(f\"Энтропия на одну букву: {single_char_entropy}\")\n",
    "print(f\"Энтропия на двухбуквенное сочетание: {bigram_entropy}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Этап 5: Длина кода и избыточность"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "alphabet_size = len(single_char_freq)\n",
    "# log2(0) is undefined, so an empty text is reported as needing 0 bits per symbol.\n",
    "uniform_code_length = math.log2(alphabet_size) if alphabet_size else 0.0\n",
    "# Redundancy here is the absolute gap between the uniform code length and the entropy.\n",
    "redundancy = uniform_code_length - single_char_entropy\n",
    "\n",
    "print(f\"Длина кода при равномерном кодировании: {uniform_code_length}\")\n",
    "print(f\"Избыточность: {redundancy}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Этап 6: Удаление 20% наиболее частых символов"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Share of the alphabet (distinct characters) removed in stages 6 and 7.\n",
    "# Keep in sync with the 20% quoted in the printed messages.\n",
    "REMOVAL_FRACTION = 0.2\n",
    "\n",
    "\n",
    "def entropy_without_chars(text, chars_to_remove):\n",
    "    \"\"\"Return the per-character entropy of text after deleting chars_to_remove.\n",
    "\n",
    "    text: the string to analyse.\n",
    "    chars_to_remove: iterable of characters whose every occurrence is dropped.\n",
    "    \"\"\"\n",
    "    removed = set(chars_to_remove)  # set lookup instead of scanning a list per character\n",
    "    remaining_text = ''.join(char for char in text if char not in removed)\n",
    "    remaining_freq, _ = calculate_frequencies(remaining_text)\n",
    "    return calculate_entropy(remaining_freq, len(remaining_text))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "num_to_remove = int(len(single_char_freq) * REMOVAL_FRACTION)\n",
    "most_common = [char for char, _ in single_char_freq.most_common(num_to_remove)]\n",
    "entropy_without_frequent = entropy_without_chars(text, most_common)\n",
    "\n",
    "print(f\"Энтропия после удаления 20% самых частых символов: {entropy_without_frequent}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Этап 7: Удаление 20% наиболее редких символов"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# most_common() sorts by descending count, so the rarest characters sit at the tail.\n",
    "ranked_chars = single_char_freq.most_common()\n",
    "least_common = [char for char, _ in ranked_chars[::-1][:num_to_remove]]\n",
    "entropy_without_rare = entropy_without_chars(text, least_common)\n",
    "\n",
    "print(f\"Энтропия после удаления 20% самых редких символов: {entropy_without_rare}\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}