mirea-projects/Third term/Discrete math/1.ipynb

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "import math\n",
    "import string\n",
    "import unicodedata\n",
    "from collections import Counter"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Этап 1: Чтение текста из файла"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('some_text.txt', 'r', encoding='utf-8') as file:\n",
    "    text = file.read()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Этап 2: Привести все слова к нижнему регистру, удалить знаки препинания и пробелы"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Lowercase first so upper- and lower-case forms count as the same symbol.\n",
    "text = text.lower()\n",
    "\n",
    "# string.punctuation lists ASCII marks only, so on its own it leaves Unicode\n",
    "# punctuation (guillemets, dashes, ellipsis) in the text; likewise tabs, carriage\n",
    "# returns and non-breaking spaces survive a plain space/newline strip. Anything\n",
    "# left behind would be counted as an alphabet symbol by the later stages.\n",
    "ascii_punctuation = frozenset(string.punctuation)\n",
    "text = ''.join(\n",
    "    ch for ch in text\n",
    "    if ch not in ascii_punctuation  # ASCII marks and symbols, as before\n",
    "    and not ch.isspace()  # every whitespace character, not just ' ' and newline\n",
    "    and not unicodedata.category(ch).startswith('P')  # Unicode punctuation\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('new_text.txt', 'w', encoding='utf-8') as file:\n",
    "    file.write(text)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Этап 3: Подсчитать частоту появления однобуквенных и двухбуквенных сочетаний"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [],
   "source": [
    "def calculate_frequencies(text):\n",
    "    \"\"\"Count every symbol and every overlapping two-symbol slice of ``text``.\n",
    "\n",
    "    Args:\n",
    "        text: The sequence to analyse; the notebook passes the cleaned string.\n",
    "\n",
    "    Returns:\n",
    "        A ``(symbol_counts, pair_counts)`` tuple of ``Counter`` objects.\n",
    "    \"\"\"\n",
    "    # Every position except the last one starts a two-symbol slice.\n",
    "    last_pair_start = len(text) - 1\n",
    "    pair_counts = Counter(text[pos:pos + 2] for pos in range(last_pair_start))\n",
    "    symbol_counts = Counter(text)\n",
    "    return symbol_counts, pair_counts"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build both frequency tables from the cleaned text in a single call.\n",
    "single_char_freq, bigram_freq = calculate_frequencies(text=text)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Этап 4: Определение энтропии"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [],
   "source": [
    "def calculate_entropy(freq_dict, total_count=None):\n",
    "    \"\"\"Return the Shannon entropy, in bits, of an empirical distribution.\n",
    "\n",
    "    Computes ``H = -sum(p * log2(p))`` with ``p = count / total_count``.\n",
    "\n",
    "    Args:\n",
    "        freq_dict: Mapping from symbol to its number of occurrences\n",
    "            (for example a ``Counter``).\n",
    "        total_count: Denominator used to turn counts into probabilities.\n",
    "            When omitted, the sum of all counts in ``freq_dict`` is used.\n",
    "\n",
    "    Returns:\n",
    "        The entropy as a float; ``0.0`` when there are no non-zero counts.\n",
    "\n",
    "    Raises:\n",
    "        ZeroDivisionError: If ``total_count`` is 0 while a count is non-zero.\n",
    "    \"\"\"\n",
    "    if total_count is None:\n",
    "        total_count = sum(freq_dict.values())\n",
    "    entropy = 0.0\n",
    "    for freq in freq_dict.values():\n",
    "        if freq == 0:\n",
    "            # A symbol that never occurs contributes nothing (p * log2(p) -> 0\n",
    "            # as p -> 0); math.log2(0) would raise ValueError instead.\n",
    "            continue\n",
    "        p = freq / total_count\n",
    "        entropy -= p * math.log2(p)\n",
    "    return entropy"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Энтропия на одну букву: 3.9961505474490147\n",
      "Энтропия на двухбуквенное сочетание: 7.389736513198184\n"
     ]
    }
   ],
   "source": [
    "# There are len(text) single symbols but only len(text) - 1 overlapping pairs.\n",
    "single_char_entropy = calculate_entropy(freq_dict=single_char_freq, total_count=len(text))\n",
    "bigram_entropy = calculate_entropy(freq_dict=bigram_freq, total_count=len(text) - 1)\n",
    "\n",
    "print(f\"Энтропия на одну букву: {single_char_entropy}\")\n",
    "print(f\"Энтропия на двухбуквенное сочетание: {bigram_entropy}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Этап 5: Длина кода и избыточность"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Длина кода при равномерном кодировании: 4.459431618637297\n",
      "Избыточность: 0.46328107118828266\n"
     ]
    }
   ],
   "source": [
    "# The alphabet is the set of distinct symbols that actually occur in the text.\n",
    "alphabet_size = len(single_char_freq.keys())\n",
    "# Bits per symbol if every symbol were equally likely. The value is not rounded\n",
    "# up to a whole number of bits, so it may be fractional.\n",
    "uniform_code_length = math.log2(alphabet_size)\n",
    "# Reported as the absolute gap between that bound and the measured entropy.\n",
    "redundancy = uniform_code_length - single_char_entropy\n",
    "\n",
    "print(f\"Длина кода при равномерном кодировании: {uniform_code_length}\")\n",
    "print(f\"Избыточность: {redundancy}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Этап 6: Удаление 20% наиболее частых символов"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Энтропия после удаления 20% самых частых символов: 3.7022435891926633\n"
     ]
    }
   ],
   "source": [
    "# 20% refers to the number of distinct symbols, not to the length of the text.\n",
    "num_to_remove = int(len(single_char_freq) * 0.2)\n",
    "most_common = [entry[0] for entry in single_char_freq.most_common(num_to_remove)]\n",
    "# Keep only the symbols that are not among the most frequent ones.\n",
    "text_without_frequent = ''.join(filter(lambda symbol: symbol not in most_common, text))\n",
    "\n",
    "# Only the single-symbol table is needed here; the pair table is discarded.\n",
    "new_single_char_freq, _ = calculate_frequencies(text_without_frequent)\n",
    "new_single_char_entropy = calculate_entropy(new_single_char_freq, len(text_without_frequent))\n",
    "\n",
    "print(f\"Энтропия после удаления 20% самых частых символов: {new_single_char_entropy}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Этап 7: Удаление 20% наиболее редких символов"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Энтропия после удаления 20% самых редких символов: 3.9045244598725413\n"
     ]
    }
   ],
   "source": [
    "# 20% refers to the number of distinct symbols, not to the length of the text.\n",
    "num_to_remove = int(len(single_char_freq) * 0.2)\n",
    "# most_common() sorts from most to least frequent, so walk it backwards and\n",
    "# take the first num_to_remove symbols to get the rarest ones.\n",
    "least_common = [entry[0] for entry in reversed(single_char_freq.most_common())][:num_to_remove]\n",
    "text_without_rare = ''.join(filter(lambda symbol: symbol not in least_common, text))\n",
    "\n",
    "# Only the single-symbol table is needed here; the pair table is discarded.\n",
    "new_single_char_freq, _ = calculate_frequencies(text_without_rare)\n",
    "new_single_char_entropy = calculate_entropy(new_single_char_freq, len(text_without_rare))\n",
    "\n",
    "print(f\"Энтропия после удаления 20% самых редких символов: {new_single_char_entropy}\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}