mirea-projects/Third term/Discrete math/1.ipynb

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "import math\n",
    "import string\n",
    "from collections import Counter"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Этап 1: Чтение текста из файла"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read the entire input file into memory as one UTF-8 decoded string.\n",
    "with open('some_text.txt', encoding='utf-8') as file:\n",
    "    text = file.read()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Этап 2: Привести все слова к нижнему регистру, удалить знаки препинания и пробелы"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Lowercase the text, then keep only letters and digits.\n",
    "# Filtering on str.isalnum() removes every whitespace character (spaces,\n",
    "# newlines, tabs, non-breaking spaces) and every punctuation mark, including\n",
    "# non-ASCII ones such as « » — … that string.punctuation does not contain,\n",
    "# so none of them is counted as an alphabet symbol later on.\n",
    "text = ''.join(char for char in text.lower() if char.isalnum())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save the normalised text to new_text.txt (UTF-8), replacing any existing file.\n",
    "with open('new_text.txt', mode='w', encoding='utf-8') as file:\n",
    "    file.write(text)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Этап 3: Подсчитать частоту появления однобуквенных и двухбуквенных сочетаний"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [],
   "source": [
    "def calculate_frequencies(text):\n",
    "    \"\"\"Count single characters and overlapping two-character sequences.\n",
    "\n",
    "    Args:\n",
    "        text: String to analyse.\n",
    "\n",
    "    Returns:\n",
    "        A ``(single_char_freq, bigram_freq)`` pair of ``Counter`` objects.\n",
    "        Bigrams overlap: ``'abc'`` yields ``'ab'`` and ``'bc'``.\n",
    "    \"\"\"\n",
    "    char_counts = Counter(text)\n",
    "    # Pair every character with its successor to form the overlapping bigrams.\n",
    "    pair_counts = Counter(first + second for first, second in zip(text, text[1:]))\n",
    "    return char_counts, pair_counts"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Count single characters and overlapping bigrams of the current `text`.\n",
    "single_char_freq, bigram_freq = calculate_frequencies(text)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Этап 4: Определение энтропии"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [],
   "source": [
    "def calculate_entropy(freq_dict, total_count=None):\n",
    "    \"\"\"Return the Shannon entropy, in bits, of an empirical distribution.\n",
    "\n",
    "    Args:\n",
    "        freq_dict: Mapping from symbol to its occurrence count.\n",
    "        total_count: Number of observations used to turn counts into\n",
    "            probabilities. Defaults to the sum of the counts.\n",
    "\n",
    "    Returns:\n",
    "        ``-sum(p * log2(p))`` over all symbols with a non-zero count;\n",
    "        0 when ``freq_dict`` is empty.\n",
    "\n",
    "    Raises:\n",
    "        ZeroDivisionError: If ``total_count`` is 0 while ``freq_dict``\n",
    "            holds a non-zero count.\n",
    "    \"\"\"\n",
    "    if total_count is None:\n",
    "        total_count = sum(freq_dict.values())\n",
    "    entropy = 0\n",
    "    for freq in freq_dict.values():\n",
    "        if freq == 0:\n",
    "            # p * log2(p) tends to 0 as p tends to 0, so an unseen symbol adds\n",
    "            # nothing; math.log2(0) itself would raise ValueError.\n",
    "            continue\n",
    "        p = freq / total_count\n",
    "        entropy -= p * math.log2(p)\n",
    "    return entropy"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Энтропия на одну букву: 3.9961505474490147\n",
      "Энтропия на двухбуквенное сочетание: 7.389736513198184\n"
     ]
    }
   ],
   "source": [
    "# Character probabilities are taken over len(text) symbols; bigram\n",
    "# probabilities over the len(text) - 1 overlapping pairs.\n",
    "single_char_entropy = calculate_entropy(single_char_freq, len(text))\n",
    "bigram_entropy = calculate_entropy(bigram_freq, len(text) - 1)\n",
    "\n",
    "print(\n",
    "    f\"Энтропия на одну букву: {single_char_entropy}\",\n",
    "    f\"Энтропия на двухбуквенное сочетание: {bigram_entropy}\",\n",
    "    sep=\"\\n\",\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Этап 5: Длина кода и избыточность"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Длина кода при равномерном кодировании: 4.459431618637297\n",
      "Избыточность: 0.46328107118828266\n"
     ]
    }
   ],
   "source": [
    "# The alphabet is the set of distinct characters seen in the text.\n",
    "alphabet_size = len(single_char_freq)\n",
    "# Code length is taken as log2 of the alphabet size, not rounded up to whole bits.\n",
    "uniform_code_length = math.log2(alphabet_size)\n",
    "# Redundancy is the absolute gap between that length and the measured entropy.\n",
    "redundancy = uniform_code_length - single_char_entropy\n",
    "\n",
    "print(\n",
    "    f\"Длина кода при равномерном кодировании: {uniform_code_length}\",\n",
    "    f\"Избыточность: {redundancy}\",\n",
    "    sep=\"\\n\",\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Этап 6: Удаление 20% наиболее частых символов"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Энтропия после удаления 20% самых частых символов: 3.7022435891926633\n"
     ]
    }
   ],
   "source": [
    "# Remove the most frequent 20% of distinct characters (count rounded down)\n",
    "# and measure the entropy of what remains.\n",
    "num_to_remove = int(len(single_char_freq) * 0.2)\n",
    "most_common = [char for char, _ in single_char_freq.most_common(num_to_remove)]\n",
    "# A single translate() pass deletes every listed character, instead of\n",
    "# scanning the list once per character of the text.\n",
    "text_without_frequent = text.translate(str.maketrans('', '', ''.join(most_common)))\n",
    "\n",
    "# Index the result instead of unpacking into `_`: a notebook-level `_`\n",
    "# shadows IPython's last-output variable.\n",
    "new_single_char_freq = calculate_frequencies(text_without_frequent)[0]\n",
    "new_single_char_entropy = calculate_entropy(new_single_char_freq, len(text_without_frequent))\n",
    "\n",
    "print(f\"Энтропия после удаления 20% самых частых символов: {new_single_char_entropy}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Этап 7: Удаление 20% наиболее редких символов"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Энтропия после удаления 20% самых редких символов: 3.9045244598725413\n"
     ]
    }
   ],
   "source": [
    "# Remove the rarest 20% of distinct characters (count rounded down)\n",
    "# and measure the entropy of what remains.\n",
    "num_to_remove = int(len(single_char_freq) * 0.2)\n",
    "# most_common() sorts by descending count; the reversed tail slice picks the\n",
    "# num_to_remove rarest entries.\n",
    "least_common = [char for char, _ in single_char_freq.most_common()[:-num_to_remove-1:-1]]\n",
    "# A single translate() pass deletes every listed character, instead of\n",
    "# scanning the list once per character of the text.\n",
    "text_without_rare = text.translate(str.maketrans('', '', ''.join(least_common)))\n",
    "\n",
    "# Index the result instead of unpacking into `_`: a notebook-level `_`\n",
    "# shadows IPython's last-output variable.\n",
    "new_single_char_freq = calculate_frequencies(text_without_rare)[0]\n",
    "new_single_char_entropy = calculate_entropy(new_single_char_freq, len(text_without_rare))\n",
    "\n",
    "print(f\"Энтропия после удаления 20% самых редких символов: {new_single_char_entropy}\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
first commit 2024-09-23 23:22:33 +00:00			`{`
			`"cells": [`
			`{`
			`"cell_type": "code",`
			`"execution_count": 25,`
			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`"import math\n",`
			`"import string\n",`
			`"from collections import Counter"`
			`]`
			`},`
			`{`
			`"cell_type": "markdown",`
			`"metadata": {},`
			`"source": [`
			`"Этап 1: Чтение текста из файла"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": 26,`
			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`"with open('some_text.txt', 'r', encoding='utf-8') as file:\n",`
			`" text = file.read()"`
			`]`
			`},`
			`{`
			`"cell_type": "markdown",`
			`"metadata": {},`
			`"source": [`
			`"Этап 2: Привести все слова к нижнему регистру, удалить знаки препинания и пробелы"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": 27,`
			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`"text = text.lower() # Приведение к нижнему регистру\n",`
			`"text = text.translate(str.maketrans('', '', string.punctuation + '\\n')) # Удаление знаков препинания\n",`
			`"text = text.replace(\" \", \"\") # Удаление пробелов"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": 28,`
			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`"with open('new_text.txt', 'w', encoding='utf-8') as file:\n",`
			`" file.write(text)"`
			`]`
			`},`
			`{`
			`"cell_type": "markdown",`
			`"metadata": {},`
			`"source": [`
			`"Этап 3: Подсчитать частоту появления однобуквенных и двухбуквенных сочетаний"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": 29,`
			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`"def calculate_frequencies(text):\n",`
			`" single_char_freq = Counter(text) # Частоты однобуквенных сочетаний\n",`
			`" bigrams = [text[i:i+2] for i in range(len(text)-1)] # Двухбуквенные сочетания\n",`
			`" bigram_freq = Counter(bigrams)\n",`
			`" return single_char_freq, bigram_freq"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": 30,`
			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`"single_char_freq, bigram_freq = calculate_frequencies(text)"`
			`]`
			`},`
			`{`
			`"cell_type": "markdown",`
			`"metadata": {},`
			`"source": [`
			`"Этап 4: Определение энтропии"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": 31,`
			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`"def calculate_entropy(freq_dict, total_count):\n",`
			`" entropy = 0\n",`
			`" for freq in freq_dict.values():\n",`
			`" p = freq / total_count\n",`
			`" entropy -= p * math.log2(p)\n",`
			`" return entropy"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": 32,`
			`"metadata": {},`
			`"outputs": [`
			`{`
			`"name": "stdout",`
			`"output_type": "stream",`
			`"text": [`
			`"Энтропия на одну букву: 3.9961505474490147\n",`
			`"Энтропия на двухбуквенное сочетание: 7.389736513198184\n"`
			`]`
			`}`
			`],`
			`"source": [`
			`"single_char_entropy = calculate_entropy(single_char_freq, len(text))\n",`
			`"bigram_entropy = calculate_entropy(bigram_freq, len(text)-1)\n",`
			`"\n",`
			`"print(f\"Энтропия на одну букву: {single_char_entropy}\")\n",`
			`"print(f\"Энтропия на двухбуквенное сочетание: {bigram_entropy}\")"`
			`]`
			`},`
			`{`
			`"cell_type": "markdown",`
			`"metadata": {},`
			`"source": [`
			`"Этап 5: Длина кода и избыточность"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": 33,`
			`"metadata": {},`
			`"outputs": [`
			`{`
			`"name": "stdout",`
			`"output_type": "stream",`
			`"text": [`
			`"Длина кода при равномерном кодировании: 4.459431618637297\n",`
			`"Избыточность: 0.46328107118828266\n"`
			`]`
			`}`
			`],`
			`"source": [`
			`"alphabet_size = len(single_char_freq)\n",`
			`"uniform_code_length = math.log2(alphabet_size)\n",`
			`"redundancy = uniform_code_length - single_char_entropy\n",`
			`"\n",`
			`"print(f\"Длина кода при равномерном кодировании: {uniform_code_length}\")\n",`
			`"print(f\"Избыточность: {redundancy}\")"`
			`]`
			`},`
			`{`
			`"cell_type": "markdown",`
			`"metadata": {},`
			`"source": [`
			`"Этап 6: Удаление 20% наиболее частых символов"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": 34,`
			`"metadata": {},`
			`"outputs": [`
			`{`
			`"name": "stdout",`
			`"output_type": "stream",`
			`"text": [`
			`"Энтропия после удаления 20% самых частых символов: 3.7022435891926633\n"`
			`]`
			`}`
			`],`
			`"source": [`
			`"num_to_remove = int(len(single_char_freq) * 0.2)\n",`
			`"most_common = [char for char, _ in single_char_freq.most_common(num_to_remove)]\n",`
			`"text_without_frequent = ''.join([char for char in text if char not in most_common])\n",`
			`"\n",`
			`"new_single_char_freq, _ = calculate_frequencies(text_without_frequent)\n",`
			`"new_single_char_entropy = calculate_entropy(new_single_char_freq, len(text_without_frequent))\n",`
			`"\n",`
			`"print(f\"Энтропия после удаления 20% самых частых символов: {new_single_char_entropy}\")"`
			`]`
			`},`
			`{`
			`"cell_type": "markdown",`
			`"metadata": {},`
			`"source": [`
			`"Этап 7: Удаление 20% наиболее редких символов"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": 35,`
			`"metadata": {},`
			`"outputs": [`
			`{`
			`"name": "stdout",`
			`"output_type": "stream",`
			`"text": [`
			`"Энтропия после удаления 20% самых редких символов: 3.9045244598725413\n"`
			`]`
			`}`
			`],`
			`"source": [`
			`"num_to_remove = int(len(single_char_freq) * 0.2)\n",`
			`"least_common = [char for char, _ in single_char_freq.most_common()[:-num_to_remove-1:-1]]\n",`
			`"text_without_rare = ''.join([char for char in text if char not in least_common])\n",`
			`"\n",`
			`"new_single_char_freq, _ = calculate_frequencies(text_without_rare)\n",`
			`"new_single_char_entropy = calculate_entropy(new_single_char_freq, len(text_without_rare))\n",`
			`"\n",`
			`"print(f\"Энтропия после удаления 20% самых редких символов: {new_single_char_entropy}\")"`
			`]`
			`}`
			`],`
			`"metadata": {`
			`"kernelspec": {`
			`"display_name": "Python 3",`
			`"language": "python",`
			`"name": "python3"`
			`},`
			`"language_info": {`
			`"codemirror_mode": {`
			`"name": "ipython",`
			`"version": 3`
			`},`
			`"file_extension": ".py",`
			`"mimetype": "text/x-python",`
			`"name": "python",`
			`"nbconvert_exporter": "python",`
			`"pygments_lexer": "ipython3",`
			`"version": "3.11.0"`
			`}`
			`},`
			`"nbformat": 4,`
			`"nbformat_minor": 2`
			`}`