forked from menault/TD2_DEV51_Qualite_Algo
242 lines
8.4 KiB
Python
242 lines
8.4 KiB
Python
#!/usr/bin/env python3
|
|
# -*- coding: utf-8 -*-
|
|
"""
|
|
Simulation du processus de création de liens entre pages web sous l'influence de moteurs de recherche,
|
|
conformément à l'énoncé "Modélisations mathématiques — BUT3 Info Fontainebleau".
|
|
|
|
Fonctionnalités :
|
|
- Initialisation d'un graphe orienté de n pages avec pertinence p(x) dans [0,1]
|
|
- Scores de recherche au choix : "indegree_plus_one" (degré entrant + 1, normalisé) ou "pagerank"
|
|
- À chaque itération : on choisit x au hasard, puis y selon la distribution des scores s(y),
|
|
puis on ajoute le lien x->y avec probabilité p(y)
|
|
- Politiques : interdiction des boucles x->x, politique de remplacement optionnelle
|
|
- Arrêt : quand chaque page a au moins ceil(ln n) liens sortants OU stagnation
|
|
- Visualisation : graphe final (taille des nœuds = score), historique des nouveaux liens
|
|
- Export CSV et PNG
|
|
"""
|
|
|
|
import numpy as np
|
|
import random
|
|
import math
|
|
import pandas as pd
|
|
import networkx as nx
|
|
import matplotlib.pyplot as plt
|
|
from dataclasses import dataclass
|
|
from typing import Optional, Tuple, Dict, List
|
|
|
|
# ----------------------------
|
|
# Configuration de simulation
|
|
# ----------------------------
|
|
|
|
@dataclass
class WebSimulationConfig:
    """Tunable parameters of a web-link simulation run.

    Field order is part of the interface (positional construction), so new
    fields must only ever be appended.
    """

    # --- Population and reproducibility ---
    n: int = 30                      # number of pages (graph nodes)
    seed: Optional[int] = 42         # RNG seed; None leaves the RNGs unseeded

    # --- Scoring and relevance ---
    score_mode: str = "indegree_plus_one"  # "indegree_plus_one" | "pagerank"
    use_realistic_p: bool = False    # True: relevance drawn from a Beta law, else uniform
    beta_params: Tuple[float, float] = (2.0, 5.0)  # (a, b) of the Beta law

    # --- Stopping criteria ---
    max_iters: int = 20_000          # hard cap on the number of iterations
    min_out_links: Optional[int] = None  # None -> ceil(ln n), resolved by the simulation
    stagnation_patience: int = 500   # consecutive no-change iterations tolerated

    # --- Link policies ---
    allow_self_loops: bool = False
    replacement_policy: str = "none"  # "none" | "replace_worst_if_better"

    # --- PageRank solver settings (used when score_mode == "pagerank") ---
    pagerank_damping: float = 0.85
    pagerank_tol: float = 1e-8
    pagerank_max_iter: int = 100
|
|
|
|
# ----------------------------
|
|
# Simulation
|
|
# ----------------------------
|
|
|
|
class WebSimulation:
    """Simulate the creation of links between pages driven by search scores.

    Attributes:
        cfg: The configuration object passed to the constructor.
        G: Directed graph (``nx.DiGraph``) whose nodes are ``0 .. n-1``.
        p: Mapping node -> relevance, drawn once at construction.
        scores: Mapping node -> current search score.
        min_out_links: Out-degree every node must reach for the run to stop;
            ``ceil(ln n)`` when the configuration leaves it as ``None``.
    """

    def __init__(self, config: "WebSimulationConfig"):
        """Build the empty graph and draw the relevance of every page.

        Raises:
            ValueError: If ``config.n`` is smaller than 1.
        """
        self.cfg = config
        if self.cfg.n < 1:
            raise ValueError("n must be at least 1")

        seed = self.cfg.seed
        if seed is None:
            # Unseeded runs keep drawing from the shared module-level generators.
            self._rng = random
            self._np_rng = np.random
        else:
            # Instance-local generators: same streams as seeding the global
            # ones, but the run stays reproducible whatever other code does
            # with the global RNGs, and constructing a simulation no longer
            # reseeds them behind the caller's back.
            self._rng = random.Random(seed)
            self._np_rng = np.random.RandomState(seed)

        self.G = nx.DiGraph()
        self.G.add_nodes_from(range(self.cfg.n))

        if self.cfg.use_realistic_p:
            a, b = self.cfg.beta_params
            p_vals = self._np_rng.beta(a, b, size=self.cfg.n)
        else:
            p_vals = self._np_rng.rand(self.cfg.n)
        self.p = {i: float(p_vals[i]) for i in range(self.cfg.n)}

        # Before any link exists every page gets the same score.
        self.scores = {i: 1.0 / self.cfg.n for i in range(self.cfg.n)}

        if self.cfg.min_out_links is None:
            self.min_out_links = math.ceil(math.log(self.cfg.n))
        else:
            self.min_out_links = self.cfg.min_out_links

    def _compute_scores_indegree_plus_one(self) -> Dict[int, float]:
        """Return (in-degree + 1) of every node, normalised to sum to 1."""
        raw = np.array(
            [self.G.in_degree(i) + 1 for i in range(self.cfg.n)], dtype=float
        )
        total = raw.sum()
        if total <= 0:
            return {i: 1.0 / self.cfg.n for i in range(self.cfg.n)}
        return {i: float(raw[i] / total) for i in range(self.cfg.n)}

    def _compute_scores_pagerank(self) -> Dict[int, float]:
        """Return PageRank scores, or uniform scores if the solver fails to converge."""
        try:
            pr = nx.pagerank(
                self.G,
                alpha=self.cfg.pagerank_damping,
                tol=self.cfg.pagerank_tol,
                max_iter=self.cfg.pagerank_max_iter,
            )
        except nx.PowerIterationFailedConvergence:
            pr = {i: 1.0 / self.cfg.n for i in range(self.cfg.n)}
        return pr

    def recompute_scores(self) -> None:
        """Refresh ``self.scores`` according to ``cfg.score_mode``.

        Raises:
            ValueError: If ``cfg.score_mode`` is not a known mode.
        """
        if self.cfg.score_mode == "indegree_plus_one":
            self.scores = self._compute_scores_indegree_plus_one()
        elif self.cfg.score_mode == "pagerank":
            self.scores = self._compute_scores_pagerank()
        else:
            raise ValueError("Unknown score_mode")

    def _choose_y_weighted_by_scores(self) -> int:
        """Draw a target node with probability proportional to its score."""
        n = self.cfg.n
        weights = np.array([self.scores[i] for i in range(n)], dtype=float)
        weights = weights / weights.sum()
        return int(self._np_rng.choice(n, p=weights))

    def _maybe_add_edge(self, x: int, y: int) -> bool:
        """Try to create the link x -> y; return True if the graph changed.

        The link is accepted with probability ``p[y]``. Under the
        "replace_worst_if_better" policy, a node that already has
        ``min_out_links`` successors swaps its least relevant successor for
        ``y`` when ``y`` is strictly more relevant.

        Raises:
            ValueError: If ``cfg.replacement_policy`` is not a known policy
                (only detected once a candidate link has been accepted).
        """
        if (not self.cfg.allow_self_loops) and (x == y):
            return False

        # Strict comparison: random() lies in [0, 1), so a page with p == 0
        # is never accepted and a page with p == 1 always is.
        if not (self._rng.random() < self.p[y]):
            return False

        policy = self.cfg.replacement_policy

        if policy == "none":
            if self.G.has_edge(x, y):
                return False
            self.G.add_edge(x, y, weight=1.0)
            return True

        if policy == "replace_worst_if_better":
            if self.G.has_edge(x, y):
                return False
            out_neighbors = list(self.G.successors(x))
            # Below the quota (or with no successor at all): simply add.
            if len(out_neighbors) < self.min_out_links or not out_neighbors:
                self.G.add_edge(x, y, weight=1.0)
                return True
            worst = min(out_neighbors, key=lambda t: self.p[t])
            if self.p[y] > self.p[worst]:
                self.G.remove_edge(x, worst)
                self.G.add_edge(x, y, weight=1.0)
                return True
            return False

        raise ValueError("Unknown replacement_policy")

    def _meets_stopping_condition(self, stagnation_steps: int) -> bool:
        """Return True when every node has enough out-links or the run stagnates."""
        if all(
            self.G.out_degree(i) >= self.min_out_links for i in range(self.cfg.n)
        ):
            return True
        return stagnation_steps >= self.cfg.stagnation_patience

    def run(self) -> "Tuple[nx.DiGraph, Dict[int, float], Dict[int, float], List[int], int]":
        """Run the simulation until a stopping condition or ``cfg.max_iters``.

        Returns:
            ``(G, p, scores, history_new_edges, iters)`` where
            ``history_new_edges[k]`` is 1 if iteration ``k`` changed the graph
            and 0 otherwise, and ``iters`` is the number of iterations done.
        """
        history_new_edges = []
        stagnation = 0
        iters = 0
        while iters < self.cfg.max_iters:
            x = self._rng.randrange(self.cfg.n)
            y = self._choose_y_weighted_by_scores()
            changed = self._maybe_add_edge(x, y)
            history_new_edges.append(1 if changed else 0)
            if changed:
                stagnation = 0
                # Scores depend on the graph, so refresh them after each change.
                self.recompute_scores()
            else:
                stagnation += 1
            iters += 1
            if self._meets_stopping_condition(stagnation):
                break
        self.recompute_scores()
        return self.G, self.p, self.scores, history_new_edges, iters
|
|
|
|
# ----------------------------
|
|
# Outils d'analyse & tracés
|
|
# ----------------------------
|
|
|
|
def summarize(G, p, scores) -> pd.DataFrame:
    """Build a per-node summary table, best-scored pages first.

    Args:
        G: Graph exposing ``nodes()``, ``in_degree(node)`` and ``out_degree(node)``.
        p: Mapping node -> relevance.
        scores: Mapping node -> score.

    Returns:
        A DataFrame with columns ``node``, ``pertinence_p``, ``score``,
        ``in_degree`` and ``out_degree``, sorted by descending score then
        descending relevance, with a fresh 0-based index.
    """
    records = [
        {
            "node": node,
            "pertinence_p": p[node],
            "score": scores[node],
            "in_degree": G.in_degree(node),
            "out_degree": G.out_degree(node),
        }
        for node in G.nodes()
    ]
    table = pd.DataFrame(records)
    ranked = table.sort_values(["score", "pertinence_p"], ascending=[False, False])
    return ranked.reset_index(drop=True)
|
|
|
|
def _finish_figure(fig, save_path, show):
    """Save and/or display *fig*; close it when it is not displayed."""
    # Save BEFORE showing: once show() returns, an interactive backend may
    # already have destroyed the figure, and a later savefig would write a
    # blank image.
    if save_path is not None:
        fig.savefig(save_path, dpi=150, bbox_inches="tight")
    if show:
        plt.show()
    else:
        plt.close(fig)


def plot_expected_graph(G, scores, title="Graphe attendu (taille = score)",
                        save_path=None, show=True):
    """Draw the graph on a circle, node size growing with the node's score.

    Args:
        G: Graph to draw (passed to the networkx drawing helpers).
        scores: Mapping node -> score, used for node sizes.
        title: Figure title.
        save_path: If given, the figure is written to this path (150 dpi).
        show: If True (default) the figure is displayed with ``plt.show()``;
            if False it is closed after the optional save.
    """
    # Strict circular layout.
    pos = nx.circular_layout(G)

    # Node size proportional to the score relative to the best score; the
    # +0.05 offset keeps the lowest-scored nodes visible.
    score_vals = np.array([scores[i] for i in G.nodes()], dtype=float)
    peak = score_vals.max() if score_vals.size else 0.0
    # Guard against an empty graph or all-zero scores (division by zero).
    relative = score_vals / peak if peak > 0 else np.zeros_like(score_vals)
    node_sizes = 3000 * (relative + 0.05)

    fig = plt.figure(figsize=(7, 7))
    nx.draw_networkx_nodes(G, pos, node_size=node_sizes, node_color="dodgerblue")
    nx.draw_networkx_edges(
        G, pos, arrows=True, arrowstyle="-|>", arrowsize=8,
        edge_color="black", width=0.8
    )
    plt.axis("off")
    plt.title(title)
    plt.tight_layout()
    _finish_figure(fig, save_path, show)


def plot_history(history, save_path=None, show=True):
    """Plot the per-iteration "new link created" indicator.

    Args:
        history: Sequence of 0/1 values, one per iteration.
        save_path: If given, the figure is written to this path (150 dpi).
        show: If True (default) the figure is displayed with ``plt.show()``;
            if False it is closed after the optional save.
    """
    fig = plt.figure(figsize=(10, 3))
    plt.plot(history)
    plt.title("Nouveaux liens (1) vs itérations")
    plt.xlabel("Itération")
    plt.ylabel("Nouveau lien ?")
    plt.tight_layout()
    _finish_figure(fig, save_path, show)


# ----------------------------
# Main
# ----------------------------

def main():
    """Run a demo simulation, display the figures and export CSV/PNG files."""
    cfg = WebSimulationConfig(
        n=30,
        seed=123,
        score_mode="indegree_plus_one",  # or "pagerank"
        use_realistic_p=True,
        beta_params=(2.5, 4.0),
        max_iters=10000,
        min_out_links=None,  # -> ceil(ln n)
        stagnation_patience=1000,
        allow_self_loops=False,
        replacement_policy="replace_worst_if_better",
    )
    sim = WebSimulation(cfg)
    G, p, scores, history, iters = sim.run()
    print(f"Itérations effectuées: {iters}")
    df = summarize(G, p, scores)
    print(df.head(10))

    # Figures (interactive display)
    plot_expected_graph(G, scores)
    plot_history(history)

    # Exports: each plot function saves its own figure before closing it, so
    # the PNG files contain the drawing rather than an empty canvas.
    df.to_csv("resume_pages.csv", index=False)
    plot_expected_graph(G, scores, title="Graphe final exporté",
                        save_path="graphe_final.png", show=False)
    plot_history(history, save_path="historique_liens.png", show=False)
|
|
|
|
# Script entry point: run the demo only when executed directly, not on import.
if __name__ == "__main__":
    main()
|