Spaces:

ysakhale
/

python-dependency-compatibility-board

Sleeping

python-dependency-compatibility-board / Synthetic data.py

Yash Sakhale

Initial deployment of Python Dependency Compatibility Board

1d39202 about 1 month ago

6.3 kB

	import json
	import random
	from pathlib import Path

	# Seed the module-level RNG once at import so repeated runs of this script
	# draw the same sequence and therefore emit the same dataset.
	random.seed(0)

	# Catalog mapping each package name to the version strings a synthetic
	# environment may draw from. Insertion order matters: PKG_NAMES below is
	# derived from it and is what gets sampled.
	PKG_VERSIONS = {
	    "numpy": ["1.21.0", "1.22.0", "1.23.5"],
	    "pandas": ["1.3.5", "1.4.4", "2.0.3"],
	    "scipy": ["1.7.3", "1.8.1", "1.10.0"],
	    "scikit-learn": ["0.24.2", "1.0.2", "1.2.2"],
	    "torch": ["1.8.0", "1.13.1", "2.1.0"],
	    "torchvision": ["0.9.0", "0.14.1", "0.16.0"],
	    "torchaudio": ["0.8.0", "0.13.1", "2.1.0"],
	    "pytorch-lightning": ["1.5.0", "2.0.0", "2.2.0"],
	    "tensorflow": ["1.15.0", "2.9.0", "2.15.0"],
	    "keras": ["2.4.0", "2.9.0", "3.0.0"],
	    "jax": ["0.3.25", "0.4.13"],
	    "flax": ["0.5.1", "0.7.2"],
	    "fastapi": ["0.78.0", "0.99.0"],
	    "uvicorn[standard]": ["0.17.6", "0.23.2"],
	    "starlette": ["0.19.1", "0.27.0"],
	    "pydantic": ["1.10.13", "2.3.0"],
	    "sqlalchemy": ["1.4.46", "2.0.20"],
	    "alembic": ["1.7.7", "1.12.0"],
	    "psycopg2-binary": ["2.9.3"],
	    "requests": ["2.27.1", "2.31.0"],
	    "httpx": ["0.23.0", "0.25.1"],
	    "beautifulsoup4": ["4.10.0", "4.12.2"],
	    "scrapy": ["2.5.1", "2.9.0"],
	    "opencv-python": ["4.5.5.64", "4.8.0.76"],
	    "pillow": ["9.0.1", "10.0.0"],
	    "matplotlib": ["3.5.1", "3.7.2"],
	    "seaborn": ["0.11.2", "0.13.0"],
	    "plotly": ["5.6.0", "5.17.0"],
	    "langchain": ["0.0.350", "0.1.0"],
	    "openai": ["0.28.0", "1.6.0"],
	    "tiktoken": ["0.5.1"],
	    "chromadb": ["0.4.8", "0.4.23"],
	    "weaviate-client": ["3.21.0"],
	    "redis": ["4.3.4", "5.0.1"],
	    "celery": ["5.2.7", "5.3.4"],
	    "gunicorn": ["20.1.0"],
	    "uvloop": ["0.17.0"],
	}

	# Package names in catalog order; iterating a dict yields its keys.
	PKG_NAMES = list(PKG_VERSIONS)


	# Pairwise synthetic incompatibility rules, evaluated in this order. Each
	# entry is (pkg_a, bad_prefix_a, pin_a, pkg_b, bad_prefix_b, pin_b, reason):
	# the pair conflicts when both sampled versions start with their "bad"
	# prefix, and an invalid sample is rewritten to the canonical pins that the
	# reason text names.
	_PAIR_RULES = (
	    (
	        "torch", "1.", "1.8.0",
	        "pytorch-lightning", "2.", "2.2.0",
	        "pytorch-lightning>=2.0 is assumed to require torch>=2.0 but torch==1.8.0 is pinned.",
	    ),
	    (
	        "tensorflow", "1.", "1.15.0",
	        "keras", "3.", "3.0.0",
	        "keras==3.0.0 is assumed to require TensorFlow 2.x but tensorflow==1.15.0 is pinned.",
	    ),
	    (
	        "fastapi", "0.78", "0.78.0",
	        "pydantic", "2.", "2.3.0",
	        "fastapi==0.78.0 is assumed to require pydantic v1, but pydantic==2.3.0 is pinned.",
	    ),
	)


	def _set_pin(req_lines, pkg, version):
	    """Rewrite the requirement line for exactly ``pkg`` as ``pkg==version`` in place."""
	    for idx, line in enumerate(req_lines):
	        # Compare the whole package name. A prefix test would also hit
	        # "torchvision" / "torchaudio" when the target is "torch".
	        if line.split("==", 1)[0] == pkg:
	            req_lines[idx] = f"{pkg}=={version}"


	def make_requirements(num_lines: int, force_conflict: bool = False):
	    """Create one synthetic requirements.txt-style environment.

	    ``num_lines`` distinct packages are sampled from ``PKG_NAMES``; each gets
	    a random catalog version and is written either as ``pkg==version`` or,
	    about one time in five, as a bare ``pkg``. The sample is then labelled
	    by the synthetic pair rules in ``_PAIR_RULES``.

	    Args:
	        num_lines: Number of distinct packages to sample.
	        force_conflict: When true, inject a conflict: the first pair rule
	            whose two packages are both present is forced, otherwise one
	            package is pinned twice to two different versions.

	    Returns:
	        A ``(requirements_text, label, conflict_reason)`` tuple. ``label`` is
	        ``"valid"`` or ``"invalid"``; ``conflict_reason`` is ``None`` for
	        valid samples. With ``force_conflict`` the label is still
	        ``"valid"`` when no honest conflict can be built (no pair rule
	        applies and every sampled package has a single catalog version, or
	        nothing was sampled).

	    Raises:
	        ValueError: Propagated from ``random.sample`` when ``num_lines`` is
	            negative or exceeds the number of catalog packages.
	    """
	    chosen = random.sample(PKG_NAMES, num_lines)
	    req_lines = []
	    pinned_versions = {}

	    # Draw a version for every package, even when the line is left unpinned,
	    # so the pair rules below can reason about the intended version.
	    for pkg in chosen:
	        ver = random.choice(PKG_VERSIONS[pkg])
	        pinned_versions[pkg] = ver
	        req_lines.append(pkg if random.random() < 0.2 else f"{pkg}=={ver}")

	    label = "valid"
	    conflict_reason = None

	    for pkg_a, bad_a, pin_a, pkg_b, bad_b, pin_b, reason in _PAIR_RULES:
	        if label != "valid":
	            break  # only the first matching rule labels the sample
	        if pkg_a not in pinned_versions or pkg_b not in pinned_versions:
	            continue
	        ver_a = pinned_versions[pkg_a]
	        ver_b = pinned_versions[pkg_b]
	        if force_conflict:
	            trigger = True
	        else:
	            # The coin is drawn whenever both packages are present, which
	            # keeps the random stream aligned with the rule order.
	            coin = random.random() < 0.5
	            conflicting = ver_a.startswith(bad_a) and ver_b.startswith(bad_b)
	            # If both offending versions are already written out as explicit
	            # pins, the environment violates the rule as-is and must not be
	            # labelled "valid" just because the coin missed.
	            explicit = (
	                f"{pkg_a}=={ver_a}" in req_lines
	                and f"{pkg_b}=={ver_b}" in req_lines
	            )
	            trigger = conflicting and (coin or explicit)
	        if trigger:
	            # Canonicalise to the exact pins quoted in the reason text.
	            _set_pin(req_lines, pkg_a, pin_a)
	            _set_pin(req_lines, pkg_b, pin_b)
	            label = "invalid"
	            conflict_reason = reason

	    # Generic fallback: pin one package twice to two different versions.
	    if label == "valid" and force_conflict:
	        # Take the first sampled package that actually has an alternative
	        # version; a single-version package cannot yield a real conflict.
	        pkg = next((p for p in chosen if len(PKG_VERSIONS[p]) > 1), None)
	        if pkg is not None:
	            existing_ver = pinned_versions[pkg]
	            alt_ver = random.choice(
	                [v for v in PKG_VERSIONS[pkg] if v != existing_ver]
	            )
	            # Make sure the first occurrence is an explicit pin; a bare
	            # "pkg" line next to "pkg==alt" would not be a double pin.
	            _set_pin(req_lines, pkg, existing_ver)
	            req_lines.append(f"{pkg}=={alt_ver}")
	            label = "invalid"
	            conflict_reason = f"{pkg} is pinned to multiple incompatible versions."

	    return "\n".join(req_lines), label, conflict_reason


	def generate_dataset(n_samples: int = 100):
	    """Build ``n_samples`` synthetic environments as a list of dict records.

	    Each record has the keys ``id`` (1-based position), ``requirements``
	    (the requirements text), ``label`` and ``conflict_reason``. Every
	    even-numbered id is generated with ``force_conflict=True``, so roughly
	    half of the records are forced towards a conflict.
	    """
	    dataset = []
	    for sample_id in range(1, n_samples + 1):
	        # Each environment lists between 4 and 10 packages (inclusive).
	        line_count = random.randint(4, 10)
	        text, verdict, why = make_requirements(
	            line_count, force_conflict=sample_id % 2 == 0
	        )
	        dataset.append(
	            dict(id=sample_id, requirements=text, label=verdict, conflict_reason=why)
	        )
	    return dataset


	if __name__ == "__main__":
	    # 120 rather than 100 leaves headroom over the "at least 100" target.
	    dataset = generate_dataset(n_samples=120)

	    # Whole dataset as one pretty-printed JSON file in the working directory.
	    json_file = Path("synthetic_requirements_dataset.json")
	    json_file.write_text(json.dumps(dataset, indent=2))
	    print(f"Wrote {len(dataset)} samples to {json_file.resolve()}")

	    # Additionally dump every environment as its own requirements file,
	    # named after the zero-padded sample id and its label.
	    txt_dir = Path("synthetic_requirements_txt")
	    txt_dir.mkdir(exist_ok=True)
	    for record in dataset:
	        target = txt_dir / f"requirements_{record['id']:03d}_{record['label']}.txt"
	        target.write_text(record["requirements"])