ilk commit
This commit is contained in:
154
src/DownloadManager.Core/Grabber/SiteGrabber.cs
Normal file
154
src/DownloadManager.Core/Grabber/SiteGrabber.cs
Normal file
@@ -0,0 +1,154 @@
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.IO;
|
||||
using System.Linq;
|
||||
using System.Net.Http;
|
||||
using System.Runtime.CompilerServices;
|
||||
using System.Text.RegularExpressions;
|
||||
using System.Threading;
|
||||
using System.Threading.Tasks;
|
||||
using HtmlAgilityPack;
|
||||
|
||||
namespace DownloadManager.Core.Grabber;
|
||||
|
||||
/// <summary>
/// Configuration for a <see cref="SiteGrabber"/> crawl.
/// </summary>
public class GrabberOptions
{
    /// <summary>Absolute URL the crawl starts from.</summary>
    public string StartUrl { get; set; } = string.Empty;

    /// <summary>Maximum number of link levels to follow below the start page.</summary>
    public int MaxDepth { get; set; } = 2;

    /// <summary>When true, only links on the start URL's host are crawled.</summary>
    public bool StayOnDomain { get; set; } = true;

    /// <summary>File extensions (without dot) to collect; empty means "any non-page extension".</summary>
    public string[] FileExtensions { get; set; } = Array.Empty<string>();

    /// <summary>Optional regex a candidate URL must match; null disables the filter.</summary>
    public string? UrlPattern { get; set; }

    /// <summary>Optional lower bound on file size, in bytes.</summary>
    public long? MinFileSizeBytes { get; set; }

    /// <summary>Optional upper bound on file size, in bytes.</summary>
    public long? MaxFileSizeBytes { get; set; }

    /// <summary>Hard cap on the number of results yielded per crawl.</summary>
    public int MaxFileCount { get; set; } = 500;
}
|
||||
|
||||
public record GrabberResult(string Url, string FileName, long? SizeBytes, string Extension);
|
||||
|
||||
/// <summary>
/// Breadth-first web crawler that discovers downloadable file links starting from
/// <see cref="GrabberOptions.StartUrl"/> and streams them to the caller as
/// <see cref="GrabberResult"/> items.
/// </summary>
public class SiteGrabber
{
    private const string UserAgent =
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0";

    // Extensions that mark a link as an HTML page to crawl, never a file to download.
    // Static so the set is not re-allocated on every candidate link.
    private static readonly HashSet<string> SkippedExtensions =
        new(StringComparer.OrdinalIgnoreCase) { "html", "htm", "php", "aspx", "jsp", "txt" };

    private readonly IHttpClientFactory _httpClientFactory;

    public SiteGrabber(IHttpClientFactory httpClientFactory)
    {
        // BUGFIX: the factory was injected but never used — every page fetch and size
        // probe built `new HttpClient(new HttpClientHandler())`, which exhausts sockets
        // under load. All clients now come from the factory (pooled handlers;
        // AllowAutoRedirect is true by default, matching the old handler setup).
        _httpClientFactory = httpClientFactory ?? throw new ArgumentNullException(nameof(httpClientFactory));
    }

    /// <summary>
    /// Crawls breadth-first from <see cref="GrabberOptions.StartUrl"/> up to
    /// <see cref="GrabberOptions.MaxDepth"/> levels deep, yielding file links as they
    /// are found. Stops after <see cref="GrabberOptions.MaxFileCount"/> results or
    /// when <paramref name="ct"/> is cancelled. Unreachable pages are skipped silently
    /// (best-effort crawl).
    /// </summary>
    /// <param name="opts">Crawl configuration; <c>StartUrl</c> must be an absolute URL.</param>
    /// <param name="ct">Cancels the crawl between requests.</param>
    public async IAsyncEnumerable<GrabberResult> GrabAsync(
        GrabberOptions opts,
        [EnumeratorCancellation] CancellationToken ct)
    {
        var visited = new HashSet<string>(StringComparer.OrdinalIgnoreCase);
        var queue = new Queue<(string Url, int Depth)>();
        queue.Enqueue((opts.StartUrl, 0));

        var baseUri = new Uri(opts.StartUrl);
        var found = 0;

        // One client for all page fetches; factory clients are cheap and need no
        // per-request handler configuration.
        var client = CreateClient(timeout: null);

        while (queue.Count > 0 && !ct.IsCancellationRequested && found < opts.MaxFileCount)
        {
            var (url, depth) = queue.Dequeue();
            if (!visited.Add(url)) continue;

            string html;
            try
            {
                html = await client.GetStringAsync(url, ct);
            }
            catch
            {
                // Best-effort: a page that fails to load is skipped, not fatal.
                continue;
            }

            var doc = new HtmlDocument();
            doc.LoadHtml(html);

            var nodes = doc.DocumentNode.SelectNodes("//a[@href]");
            if (nodes == null) continue;

            foreach (var node in nodes)
            {
                if (ct.IsCancellationRequested) break;

                var href = node.GetAttributeValue("href", "");
                if (string.IsNullOrEmpty(href)) continue;

                if (!Uri.TryCreate(baseUri, href, out var absoluteUri)) continue;
                var absoluteUrl = absoluteUri.AbsoluteUri;

                if (IsDownloadTarget(absoluteUri, opts))
                {
                    long? size = await GetFileSizeRobustAsync(absoluteUrl, ct);

                    // BUGFIX: Min/MaxFileSizeBytes were declared on GrabberOptions but
                    // never applied. Links with an unknown size (null) are still
                    // yielded — best-effort inclusion rather than silent loss.
                    if (size.HasValue &&
                        ((opts.MinFileSizeBytes.HasValue && size.Value < opts.MinFileSizeBytes.Value) ||
                         (opts.MaxFileSizeBytes.HasValue && size.Value > opts.MaxFileSizeBytes.Value)))
                    {
                        continue;
                    }

                    found++;
                    yield return new GrabberResult(
                        absoluteUrl,
                        Path.GetFileName(absoluteUri.LocalPath),
                        size,
                        // BUGFIX: extension comes from LocalPath, not the raw URL, so a
                        // query string ("file.zip?token=x") no longer corrupts it.
                        Path.GetExtension(absoluteUri.LocalPath).TrimStart('.').ToLowerInvariant()
                    );
                }
                // BUGFIX: StayOnDomain was declared but never consulted; the crawler
                // unconditionally stayed on the start host.
                else if (depth < opts.MaxDepth &&
                         (!opts.StayOnDomain || IsSameDomain(absoluteUri, baseUri.Host)))
                {
                    queue.Enqueue((absoluteUrl, depth + 1));
                }

                if (found >= opts.MaxFileCount) break;
            }
        }
    }

    /// <summary>Creates a factory-managed client with the crawler's User-Agent.</summary>
    private HttpClient CreateClient(TimeSpan? timeout)
    {
        var client = _httpClientFactory.CreateClient();
        client.DefaultRequestHeaders.Add("User-Agent", UserAgent);
        if (timeout.HasValue) client.Timeout = timeout.Value;
        return client;
    }

    /// <summary>
    /// Probes the remote file size: HEAD first, then a 1-byte ranged GET when HEAD is
    /// unavailable or returns an implausibly small value. Returns null when the size
    /// cannot be determined.
    /// </summary>
    private async Task<long?> GetFileSizeRobustAsync(string url, CancellationToken ct)
    {
        try
        {
            // Short timeout: size probing must not stall the crawl.
            var client = CreateClient(TimeSpan.FromSeconds(6));

            // 1. Try HEAD first.
            using var headReq = new HttpRequestMessage(HttpMethod.Head, url);
            using var headResp = await client.SendAsync(headReq, HttpCompletionOption.ResponseHeadersRead, ct);

            // Sizes above 500 bytes are trusted; tiny values are usually an
            // interstitial/redirect page rather than the real file.
            if (headResp.IsSuccessStatusCode && headResp.Content.Headers.ContentLength > 500)
            {
                return headResp.Content.Headers.ContentLength;
            }

            // 2. HEAD failed or looked bogus: ask for the first byte and read the
            //    total length from the Content-Range response header.
            using var getReq = new HttpRequestMessage(HttpMethod.Get, url);
            getReq.Headers.Range = new System.Net.Http.Headers.RangeHeaderValue(0, 0);
            using var getResp = await client.SendAsync(getReq, HttpCompletionOption.ResponseHeadersRead, ct);

            if (getResp.Content.Headers.ContentRange?.HasLength == true)
            {
                return getResp.Content.Headers.ContentRange.Length;
            }

            var fallback = getResp.Content.Headers.ContentLength;
            return fallback > 500 ? fallback : null;
        }
        catch
        {
            // Size probing is optional; any failure simply means "unknown size".
            return null;
        }
    }

    /// <summary>
    /// True when the link looks like a downloadable file: it has a non-page extension
    /// on its path, matches <see cref="GrabberOptions.UrlPattern"/> (when set), and is
    /// allowed by <see cref="GrabberOptions.FileExtensions"/>.
    /// </summary>
    private static bool IsDownloadTarget(Uri uri, GrabberOptions opts)
    {
        // Use LocalPath so query strings / fragments do not leak into the extension.
        var ext = Path.GetExtension(uri.LocalPath).TrimStart('.').ToLowerInvariant();
        if (string.IsNullOrEmpty(ext)) return false;
        if (SkippedExtensions.Contains(ext)) return false;

        // BUGFIX: UrlPattern was declared on GrabberOptions but never applied.
        if (!string.IsNullOrEmpty(opts.UrlPattern) && !Regex.IsMatch(uri.AbsoluteUri, opts.UrlPattern))
        {
            return false;
        }

        return opts.FileExtensions.Length == 0 ||
               opts.FileExtensions.Contains(ext, StringComparer.OrdinalIgnoreCase);
    }

    /// <summary>True when the URI's host equals <paramref name="host"/> (case-insensitive).</summary>
    private static bool IsSameDomain(Uri uri, string host)
        => uri.Host.Equals(host, StringComparison.OrdinalIgnoreCase);
}
|
||||
Reference in New Issue
Block a user