99# See https://aboutcode.org for more information about nexB OSS projects.
1010#
1111
12+ import os
13+ from urllib .parse import urlparse , urlunparse
14+
1215from typing import Dict
1316from typing import List
1417from typing import Optional
2730
2831
2932async def get_pypi_data_from_purl (
30- purl : str , environment : Environment , repos : List [PypiSimpleRepository ], prefer_source : bool
33+ purl : str ,
34+ environment : Environment ,
35+ repos : List [PypiSimpleRepository ],
36+ prefer_source : bool ,
37+ index_urls : List [str ],
3138) -> Optional [PackageData ]:
3239 """
3340 Generate `Package` object from the `purl` string of pypi type
@@ -43,7 +50,22 @@ async def get_pypi_data_from_purl(
4350 version = parsed_purl .version
4451 if not version :
4552 raise Exception ("Version is not specified in the purl" )
46- base_path = "https://pypi.org/pypi"
53+
54+ # Todo: address the case where several index URLs are passed
55+ if index_urls :
56+ # Backward compatibility: If pypi.org is passed as index url, always resolve against it.
57+ # When multiple index URLs are supported and the todo above is fixed, then this hack can be removed.
58+ if "https://pypi.org/simple" in index_urls :
59+ index_url = None
60+ else :
61+ index_url = index_urls [0 ]
62+ else :
63+ index_url = None
64+
65+ base_path = (
66+ index_url .removesuffix ("/simple" ) + "/pypi" if index_url else "https://pypi.org/pypi"
67+ )
68+
4769 api_url = f"{ base_path } /{ name } /{ version } /json"
4870
4971 from python_inspector .utils import get_response_async
@@ -62,10 +84,32 @@ async def get_pypi_data_from_purl(
6284 sdist_url = await get_sdist_download_url (
6385 purl = parsed_purl , repos = repos , python_version = python_version
6486 )
87+
def canonicalize_url(url: str) -> str:
    """
    Return `url` with "." and ".." segments of its path component resolved.

    Only the path is rewritten; scheme, netloc, params, query and fragment
    are passed through unchanged. A URL with an empty path is returned
    without adding a path.
    """
    # URL paths always use "/" as separator, so normalize with posixpath
    # instead of the platform-dependent os.path (ntpath on Windows).
    import posixpath

    parsed = urlparse(url)

    # normpath("") returns ".", which would turn "https://host" into
    # "https://host/."; leave an empty path alone.
    if parsed.path:
        parsed = parsed._replace(path=posixpath.normpath(parsed.path))

    return urlunparse(parsed)
107+
65108 if sdist_url :
66109 valid_distribution_urls .append (sdist_url )
67110
68111 valid_distribution_urls = [url for url in valid_distribution_urls if url ]
112+ valid_distribution_urls = list (map (canonicalize_url , valid_distribution_urls ))
69113
70114 # if prefer_source is True then only source distribution is used
71115 # in case of no source distribution available then wheel is used
@@ -81,28 +125,60 @@ async def get_pypi_data_from_purl(
81125 ]
82126 wheel_url = choose_single_wheel (wheel_urls )
83127 if wheel_url :
84- valid_distribution_urls .insert (0 , wheel_url )
128+ valid_distribution_urls .insert (0 , canonicalize_url ( wheel_url ) )
85129
86130 urls = {url .get ("url" ): url for url in response .get ("urls" ) or []}
131+
132+ # Sanitize all URLs that are relative and canonicalize them
133+ urls_sanitized = {}
134+ for url in urls :
135+ value = urls .get (url )
136+
137+ # remove the URL anchor fragment
138+ url_parsed = urlparse (url )
139+ url = urlunparse (url_parsed ._replace (fragment = "" ))
140+
141+ if url .startswith ("https" ):
142+ url_sanitized = canonicalize_url (url )
143+ else :
144+ url_sanitized = canonicalize_url (base_path + url )
145+
146+ urls_sanitized [url_sanitized ] = value
147+
def remove_credentials_from_url(url: str) -> str:
    """
    Return `url` with any "user:password@" userinfo removed from its netloc.

    The host and the port (when present) are kept; all other URL components
    are passed through unchanged.
    """
    parsed = urlparse(url)

    # `hostname` is None when the URL has no host (e.g. a relative URL);
    # fall back to an empty string so the concatenation below cannot fail.
    host = parsed.hostname or ""

    # `hostname` strips the brackets of an IPv6 literal; put them back so
    # that a following ":port" stays unambiguous.
    if ":" in host:
        host = f"[{host}]"

    port = parsed.port
    new_netloc = host if port is None else f"{host}:{port}"

    # Rebuild the URL with a netloc that carries no credentials.
    return urlunparse(parsed._replace(netloc=new_netloc))
162+
87163 # iterate over the valid distribution urls and return the first
88164 # one that is matching.
89165 for dist_url in valid_distribution_urls :
90- if dist_url not in urls :
166+ if dist_url not in urls_sanitized :
91167 continue
92168
93- url_data = urls .get (dist_url )
169+ url_data = urls_sanitized .get (dist_url )
94170 digests = url_data .get ("digests" ) or {}
95171
96172 return PackageData (
97173 primary_language = "Python" ,
98174 description = get_description (info ),
99175 homepage_url = homepage_url ,
100- api_data_url = api_url ,
176+ api_data_url = remove_credentials_from_url ( api_url ) ,
101177 bug_tracking_url = bug_tracking_url ,
102178 code_view_url = code_view_url ,
103179 license_expression = info .get ("license_expression" ),
104180 declared_license = get_declared_license (info ),
105- download_url = dist_url ,
181+ download_url = remove_credentials_from_url ( dist_url ) ,
106182 size = url_data .get ("size" ),
107183 md5 = digests .get ("md5" ) or url_data .get ("md5_digest" ),
108184 sha256 = digests .get ("sha256" ),
0 commit comments